{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0315, "eval_steps": 500, "global_step": 3150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 213.78125, "completions/mean_terminated_length": 213.78125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 6.318180561065674, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011238914914429188, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0008, "num_tokens": 20513.0, "reward": 0.3707788288593292, "reward_std": 0.38860762119293213, "rewards/rollout_reward_func/mean": 0.3707788288593292, "rewards/rollout_reward_func/std": 0.5297516584396362, "sampling/importance_sampling_ratio/max": 0.07105467468500137, "sampling/importance_sampling_ratio/mean": 0.019949276000261307, "sampling/importance_sampling_ratio/min": 1.3622365807385503e-30, "sampling/sampling_logp_difference/max": 3.5185647010803223, "sampling/sampling_logp_difference/mean": 0.9081237316131592, "step": 1, "step_time": 6.137671710006543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.318180561065674, "epoch": 2e-05, "grad_norm": 0.011280897073447704, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0008, "step": 2, "step_time": 2.2216806479918887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 144.21875, "completions/mean_terminated_length": 144.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 6.059298813343048, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.021376298740506172, "kl": 0.0002831164820236154, "learning_rate": 5.714285714285715e-07, "loss": -0.0013, "num_tokens": 38776.0, "reward": 0.2769230604171753, "reward_std": 0.7105696201324463, "rewards/rollout_reward_func/mean": 0.2769230604171753, "rewards/rollout_reward_func/std": 0.724294900894165, "sampling/importance_sampling_ratio/max": 0.10380198061466217, "sampling/importance_sampling_ratio/mean": 0.04086197167634964, "sampling/importance_sampling_ratio/min": 0.007925329729914665, "sampling/sampling_logp_difference/max": 1.4128023386001587, "sampling/sampling_logp_difference/mean": 0.7035855054855347, "step": 3, "step_time": 4.233524005001527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.053636968135834, "epoch": 4e-05, "grad_norm": 0.02135491743683815, "kl": 0.00025913441822922323, "learning_rate": 8.571428571428572e-07, "loss": -0.0013, "step": 4, "step_time": 2.5380245560008916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.14617919921875, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.016698598861694336, "kl": 0.00020167687216599006, "learning_rate": 1.142857142857143e-06, "loss": -0.001, "num_tokens": 57984.0, "reward": 0.30893269181251526, "reward_std": 0.5330058932304382, "rewards/rollout_reward_func/mean": 0.30893269181251526, "rewards/rollout_reward_func/std": 0.6374653577804565, "sampling/importance_sampling_ratio/max": 0.19727075099945068, "sampling/importance_sampling_ratio/mean": 0.03639305382966995, "sampling/importance_sampling_ratio/min": 0.00026282493490725756, "sampling/sampling_logp_difference/max": 2.0114047527313232, "sampling/sampling_logp_difference/mean": 0.7383692264556885, "step": 5, "step_time": 4.559834393992787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.139570653438568, "epoch": 6e-05, "grad_norm": 0.01746007800102234, "kl": 0.00018666892901819665, "learning_rate": 1.4285714285714286e-06, "loss": -0.001, "step": 6, "step_time": 2.0458921219978947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.376411259174347, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.021232394501566887, "kl": 0.0002023973811446922, "learning_rate": 1.7142857142857145e-06, "loss": 0.001, "num_tokens": 77068.0, "reward": 0.16982212662696838, "reward_std": 0.41566693782806396, "rewards/rollout_reward_func/mean": 0.16982212662696838, "rewards/rollout_reward_func/std": 0.600350558757782, "sampling/importance_sampling_ratio/max": 0.23079627752304077, "sampling/importance_sampling_ratio/mean": 0.03541427105665207, "sampling/importance_sampling_ratio/min": 5.698367333550891e-10, "sampling/sampling_logp_difference/max": 3.432051181793213, "sampling/sampling_logp_difference/mean": 0.8584157228469849, "step": 7, "step_time": 4.664938173998962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.381335377693176, "epoch": 8e-05, "grad_norm": 0.019963594153523445, "kl": 0.00017121505879913457, "learning_rate": 2.0000000000000003e-06, "loss": 0.001, "step": 8, "step_time": 2.069503334008914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.343003988265991, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.021648207679390907, "kl": 0.00022298503699857974, "learning_rate": 2.285714285714286e-06, "loss": 0.0006, "num_tokens": 96206.0, "reward": 0.3761764466762543, "reward_std": 0.6243375539779663, "rewards/rollout_reward_func/mean": 0.3761764466762543, "rewards/rollout_reward_func/std": 0.8092405200004578, "sampling/importance_sampling_ratio/max": 0.19819098711013794, "sampling/importance_sampling_ratio/mean": 0.04393341392278671, "sampling/importance_sampling_ratio/min": 2.319273982330678e-17, "sampling/sampling_logp_difference/max": 4.037425994873047, "sampling/sampling_logp_difference/mean": 0.9531519412994385, "step": 9, "step_time": 4.95478629200079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.34534227848053, "epoch": 0.0001, "grad_norm": 0.021308815106749535, "kl": 0.00018406673643767135, "learning_rate": 2.571428571428571e-06, "loss": 0.0006, "step": 10, "step_time": 3.0241339720014366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 148.53125, "completions/mean_terminated_length": 148.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.093482971191406, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.035945598036050797, "kl": 0.00022760698630008847, "learning_rate": 2.8571428571428573e-06, "loss": 0.0015, "num_tokens": 114095.0, "reward": 0.2384110689163208, "reward_std": 0.6384598016738892, "rewards/rollout_reward_func/mean": 0.2384110689163208, "rewards/rollout_reward_func/std": 0.743732750415802, "sampling/importance_sampling_ratio/max": 0.19490653276443481, "sampling/importance_sampling_ratio/mean": 0.054266225546598434, "sampling/importance_sampling_ratio/min": 9.548673551762477e-05, "sampling/sampling_logp_difference/max": 1.3587796688079834, "sampling/sampling_logp_difference/mean": 0.7091835737228394, "step": 11, "step_time": 4.91355219399702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.096036195755005, "epoch": 0.00012, "grad_norm": 0.03608441725373268, "kl": 0.0002221288905275287, "learning_rate": 3.142857142857143e-06, "loss": 0.0015, "step": 12, "step_time": 2.1413763299933635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.166547477245331, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.023656906560063362, "kl": 0.00022778238781029359, "learning_rate": 3.428571428571429e-06, "loss": 0.0014, "num_tokens": 133829.0, "reward": 0.13587836921215057, "reward_std": 0.44072431325912476, "rewards/rollout_reward_func/mean": 0.13587836921215057, "rewards/rollout_reward_func/std": 0.5671783089637756, "sampling/importance_sampling_ratio/max": 0.18540146946907043, "sampling/importance_sampling_ratio/mean": 0.03570510074496269, "sampling/importance_sampling_ratio/min": 0.0002666503714863211, "sampling/sampling_logp_difference/max": 1.4642088413238525, "sampling/sampling_logp_difference/mean": 0.7285475730895996, "step": 13, "step_time": 4.99781812098081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.175786137580872, "epoch": 0.00014, "grad_norm": 0.023865394294261932, "kl": 0.00035001405922230333, "learning_rate": 3.7142857142857146e-06, "loss": 0.0014, "step": 14, "step_time": 2.048917366017122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 168.28125, "completions/mean_terminated_length": 168.28125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 6.103998899459839, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.0193563811480999, "kl": 0.0004031004173157271, "learning_rate": 4.000000000000001e-06, "loss": 0.0009, "num_tokens": 153022.0, "reward": 0.4260672926902771, "reward_std": 0.7013793587684631, "rewards/rollout_reward_func/mean": 0.4260672926902771, "rewards/rollout_reward_func/std": 0.7473291158676147, "sampling/importance_sampling_ratio/max": 0.09069618582725525, "sampling/importance_sampling_ratio/mean": 0.03282327204942703, "sampling/importance_sampling_ratio/min": 0.0007477927138097584, "sampling/sampling_logp_difference/max": 1.456470012664795, "sampling/sampling_logp_difference/mean": 0.7276224493980408, "step": 15, "step_time": 5.269229458994232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.1084665060043335, "epoch": 0.00016, "grad_norm": 0.019984809681773186, "kl": 0.000508938735947595, "learning_rate": 4.2857142857142855e-06, "loss": 0.0009, "step": 16, "step_time": 2.5679185959961615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 153.34375, "completions/mean_terminated_length": 153.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.369441449642181, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.03828444704413414, "kl": 0.0010212267989118118, "learning_rate": 4.571428571428572e-06, "loss": 0.0012, "num_tokens": 172641.0, "reward": 0.04596009850502014, "reward_std": 0.6843416094779968, "rewards/rollout_reward_func/mean": 0.04596009850502014, "rewards/rollout_reward_func/std": 0.80553138256073, "sampling/importance_sampling_ratio/max": 0.16345424950122833, "sampling/importance_sampling_ratio/mean": 0.037420034408569336, "sampling/importance_sampling_ratio/min": 0.00011569278285605833, "sampling/sampling_logp_difference/max": 1.7240861654281616, "sampling/sampling_logp_difference/mean": 0.7741081714630127, "step": 17, "step_time": 4.586126177004189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.395192801952362, "epoch": 0.00018, "grad_norm": 0.037725985050201416, "kl": 0.0018255019022035412, "learning_rate": 4.857142857142858e-06, "loss": 0.001, "step": 18, "step_time": 2.0578153189926525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 151.40625, "completions/mean_terminated_length": 151.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 6.259436130523682, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.015240487642586231, "kl": 0.0015583754611725453, "learning_rate": 5.142857142857142e-06, "loss": -0.0014, "num_tokens": 191734.0, "reward": 0.21492548286914825, "reward_std": 0.40267953276634216, "rewards/rollout_reward_func/mean": 0.21492548286914825, "rewards/rollout_reward_func/std": 0.6627923846244812, "sampling/importance_sampling_ratio/max": 0.07089456915855408, "sampling/importance_sampling_ratio/mean": 0.033812373876571655, "sampling/importance_sampling_ratio/min": 0.001519239624030888, "sampling/sampling_logp_difference/max": 1.4596322774887085, "sampling/sampling_logp_difference/mean": 0.7431936860084534, "step": 19, "step_time": 4.401513666001847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.294578969478607, "epoch": 0.0002, "grad_norm": 0.01650054194033146, "kl": 0.0024163076013792306, "learning_rate": 5.428571428571429e-06, "loss": -0.0014, "step": 20, "step_time": 2.0457186899875524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 143.15625, "completions/mean_terminated_length": 143.15625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 6.355331778526306, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.016620617359876633, "kl": 0.002897746118833311, "learning_rate": 5.7142857142857145e-06, "loss": -0.0004, "num_tokens": 210539.0, "reward": 0.1558317393064499, "reward_std": 0.48602035641670227, "rewards/rollout_reward_func/mean": 0.1558317393064499, "rewards/rollout_reward_func/std": 0.5772840976715088, "sampling/importance_sampling_ratio/max": 0.07566096633672714, "sampling/importance_sampling_ratio/mean": 0.03981533646583557, "sampling/importance_sampling_ratio/min": 1.1003792241717561e-20, "sampling/sampling_logp_difference/max": 3.236738681793213, "sampling/sampling_logp_difference/mean": 0.8582298755645752, "step": 21, "step_time": 5.320935021001787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.363956272602081, "epoch": 0.00022, "grad_norm": 0.01660221256315708, "kl": 0.0036136633425485343, "learning_rate": 6e-06, "loss": -0.0004, "step": 22, "step_time": 2.038360332000593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 110.8125, "completions/mean_terminated_length": 110.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.514486312866211, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.01773517020046711, "kl": 0.02421660786785651, "learning_rate": 6.285714285714286e-06, "loss": -0.0003, "num_tokens": 228565.0, "reward": 0.21886056661605835, "reward_std": 0.3477890193462372, "rewards/rollout_reward_func/mean": 0.21886056661605835, "rewards/rollout_reward_func/std": 0.41540753841400146, "sampling/importance_sampling_ratio/max": 0.0940227285027504, "sampling/importance_sampling_ratio/mean": 0.05011742562055588, "sampling/importance_sampling_ratio/min": 0.006764286197721958, "sampling/sampling_logp_difference/max": 1.3192501068115234, "sampling/sampling_logp_difference/mean": 0.7538418769836426, "step": 23, "step_time": 4.26801799700479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.535672426223755, "epoch": 0.00024, "grad_norm": 0.019195299595594406, "kl": 0.029898254113504663, "learning_rate": 6.571428571428572e-06, "loss": -0.0003, "step": 24, "step_time": 2.0375789660029113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.400487124919891, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.026253944262862206, "kl": 0.022160244203405455, "learning_rate": 6.857142857142858e-06, "loss": -0.0009, "num_tokens": 248295.0, "reward": 0.30614277720451355, "reward_std": 0.7937732934951782, "rewards/rollout_reward_func/mean": 0.30614277720451355, "rewards/rollout_reward_func/std": 0.8782454133033752, "sampling/importance_sampling_ratio/max": 0.10510750114917755, "sampling/importance_sampling_ratio/mean": 0.03348657488822937, "sampling/importance_sampling_ratio/min": 1.0914378775761513e-11, "sampling/sampling_logp_difference/max": 4.6481733322143555, "sampling/sampling_logp_difference/mean": 0.8546884059906006, "step": 25, "step_time": 4.4315741040045395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.40515923500061, "epoch": 0.00026, "grad_norm": 0.02559761516749859, "kl": 0.02860373890143819, "learning_rate": 7.1428571428571436e-06, "loss": -0.001, "step": 26, "step_time": 2.0436719589924905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 132.78125, "completions/mean_terminated_length": 132.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4124574065208435, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.028218060731887817, "kl": 0.03506819810718298, "learning_rate": 7.428571428571429e-06, "loss": -0.0008, "num_tokens": 266392.0, "reward": 0.22468125820159912, "reward_std": 0.6538606882095337, "rewards/rollout_reward_func/mean": 0.22468125820159912, "rewards/rollout_reward_func/std": 0.6769688725471497, "sampling/importance_sampling_ratio/max": 0.0809626579284668, "sampling/importance_sampling_ratio/mean": 0.0421845018863678, "sampling/importance_sampling_ratio/min": 0.0010805927449837327, "sampling/sampling_logp_difference/max": 1.4789752960205078, "sampling/sampling_logp_difference/mean": 0.758473813533783, "step": 27, "step_time": 5.1994121150055435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.400072991847992, "epoch": 0.00028, "grad_norm": 0.02617870643734932, "kl": 0.05108885921072215, "learning_rate": 7.714285714285716e-06, "loss": -0.001, "step": 28, "step_time": 2.0315786709907115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 316.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 171.15625, "completions/mean_terminated_length": 166.48387145996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.222193717956543, "epoch": 0.00029, "frac_reward_zero_std": 0.25, "grad_norm": 0.011240425519645214, "kl": 0.03439442935632542, "learning_rate": 8.000000000000001e-06, "loss": -0.0005, "num_tokens": 285629.0, "reward": 0.5602259635925293, "reward_std": 0.4541664123535156, "rewards/rollout_reward_func/mean": 0.5602259635925293, "rewards/rollout_reward_func/std": 0.5573967099189758, "sampling/importance_sampling_ratio/max": 0.09481674432754517, "sampling/importance_sampling_ratio/mean": 0.03462404012680054, "sampling/importance_sampling_ratio/min": 7.695953952712993e-16, "sampling/sampling_logp_difference/max": 3.0370688438415527, "sampling/sampling_logp_difference/mean": 0.7842601537704468, "step": 29, "step_time": 4.632423835006193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.191247999668121, "epoch": 0.0003, "grad_norm": 0.0103544220328331, "kl": 0.05221366789191961, "learning_rate": 8.285714285714287e-06, "loss": -0.0005, "step": 30, "step_time": 2.064297381999495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 6.223719954490662, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.016674742102622986, "kl": 0.043891083099879324, "learning_rate": 8.571428571428571e-06, "loss": -0.001, "num_tokens": 303965.0, "reward": 0.4255528748035431, "reward_std": 0.6444106101989746, "rewards/rollout_reward_func/mean": 0.4255528748035431, "rewards/rollout_reward_func/std": 0.7306278347969055, "sampling/importance_sampling_ratio/max": 0.1014309898018837, "sampling/importance_sampling_ratio/mean": 0.04159087315201759, "sampling/importance_sampling_ratio/min": 0.00017158477567136288, "sampling/sampling_logp_difference/max": 1.794924259185791, "sampling/sampling_logp_difference/mean": 0.7321090698242188, "step": 31, "step_time": 4.444377136002004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.161008894443512, "epoch": 0.00032, "grad_norm": 0.018346747383475304, "kl": 0.05746693373657763, "learning_rate": 8.857142857142858e-06, "loss": -0.001, "step": 32, "step_time": 2.0547469360099058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 201.53125, "completions/mean_terminated_length": 201.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3364198207855225, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.01631326414644718, "kl": 0.1552216641139239, "learning_rate": 9.142857142857144e-06, "loss": 0.0006, "num_tokens": 324662.0, "reward": 0.24601441621780396, "reward_std": 0.5056146383285522, "rewards/rollout_reward_func/mean": 0.24601441621780396, "rewards/rollout_reward_func/std": 0.6424553990364075, "sampling/importance_sampling_ratio/max": 0.10611812770366669, "sampling/importance_sampling_ratio/mean": 0.017595795914530754, "sampling/importance_sampling_ratio/min": 5.425171063920781e-19, "sampling/sampling_logp_difference/max": 3.7742984294891357, "sampling/sampling_logp_difference/mean": 0.8700766563415527, "step": 33, "step_time": 5.6060161929999595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.260313510894775, "epoch": 0.00034, "grad_norm": 0.018634339794516563, "kl": 0.1902990792877972, "learning_rate": 9.42857142857143e-06, "loss": 0.0006, "step": 34, "step_time": 2.089104572987708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 176.46875, "completions/mean_terminated_length": 177.06451416015625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 6.033062160015106, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.02348935417830944, "kl": 0.07808680832386017, "learning_rate": 9.714285714285715e-06, "loss": -0.0001, "num_tokens": 344797.0, "reward": 0.17253848910331726, "reward_std": 0.6019571423530579, "rewards/rollout_reward_func/mean": 0.17253848910331726, "rewards/rollout_reward_func/std": 0.8127042651176453, "sampling/importance_sampling_ratio/max": 0.1421411633491516, "sampling/importance_sampling_ratio/mean": 0.04218871146440506, "sampling/importance_sampling_ratio/min": 7.920036296127364e-06, "sampling/sampling_logp_difference/max": 2.5535621643066406, "sampling/sampling_logp_difference/mean": 0.7141014337539673, "step": 35, "step_time": 4.754231936996803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.9296910762786865, "epoch": 0.00036, "grad_norm": 0.023296985775232315, "kl": 0.08497131802141666, "learning_rate": 1e-05, "loss": -0.0002, "step": 36, "step_time": 2.1414879059957457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 160.96875, "completions/mean_terminated_length": 160.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 5.537781715393066, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.030350618064403534, "kl": 0.06367162615060806, "learning_rate": 9.9999999995372e-06, "loss": 0.0012, "num_tokens": 363812.0, "reward": 0.5607500076293945, "reward_std": 0.2481352686882019, "rewards/rollout_reward_func/mean": 0.5607500076293945, "rewards/rollout_reward_func/std": 0.36610597372055054, "sampling/importance_sampling_ratio/max": 0.19953575730323792, "sampling/importance_sampling_ratio/mean": 0.07802990078926086, "sampling/importance_sampling_ratio/min": 2.2207308640788273e-15, "sampling/sampling_logp_difference/max": 3.5133328437805176, "sampling/sampling_logp_difference/mean": 0.663939893245697, "step": 37, "step_time": 4.498871269985102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.468046545982361, "epoch": 0.00038, "grad_norm": 0.031963150948286057, "kl": 0.06316748354583979, "learning_rate": 9.999999998148802e-06, "loss": 0.0011, "step": 38, "step_time": 2.9504653550029616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 5.337741315364838, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.051337819546461105, "kl": 0.06781293265521526, "learning_rate": 9.999999995834804e-06, "loss": -0.0016, "num_tokens": 383454.0, "reward": 0.47475963830947876, "reward_std": 0.3449661135673523, "rewards/rollout_reward_func/mean": 0.47475963830947876, "rewards/rollout_reward_func/std": 0.5837082266807556, "sampling/importance_sampling_ratio/max": 0.17015193402767181, "sampling/importance_sampling_ratio/mean": 0.07414740324020386, "sampling/importance_sampling_ratio/min": 0.005440629553049803, "sampling/sampling_logp_difference/max": 1.586467981338501, "sampling/sampling_logp_difference/mean": 0.5425010919570923, "step": 39, "step_time": 4.402966535009909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.31275075674057, "epoch": 0.0004, "grad_norm": 0.05208338797092438, "kl": 0.06711141718551517, "learning_rate": 9.999999992595207e-06, "loss": -0.0017, "step": 40, "step_time": 2.0329786370130023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 205.96875, "completions/mean_terminated_length": 205.96875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 5.904362499713898, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.023412104696035385, "kl": 0.08375711599364877, "learning_rate": 9.999999988430008e-06, "loss": -0.0012, "num_tokens": 404069.0, "reward": 0.5477193593978882, "reward_std": 0.8641973733901978, "rewards/rollout_reward_func/mean": 0.5477193593978882, "rewards/rollout_reward_func/std": 0.8305984735488892, "sampling/importance_sampling_ratio/max": 0.11356031149625778, "sampling/importance_sampling_ratio/mean": 0.0378519669175148, "sampling/importance_sampling_ratio/min": 1.2530767450542058e-21, "sampling/sampling_logp_difference/max": 3.5100507736206055, "sampling/sampling_logp_difference/mean": 0.8287367820739746, "step": 41, "step_time": 5.44500681199861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.842765271663666, "epoch": 0.00042, "grad_norm": 0.022786077111959457, "kl": 0.0933526256121695, "learning_rate": 9.999999983339212e-06, "loss": -0.0012, "step": 42, "step_time": 2.3092844570055604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 180.71875, "completions/mean_terminated_length": 180.71875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 5.468380093574524, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.022990530356764793, "kl": 0.09828441217541695, "learning_rate": 9.999999977322818e-06, "loss": -0.003, "num_tokens": 423196.0, "reward": 0.39269471168518066, "reward_std": 0.6338191032409668, "rewards/rollout_reward_func/mean": 0.39269471168518066, "rewards/rollout_reward_func/std": 0.6726635098457336, "sampling/importance_sampling_ratio/max": 0.11798857897520065, "sampling/importance_sampling_ratio/mean": 0.04524100571870804, "sampling/importance_sampling_ratio/min": 4.877846367890015e-05, "sampling/sampling_logp_difference/max": 2.8651647567749023, "sampling/sampling_logp_difference/mean": 0.609872579574585, "step": 43, "step_time": 4.5918282859929604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.326870024204254, "epoch": 0.00044, "grad_norm": 0.022513724863529205, "kl": 0.11218210496008396, "learning_rate": 9.999999970380822e-06, "loss": -0.0031, "step": 44, "step_time": 2.9544766199906007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 168.8125, "completions/mean_terminated_length": 168.8125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 5.001081109046936, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.05581960454583168, "kl": 0.11106578819453716, "learning_rate": 9.999999962513228e-06, "loss": -0.0072, "num_tokens": 442334.0, "reward": 0.2800697386264801, "reward_std": 0.581593930721283, "rewards/rollout_reward_func/mean": 0.2800697386264801, "rewards/rollout_reward_func/std": 0.6922678351402283, "sampling/importance_sampling_ratio/max": 0.29021334648132324, "sampling/importance_sampling_ratio/mean": 0.1033790111541748, "sampling/importance_sampling_ratio/min": 4.952572680849698e-07, "sampling/sampling_logp_difference/max": 3.437936544418335, "sampling/sampling_logp_difference/mean": 0.5619679689407349, "step": 45, "step_time": 4.555076233991713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.739388167858124, "epoch": 0.00046, "grad_norm": 0.05674294754862785, "kl": 0.13751274161040783, "learning_rate": 9.999999953720035e-06, "loss": -0.0078, "step": 46, "step_time": 2.0324454859946854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 4.109928399324417, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.13545507192611694, "kl": 0.20194115489721298, "learning_rate": 9.99999994400124e-06, "loss": 0.0152, "num_tokens": 461370.0, "reward": 0.49797117710113525, "reward_std": 0.1802864670753479, "rewards/rollout_reward_func/mean": 0.49797117710113525, "rewards/rollout_reward_func/std": 0.3071024417877197, "sampling/importance_sampling_ratio/max": 0.4181329607963562, "sampling/importance_sampling_ratio/mean": 0.225805401802063, "sampling/importance_sampling_ratio/min": 0.03210187330842018, "sampling/sampling_logp_difference/max": 1.582364797592163, "sampling/sampling_logp_difference/mean": 0.3348347246646881, "step": 47, "step_time": 4.235932894007419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.06506422162056, "epoch": 0.00048, "grad_norm": 0.13803869485855103, "kl": 0.20422465167939663, "learning_rate": 9.999999933356848e-06, "loss": 0.0151, "step": 48, "step_time": 2.0568857609760016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 4.215419411659241, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.03651956841349602, "kl": 0.202836025506258, "learning_rate": 9.999999921786855e-06, "loss": -0.0032, "num_tokens": 480384.0, "reward": 0.5348029136657715, "reward_std": 0.4840479791164398, "rewards/rollout_reward_func/mean": 0.5348029136657715, "rewards/rollout_reward_func/std": 0.5687469244003296, "sampling/importance_sampling_ratio/max": 0.3900764584541321, "sampling/importance_sampling_ratio/mean": 0.1851857602596283, "sampling/importance_sampling_ratio/min": 0.011451692320406437, "sampling/sampling_logp_difference/max": 1.7213799953460693, "sampling/sampling_logp_difference/mean": 0.3911520838737488, "step": 49, "step_time": 4.405365323997103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.29188996553421, "epoch": 0.0005, "grad_norm": 0.03585308417677879, "kl": 0.19191867113113403, "learning_rate": 9.999999909291265e-06, "loss": -0.0032, "step": 50, "step_time": 2.502391122005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 169.84375, "completions/mean_terminated_length": 169.84375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 4.863133549690247, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.04627097025513649, "kl": 0.21235091798007488, "learning_rate": 9.999999895870075e-06, "loss": -0.0065, "num_tokens": 499579.0, "reward": 0.3892519176006317, "reward_std": 0.6398563385009766, "rewards/rollout_reward_func/mean": 0.3892519176006317, "rewards/rollout_reward_func/std": 0.769768238067627, "sampling/importance_sampling_ratio/max": 0.3140711188316345, "sampling/importance_sampling_ratio/mean": 0.09457612037658691, "sampling/importance_sampling_ratio/min": 0.0004148186417296529, "sampling/sampling_logp_difference/max": 2.383612871170044, "sampling/sampling_logp_difference/mean": 0.5321005582809448, "step": 51, "step_time": 4.795493976009311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.827674329280853, "epoch": 0.00052, "grad_norm": 0.04619019106030464, "kl": 0.217450438067317, "learning_rate": 9.999999881523285e-06, "loss": -0.0066, "step": 52, "step_time": 2.1374445860055857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 169.65625, "completions/mean_terminated_length": 169.65625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 4.651445984840393, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.034604378044605255, "kl": 0.15654421783983707, "learning_rate": 9.999999866250896e-06, "loss": -0.0024, "num_tokens": 519176.0, "reward": 0.5331298112869263, "reward_std": 0.4350280165672302, "rewards/rollout_reward_func/mean": 0.5331298112869263, "rewards/rollout_reward_func/std": 0.6508305072784424, "sampling/importance_sampling_ratio/max": 0.3156212568283081, "sampling/importance_sampling_ratio/mean": 0.13457071781158447, "sampling/importance_sampling_ratio/min": 2.61600772511264e-14, "sampling/sampling_logp_difference/max": 3.3303427696228027, "sampling/sampling_logp_difference/mean": 0.5218971967697144, "step": 53, "step_time": 4.838458831000025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.586353689432144, "epoch": 0.00054, "grad_norm": 0.03528258204460144, "kl": 0.16010682471096516, "learning_rate": 9.999999850052909e-06, "loss": -0.0024, "step": 54, "step_time": 2.107818987002247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 156.71875, "completions/mean_terminated_length": 156.71875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 4.240657269954681, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.054169911891222, "kl": 0.15653510205447674, "learning_rate": 9.99999983292932e-06, "loss": 0.0017, "num_tokens": 538143.0, "reward": 0.5741778612136841, "reward_std": 0.4072805345058441, "rewards/rollout_reward_func/mean": 0.5741778612136841, "rewards/rollout_reward_func/std": 0.552071213722229, "sampling/importance_sampling_ratio/max": 0.3528154492378235, "sampling/importance_sampling_ratio/mean": 0.19346165657043457, "sampling/importance_sampling_ratio/min": 1.3970789802808525e-10, "sampling/sampling_logp_difference/max": 3.0404343605041504, "sampling/sampling_logp_difference/mean": 0.4651739299297333, "step": 55, "step_time": 4.980107946001226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.211510062217712, "epoch": 0.00056, "grad_norm": 0.05441330000758171, "kl": 0.15884790569543839, "learning_rate": 9.999999814880132e-06, "loss": 0.0014, "step": 56, "step_time": 2.5387764799961587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 4.269396603107452, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.0776439756155014, "kl": 0.19561146385967731, "learning_rate": 9.999999795905347e-06, "loss": -0.0079, "num_tokens": 556757.0, "reward": 0.3412307798862457, "reward_std": 0.36109843850135803, "rewards/rollout_reward_func/mean": 0.3412307798862457, "rewards/rollout_reward_func/std": 0.5923658013343811, "sampling/importance_sampling_ratio/max": 0.3088909685611725, "sampling/importance_sampling_ratio/mean": 0.1735788732767105, "sampling/importance_sampling_ratio/min": 0.006455164402723312, "sampling/sampling_logp_difference/max": 1.4889906644821167, "sampling/sampling_logp_difference/mean": 0.39270031452178955, "step": 57, "step_time": 4.350772520003375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.216601848602295, "epoch": 0.00058, "grad_norm": 0.07735991477966309, "kl": 0.1946492325514555, "learning_rate": 9.999999776004962e-06, "loss": -0.0079, "step": 58, "step_time": 2.0335188130120514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 178.40625, "completions/mean_terminated_length": 178.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 3.7514399886131287, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 0.12250988930463791, "kl": 0.18474525026977062, "learning_rate": 9.999999755178978e-06, "loss": -0.0074, "num_tokens": 576258.0, "reward": 0.2313653826713562, "reward_std": 0.34375447034835815, "rewards/rollout_reward_func/mean": 0.2313653826713562, "rewards/rollout_reward_func/std": 0.43315064907073975, "sampling/importance_sampling_ratio/max": 0.37744683027267456, "sampling/importance_sampling_ratio/mean": 0.18810975551605225, "sampling/importance_sampling_ratio/min": 0.03320171684026718, "sampling/sampling_logp_difference/max": 1.1409138441085815, "sampling/sampling_logp_difference/mean": 0.315265417098999, "step": 59, "step_time": 4.464941604011983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.5991591811180115, "epoch": 0.0006, "grad_norm": 0.1216268241405487, "kl": 0.1898319572210312, "learning_rate": 9.999999733427394e-06, "loss": -0.0083, "step": 60, "step_time": 2.045966247023898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 3.388431578874588, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.09579762071371078, "kl": 0.19098741188645363, "learning_rate": 9.99999971075021e-06, "loss": -0.0098, "num_tokens": 595468.0, "reward": 0.5771379470825195, "reward_std": 0.1559896171092987, "rewards/rollout_reward_func/mean": 0.5771379470825195, "rewards/rollout_reward_func/std": 0.35235175490379333, "sampling/importance_sampling_ratio/max": 0.45347991585731506, "sampling/importance_sampling_ratio/mean": 0.2518460750579834, "sampling/importance_sampling_ratio/min": 0.006605512462556362, "sampling/sampling_logp_difference/max": 1.4142683744430542, "sampling/sampling_logp_difference/mean": 0.2720300853252411, "step": 61, "step_time": 5.609027408005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 3.1836721897125244, "epoch": 0.00062, "grad_norm": 0.0983501747250557, "kl": 0.2021306548267603, "learning_rate": 9.999999687147426e-06, "loss": -0.0103, "step": 62, "step_time": 2.126420820990461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 209.78125, "completions/mean_terminated_length": 209.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 3.0924576222896576, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.12938039004802704, "kl": 0.251254303380847, "learning_rate": 9.999999662619046e-06, "loss": -0.0092, "num_tokens": 616565.0, "reward": 0.46156248450279236, "reward_std": 0.4734688401222229, "rewards/rollout_reward_func/mean": 0.46156248450279236, "rewards/rollout_reward_func/std": 0.5781429409980774, "sampling/importance_sampling_ratio/max": 0.5893012881278992, "sampling/importance_sampling_ratio/mean": 0.2689833641052246, "sampling/importance_sampling_ratio/min": 0.02033422887325287, "sampling/sampling_logp_difference/max": 1.2860329151153564, "sampling/sampling_logp_difference/mean": 0.25520336627960205, "step": 63, "step_time": 4.997500559002219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8255865275859833, "epoch": 0.00064, "grad_norm": 0.12757636606693268, "kl": 0.2822006978094578, "learning_rate": 9.999999637165062e-06, "loss": -0.0103, "step": 64, "step_time": 2.062449101016682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.0625, "completions/mean_terminated_length": 144.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.705535411834717, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.18380282819271088, "kl": 0.4319814331829548, "learning_rate": 9.999999610785483e-06, "loss": -0.0201, "num_tokens": 634991.0, "reward": 0.4115000069141388, "reward_std": 0.370111346244812, "rewards/rollout_reward_func/mean": 0.4115000069141388, "rewards/rollout_reward_func/std": 0.5065022110939026, "sampling/importance_sampling_ratio/max": 0.7925047874450684, "sampling/importance_sampling_ratio/mean": 0.461337685585022, "sampling/importance_sampling_ratio/min": 7.694298090155075e-19, "sampling/sampling_logp_difference/max": 4.21630334854126, "sampling/sampling_logp_difference/mean": 0.4003519117832184, "step": 65, "step_time": 4.4071671109995805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013020833488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 2.4360391795635223, "epoch": 0.00066, "grad_norm": 0.19248098134994507, "kl": 0.4671599566936493, "learning_rate": 9.999999583480304e-06, "loss": -0.0204, "step": 66, "step_time": 2.545272505005414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 2.09667506814003, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.24670377373695374, "kl": 0.3206424117088318, "learning_rate": 9.999999555249524e-06, "loss": 0.0056, "num_tokens": 653278.0, "reward": 0.43550482392311096, "reward_std": 0.22033660113811493, "rewards/rollout_reward_func/mean": 0.43550482392311096, "rewards/rollout_reward_func/std": 0.30788350105285645, "sampling/importance_sampling_ratio/max": 1.0072073936462402, "sampling/importance_sampling_ratio/mean": 0.551132082939148, "sampling/importance_sampling_ratio/min": 0.09401676803827286, "sampling/sampling_logp_difference/max": 1.1408579349517822, "sampling/sampling_logp_difference/mean": 0.20106574892997742, "step": 67, "step_time": 4.5618851960025495 }, { "clip_ratio/high_max": 0.053125000558793545, "clip_ratio/high_mean": 0.03281250037252903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03281250037252903, "entropy": 1.9240393936634064, "epoch": 0.00068, "grad_norm": 0.1703520268201828, "kl": 0.32166899368166924, "learning_rate": 9.999999526093148e-06, "loss": 0.0045, "step": 68, "step_time": 2.039196751007694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 158.71875, "completions/mean_terminated_length": 163.32257080078125, "completions/min_length": 16.0, "completions/min_terminated_length": 87.0, "entropy": 2.0143593847751617, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.25430789589881897, "kl": 0.40453479066491127, "learning_rate": 9.999999496011169e-06, "loss": -0.0536, "num_tokens": 672557.0, "reward": 0.7002788782119751, "reward_std": 0.29865896701812744, "rewards/rollout_reward_func/mean": 0.7002788782119751, "rewards/rollout_reward_func/std": 0.4287244379520416, "sampling/importance_sampling_ratio/max": 1.0489133596420288, "sampling/importance_sampling_ratio/mean": 0.6243404150009155, "sampling/importance_sampling_ratio/min": 2.877993954260233e-17, "sampling/sampling_logp_difference/max": 3.032127857208252, "sampling/sampling_logp_difference/mean": 0.31619083881378174, "step": 69, "step_time": 4.460530787007883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02708333358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02708333358168602, "entropy": 1.864421084523201, "epoch": 0.0007, "grad_norm": 0.1905698925256729, "kl": 0.44295522570610046, "learning_rate": 9.999999465003593e-06, "loss": -0.0556, "step": 70, "step_time": 2.047188971999276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.2894380241632462, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.360466331243515, "kl": 0.3969869762659073, "learning_rate": 9.999999433070417e-06, "loss": -0.0428, "num_tokens": 691555.0, "reward": 0.45427405834198, "reward_std": 0.09095782041549683, "rewards/rollout_reward_func/mean": 0.45427405834198, "rewards/rollout_reward_func/std": 0.1647215038537979, "sampling/importance_sampling_ratio/max": 1.0287127494812012, "sampling/importance_sampling_ratio/mean": 0.7440685629844666, "sampling/importance_sampling_ratio/min": 0.04800695925951004, "sampling/sampling_logp_difference/max": 1.4454832077026367, "sampling/sampling_logp_difference/mean": 0.12979662418365479, "step": 71, "step_time": 4.483743023993156 }, { "clip_ratio/high_max": 0.08437500055879354, "clip_ratio/high_mean": 0.04218750027939677, "clip_ratio/low_mean": 0.04278273927047849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08497023908421397, "entropy": 1.1478152871131897, "epoch": 0.00072, "grad_norm": 0.12801404297351837, "kl": 0.4550657086074352, "learning_rate": 9.999999400211643e-06, "loss": -0.0471, "step": 72, "step_time": 2.568255642989243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 136.3125, "completions/mean_terminated_length": 136.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9863167256116867, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.3658084571361542, "kl": 0.6247481927275658, "learning_rate": 9.99999936642727e-06, "loss": 0.0121, "num_tokens": 710533.0, "reward": 0.49896636605262756, "reward_std": 0.15873748064041138, "rewards/rollout_reward_func/mean": 0.49896636605262756, "rewards/rollout_reward_func/std": 0.29334598779678345, "sampling/importance_sampling_ratio/max": 1.273625135421753, "sampling/importance_sampling_ratio/mean": 0.8309235572814941, "sampling/importance_sampling_ratio/min": 0.0004809725214727223, "sampling/sampling_logp_difference/max": 3.307723045349121, "sampling/sampling_logp_difference/mean": 0.178628072142601, "step": 73, "step_time": 4.627155068999855 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.05677083320915699, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.06458333320915699, "entropy": 0.9260285720229149, "epoch": 0.00074, "grad_norm": 0.37529072165489197, "kl": 0.7591256983578205, "learning_rate": 9.999999331717294e-06, "loss": 0.0122, "step": 74, "step_time": 2.0258814109911327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7935487478971481, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.372735857963562, "kl": 0.43873413652181625, "learning_rate": 9.999999296081722e-06, "loss": 0.06, "num_tokens": 729001.0, "reward": 0.603413462638855, "reward_std": 0.11327745765447617, "rewards/rollout_reward_func/mean": 0.603413462638855, "rewards/rollout_reward_func/std": 0.34945592284202576, "sampling/importance_sampling_ratio/max": 1.5389275550842285, "sampling/importance_sampling_ratio/mean": 0.8188439607620239, "sampling/importance_sampling_ratio/min": 0.01728057488799095, "sampling/sampling_logp_difference/max": 1.5580322742462158, "sampling/sampling_logp_difference/mean": 0.17791429162025452, "step": 75, "step_time": 4.368624617993191 }, { "clip_ratio/high_max": 0.10312500037252903, "clip_ratio/high_mean": 0.07083333376795053, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07083333376795053, "entropy": 0.7949494421482086, "epoch": 0.00076, "grad_norm": 0.19858868420124054, "kl": 0.4292348548769951, "learning_rate": 9.999999259520549e-06, "loss": 0.0579, "step": 76, "step_time": 2.0330513829976553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 141.71875, "completions/mean_terminated_length": 141.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8335943073034286, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.3287714421749115, "kl": 0.4646935649216175, "learning_rate": 9.99999922203378e-06, "loss": -0.0246, "num_tokens": 747568.0, "reward": 0.5384615659713745, "reward_std": 0.3173660635948181, "rewards/rollout_reward_func/mean": 0.5384615659713745, "rewards/rollout_reward_func/std": 0.453425794839859, "sampling/importance_sampling_ratio/max": 1.3680675029754639, "sampling/importance_sampling_ratio/mean": 0.8376802206039429, "sampling/importance_sampling_ratio/min": 0.14152143895626068, "sampling/sampling_logp_difference/max": 1.4877281188964844, "sampling/sampling_logp_difference/mean": 0.1309281587600708, "step": 77, "step_time": 4.742412109000725 }, { "clip_ratio/high_max": 0.06250000093132257, "clip_ratio/high_mean": 0.03125000046566129, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03906250046566129, "entropy": 0.8168966248631477, "epoch": 0.00078, "grad_norm": 0.2832071781158447, "kl": 0.4783709868788719, "learning_rate": 9.99999918362141e-06, "loss": -0.0247, "step": 78, "step_time": 2.5049541869957466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.8380982279777527, "epoch": 0.00079, "frac_reward_zero_std": 0.25, "grad_norm": 0.5512309074401855, "kl": 0.3993317112326622, "learning_rate": 9.99999914428344e-06, "loss": -0.0351, "num_tokens": 767054.0, "reward": 0.48852404952049255, "reward_std": 0.3803469240665436, "rewards/rollout_reward_func/mean": 0.48852404952049255, "rewards/rollout_reward_func/std": 0.5152319669723511, "sampling/importance_sampling_ratio/max": 1.4971524477005005, "sampling/importance_sampling_ratio/mean": 0.9358680248260498, "sampling/importance_sampling_ratio/min": 0.14966991543769836, "sampling/sampling_logp_difference/max": 1.614027738571167, "sampling/sampling_logp_difference/mean": 0.12439900636672974, "step": 79, "step_time": 4.3489922159933485 }, { "clip_ratio/high_max": 0.09583333507180214, "clip_ratio/high_mean": 0.05937500111758709, "clip_ratio/low_mean": 0.05937499972060323, "clip_ratio/low_min": 0.02500000037252903, "clip_ratio/region_mean": 0.11875000363215804, "entropy": 0.817698635160923, "epoch": 0.0008, "grad_norm": 0.16421765089035034, "kl": 0.4066091515123844, "learning_rate": 9.999999104019872e-06, "loss": -0.039, "step": 80, "step_time": 2.04557593401114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 186.90625, "completions/mean_terminated_length": 186.90625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.8255016505718231, "epoch": 0.00081, "frac_reward_zero_std": 0.25, "grad_norm": 0.3710794448852539, "kl": 0.3990647718310356, "learning_rate": 9.999999062830703e-06, "loss": -0.0126, "num_tokens": 787635.0, "reward": 0.7066490650177002, "reward_std": 0.08639651536941528, "rewards/rollout_reward_func/mean": 0.7066490650177002, "rewards/rollout_reward_func/std": 0.3352658152580261, "sampling/importance_sampling_ratio/max": 1.3827565908432007, "sampling/importance_sampling_ratio/mean": 0.7661992907524109, "sampling/importance_sampling_ratio/min": 0.0879908949136734, "sampling/sampling_logp_difference/max": 1.219576120376587, "sampling/sampling_logp_difference/mean": 0.116633340716362, "step": 81, "step_time": 4.832038624997949 }, { "clip_ratio/high_max": 0.09955357294529676, "clip_ratio/high_mean": 0.05498512042686343, "clip_ratio/low_mean": 0.05811012024059892, "clip_ratio/low_min": 0.008928571827709675, "clip_ratio/region_mean": 0.11309524113312364, "entropy": 0.7891900613903999, "epoch": 0.00082, "grad_norm": 0.35570594668388367, "kl": 0.4531635195016861, "learning_rate": 9.999999020715937e-06, "loss": -0.0146, "step": 82, "step_time": 2.039056052984961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 198.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 146.84375, "completions/mean_terminated_length": 145.19354248046875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8926726058125496, "epoch": 0.00083, "frac_reward_zero_std": 0.25, "grad_norm": 0.38733991980552673, "kl": 0.5863618142902851, "learning_rate": 9.999998977675572e-06, "loss": -0.02, "num_tokens": 805902.0, "reward": 0.48894232511520386, "reward_std": 0.061345599591732025, "rewards/rollout_reward_func/mean": 0.48894232511520386, "rewards/rollout_reward_func/std": 0.2096993327140808, "sampling/importance_sampling_ratio/max": 1.2856169939041138, "sampling/importance_sampling_ratio/mean": 0.9331357479095459, "sampling/importance_sampling_ratio/min": 1.531731719239815e-08, "sampling/sampling_logp_difference/max": 2.246772289276123, "sampling/sampling_logp_difference/mean": 0.18322230875492096, "step": 83, "step_time": 4.867188295989763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03177083423361182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03177083423361182, "entropy": 0.8781721964478493, "epoch": 0.00084, "grad_norm": 0.43776145577430725, "kl": 0.6348253712058067, "learning_rate": 9.999998933709607e-06, "loss": -0.0208, "step": 84, "step_time": 2.5305817910048063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 160.84375, "completions/mean_terminated_length": 160.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6866025924682617, "epoch": 0.00085, "frac_reward_zero_std": 0.5, "grad_norm": 0.9612157344818115, "kl": 0.805127453058958, "learning_rate": 9.999998888818043e-06, "loss": -0.0131, "num_tokens": 825073.0, "reward": 0.38293272256851196, "reward_std": 0.040168844163417816, "rewards/rollout_reward_func/mean": 0.38293272256851196, "rewards/rollout_reward_func/std": 0.09271486103534698, "sampling/importance_sampling_ratio/max": 1.030728816986084, "sampling/importance_sampling_ratio/mean": 0.837787389755249, "sampling/importance_sampling_ratio/min": 0.07166565954685211, "sampling/sampling_logp_difference/max": 2.1112937927246094, "sampling/sampling_logp_difference/mean": 0.114211805164814, "step": 85, "step_time": 4.532980064017465 }, { "clip_ratio/high_max": 0.03720238246023655, "clip_ratio/high_mean": 0.018601191230118275, "clip_ratio/low_mean": 0.04538690624758601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06398809747770429, "entropy": 0.714985154569149, "epoch": 0.00086, "grad_norm": 0.165265291929245, "kl": 0.8111418820917606, "learning_rate": 9.99999884300088e-06, "loss": -0.0162, "step": 86, "step_time": 2.028771916011465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 195.78125, "completions/mean_terminated_length": 195.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6452316120266914, "epoch": 0.00087, "frac_reward_zero_std": 0.25, "grad_norm": 1.796452522277832, "kl": 0.43615154549479485, "learning_rate": 9.999998796258118e-06, "loss": 0.0113, "num_tokens": 845810.0, "reward": 0.1475721150636673, "reward_std": 0.34540414810180664, "rewards/rollout_reward_func/mean": 0.1475721150636673, "rewards/rollout_reward_func/std": 0.6274085640907288, "sampling/importance_sampling_ratio/max": 1.5104280710220337, "sampling/importance_sampling_ratio/mean": 0.933820366859436, "sampling/importance_sampling_ratio/min": 0.11543028056621552, "sampling/sampling_logp_difference/max": 0.7825107574462891, "sampling/sampling_logp_difference/mean": 0.07756420969963074, "step": 87, "step_time": 4.404235591006 }, { "clip_ratio/high_max": 0.09947916865348816, "clip_ratio/high_mean": 0.04973958432674408, "clip_ratio/low_mean": 0.0515625006519258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1013020845130086, "entropy": 0.6520713791251183, "epoch": 0.00088, "grad_norm": 0.27122628688812256, "kl": 0.43046148121356964, "learning_rate": 9.999998748589757e-06, "loss": 0.0038, "step": 88, "step_time": 2.0544557580069522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7453471720218658, "epoch": 0.00089, "frac_reward_zero_std": 0.25, "grad_norm": 0.8282150030136108, "kl": 0.4843899719417095, "learning_rate": 9.999998699995797e-06, "loss": 0.0373, "num_tokens": 864570.0, "reward": 0.5839086174964905, "reward_std": 0.06656036525964737, "rewards/rollout_reward_func/mean": 0.5839086174964905, "rewards/rollout_reward_func/std": 0.23198164999485016, "sampling/importance_sampling_ratio/max": 1.1255768537521362, "sampling/importance_sampling_ratio/mean": 0.8180328607559204, "sampling/importance_sampling_ratio/min": 0.07089342176914215, "sampling/sampling_logp_difference/max": 2.1518044471740723, "sampling/sampling_logp_difference/mean": 0.11078162491321564, "step": 89, "step_time": 4.879011905999505 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 0.7530738934874535, "epoch": 0.0009, "grad_norm": 0.8121122121810913, "kl": 0.469660684466362, "learning_rate": 9.999998650476238e-06, "loss": 0.0358, "step": 90, "step_time": 2.5155542560023605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5695987343788147, "epoch": 0.00091, "frac_reward_zero_std": 0.75, "grad_norm": 0.9353964328765869, "kl": 3.3630687445402145, "learning_rate": 9.99999860003108e-06, "loss": -0.0145, "num_tokens": 882972.0, "reward": 0.6544519662857056, "reward_std": 0.23603567481040955, "rewards/rollout_reward_func/mean": 0.6544519662857056, "rewards/rollout_reward_func/std": 0.4819580912590027, "sampling/importance_sampling_ratio/max": 1.1027783155441284, "sampling/importance_sampling_ratio/mean": 0.7951216101646423, "sampling/importance_sampling_ratio/min": 0.04506540670990944, "sampling/sampling_logp_difference/max": 2.2473206520080566, "sampling/sampling_logp_difference/mean": 0.1005232036113739, "step": 91, "step_time": 4.11563909099641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5864499434828758, "epoch": 0.00092, "grad_norm": 0.3778733015060425, "kl": 1.7065081261098385, "learning_rate": 9.999998548660322e-06, "loss": -0.0203, "step": 92, "step_time": 2.0339119149866747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 207.09375, "completions/mean_terminated_length": 207.09375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6696218103170395, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 1.358778715133667, "kl": 0.6092524416744709, "learning_rate": 9.999998496363967e-06, "loss": -0.0559, "num_tokens": 903959.0, "reward": 0.8870336413383484, "reward_std": 0.10681460052728653, "rewards/rollout_reward_func/mean": 0.8870336413383484, "rewards/rollout_reward_func/std": 0.3735429048538208, "sampling/importance_sampling_ratio/max": 1.3013865947723389, "sampling/importance_sampling_ratio/mean": 0.8152920007705688, "sampling/importance_sampling_ratio/min": 0.049400780349969864, "sampling/sampling_logp_difference/max": 1.8049614429473877, "sampling/sampling_logp_difference/mean": 0.08265130966901779, "step": 93, "step_time": 4.59683526999288 }, { "clip_ratio/high_max": 0.06770833488553762, "clip_ratio/high_mean": 0.03831845335662365, "clip_ratio/low_mean": 0.050595239736139774, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08891369169577956, "entropy": 0.6484490111470222, "epoch": 0.00094, "grad_norm": 0.24157476425170898, "kl": 0.5255782268941402, "learning_rate": 9.999998443142012e-06, "loss": -0.0614, "step": 94, "step_time": 2.473513686993101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6888524144887924, "epoch": 0.00095, "frac_reward_zero_std": 0.5, "grad_norm": 0.34081217646598816, "kl": 0.4898675009608269, "learning_rate": 9.999998388994457e-06, "loss": -0.0238, "num_tokens": 922687.0, "reward": 0.4064711630344391, "reward_std": 0.05422699451446533, "rewards/rollout_reward_func/mean": 0.4064711630344391, "rewards/rollout_reward_func/std": 0.14923517405986786, "sampling/importance_sampling_ratio/max": 1.0668442249298096, "sampling/importance_sampling_ratio/mean": 0.9052597284317017, "sampling/importance_sampling_ratio/min": 0.03153648599982262, "sampling/sampling_logp_difference/max": 2.3929879665374756, "sampling/sampling_logp_difference/mean": 0.07509651780128479, "step": 95, "step_time": 4.46552860099473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016964286100119352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016964286100119352, "entropy": 0.6755633428692818, "epoch": 0.00096, "grad_norm": 0.25045859813690186, "kl": 0.4954671338200569, "learning_rate": 9.999998333921305e-06, "loss": -0.0251, "step": 96, "step_time": 2.5494221539993305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 219.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.4276337325572968, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.6875065565109253, "kl": 0.4040883034467697, "learning_rate": 9.999998277922554e-06, "loss": -0.0811, "num_tokens": 943931.0, "reward": 0.6053268909454346, "reward_std": 0.3348211944103241, "rewards/rollout_reward_func/mean": 0.6053268909454346, "rewards/rollout_reward_func/std": 0.5739466547966003, "sampling/importance_sampling_ratio/max": 1.277268648147583, "sampling/importance_sampling_ratio/mean": 0.6385126113891602, "sampling/importance_sampling_ratio/min": 2.0449680970401236e-14, "sampling/sampling_logp_difference/max": 3.3090970516204834, "sampling/sampling_logp_difference/mean": 0.36151444911956787, "step": 97, "step_time": 4.719245998996485 }, { "clip_ratio/high_max": 0.051041667349636555, "clip_ratio/high_mean": 0.025520833674818277, "clip_ratio/low_mean": 0.023363096173852682, "clip_ratio/low_min": 0.004464285913854837, "clip_ratio/region_mean": 0.04888392984867096, "entropy": 1.4207729920744896, "epoch": 0.00098, "grad_norm": 0.5037866830825806, "kl": 0.46972375363111496, "learning_rate": 9.999998220998203e-06, "loss": -0.0838, "step": 98, "step_time": 2.052941718007787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.822303831577301, "epoch": 0.00099, "frac_reward_zero_std": 0.25, "grad_norm": 0.6283177733421326, "kl": 0.5569826737046242, "learning_rate": 9.999998163148253e-06, "loss": 0.0173, "num_tokens": 962493.0, "reward": 0.6644231081008911, "reward_std": 0.2569997012615204, "rewards/rollout_reward_func/mean": 0.6644231081008911, "rewards/rollout_reward_func/std": 0.37717780470848083, "sampling/importance_sampling_ratio/max": 1.1485214233398438, "sampling/importance_sampling_ratio/mean": 0.8087925910949707, "sampling/importance_sampling_ratio/min": 0.09446648508310318, "sampling/sampling_logp_difference/max": 1.2617559432983398, "sampling/sampling_logp_difference/mean": 0.12515072524547577, "step": 99, "step_time": 4.145315246001701 }, { "clip_ratio/high_max": 0.04791666753590107, "clip_ratio/high_mean": 0.029166667256504297, "clip_ratio/low_mean": 0.020312500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04947916744276881, "entropy": 0.802138514816761, "epoch": 0.001, "grad_norm": 0.2665424942970276, "kl": 0.604221660643816, "learning_rate": 9.999998104372703e-06, "loss": 0.0145, "step": 100, "step_time": 2.456551674004004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6926520615816116, "epoch": 0.00101, "frac_reward_zero_std": 0.25, "grad_norm": 0.5470882654190063, "kl": 0.44920739158988, "learning_rate": 9.999998044671557e-06, "loss": -0.0132, "num_tokens": 982417.0, "reward": 0.9277788400650024, "reward_std": 0.28332653641700745, "rewards/rollout_reward_func/mean": 0.9277788400650024, "rewards/rollout_reward_func/std": 0.5670634508132935, "sampling/importance_sampling_ratio/max": 1.1870566606521606, "sampling/importance_sampling_ratio/mean": 0.869269609451294, "sampling/importance_sampling_ratio/min": 0.004263794515281916, "sampling/sampling_logp_difference/max": 2.1036319732666016, "sampling/sampling_logp_difference/mean": 0.09115777164697647, "step": 101, "step_time": 5.024826976004988 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.016741071827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034598215483129025, "entropy": 0.6821975819766521, "epoch": 0.00102, "grad_norm": 0.15459786355495453, "kl": 0.44813675060868263, "learning_rate": 9.999997984044808e-06, "loss": -0.0155, "step": 102, "step_time": 2.055736098991474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5538862906396389, "epoch": 0.00103, "frac_reward_zero_std": 0.25, "grad_norm": 0.40532004833221436, "kl": 0.5724468342959881, "learning_rate": 9.999997922492466e-06, "loss": -0.0273, "num_tokens": 1002083.0, "reward": 0.7358653545379639, "reward_std": 0.11121632158756256, "rewards/rollout_reward_func/mean": 0.7358653545379639, "rewards/rollout_reward_func/std": 0.37248677015304565, "sampling/importance_sampling_ratio/max": 1.2789201736450195, "sampling/importance_sampling_ratio/mean": 0.8399708271026611, "sampling/importance_sampling_ratio/min": 0.1440175175666809, "sampling/sampling_logp_difference/max": 1.507929801940918, "sampling/sampling_logp_difference/mean": 0.09048566222190857, "step": 103, "step_time": 4.224575150990859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.5445560291409492, "epoch": 0.00104, "grad_norm": 0.3867877125740051, "kl": 0.6097253225743771, "learning_rate": 9.999997860014521e-06, "loss": -0.0275, "step": 104, "step_time": 2.015985037003702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4418506324291229, "epoch": 0.00105, "frac_reward_zero_std": 0.25, "grad_norm": 0.8487100601196289, "kl": 0.4343515895307064, "learning_rate": 9.99999779661098e-06, "loss": -0.0643, "num_tokens": 1024419.0, "reward": 0.8171393871307373, "reward_std": 0.3196622133255005, "rewards/rollout_reward_func/mean": 0.8171393871307373, "rewards/rollout_reward_func/std": 0.6874659061431885, "sampling/importance_sampling_ratio/max": 1.0813404321670532, "sampling/importance_sampling_ratio/mean": 0.8573789000511169, "sampling/importance_sampling_ratio/min": 0.08025678992271423, "sampling/sampling_logp_difference/max": 1.1067590713500977, "sampling/sampling_logp_difference/mean": 0.05147257074713707, "step": 105, "step_time": 4.7914735750091495 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.020647321827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028459821827709675, "entropy": 0.42016830295324326, "epoch": 0.00106, "grad_norm": 0.24882395565509796, "kl": 0.4628661014139652, "learning_rate": 9.999997732281837e-06, "loss": -0.0678, "step": 106, "step_time": 2.4897218479964067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4964956305921078, "epoch": 0.00107, "frac_reward_zero_std": 0.25, "grad_norm": 0.1429145187139511, "kl": 0.4684891365468502, "learning_rate": 9.999997667027097e-06, "loss": -0.0307, "num_tokens": 1044437.0, "reward": 0.7625721096992493, "reward_std": 0.248875230550766, "rewards/rollout_reward_func/mean": 0.7625721096992493, "rewards/rollout_reward_func/std": 0.5427582859992981, "sampling/importance_sampling_ratio/max": 1.1512364149093628, "sampling/importance_sampling_ratio/mean": 0.8781588077545166, "sampling/importance_sampling_ratio/min": 0.17762744426727295, "sampling/sampling_logp_difference/max": 1.2191526889801025, "sampling/sampling_logp_difference/mean": 0.05857245624065399, "step": 107, "step_time": 4.691359788987029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01145833358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 0.48295333608984947, "epoch": 0.00108, "grad_norm": 0.2051086723804474, "kl": 0.49512577056884766, "learning_rate": 9.999997600846756e-06, "loss": -0.0312, "step": 108, "step_time": 2.0191263419983443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.41809551417827606, "epoch": 0.00109, "frac_reward_zero_std": 0.5, "grad_norm": 0.11055595427751541, "kl": 0.5030613876879215, "learning_rate": 9.99999753374082e-06, "loss": 0.0016, "num_tokens": 1065849.0, "reward": 0.5163413286209106, "reward_std": 0.05554867535829544, "rewards/rollout_reward_func/mean": 0.5163413286209106, "rewards/rollout_reward_func/std": 0.3906899094581604, "sampling/importance_sampling_ratio/max": 1.093960165977478, "sampling/importance_sampling_ratio/mean": 0.8790750503540039, "sampling/importance_sampling_ratio/min": 0.109795942902565, "sampling/sampling_logp_difference/max": 1.7369632720947266, "sampling/sampling_logp_difference/mean": 0.04672713577747345, "step": 109, "step_time": 4.522850103989185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41253915429115295, "epoch": 0.0011, "grad_norm": 0.0950663834810257, "kl": 0.5275907441973686, "learning_rate": 9.999997465709281e-06, "loss": 0.0018, "step": 110, "step_time": 2.022758277016692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 256.09375, "completions/mean_terminated_length": 256.09375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3792910538613796, "epoch": 0.00111, "frac_reward_zero_std": 0.5, "grad_norm": 0.19756175577640533, "kl": 1.3130620419979095, "learning_rate": 9.999997396752146e-06, "loss": 0.0027, "num_tokens": 1088124.0, "reward": 0.34293267130851746, "reward_std": 0.23875732719898224, "rewards/rollout_reward_func/mean": 0.34293267130851746, "rewards/rollout_reward_func/std": 0.6251739859580994, "sampling/importance_sampling_ratio/max": 0.9974022507667542, "sampling/importance_sampling_ratio/mean": 0.8528587818145752, "sampling/importance_sampling_ratio/min": 0.05195707455277443, "sampling/sampling_logp_difference/max": 1.8544869422912598, "sampling/sampling_logp_difference/mean": 0.04754097759723663, "step": 111, "step_time": 5.145367717006593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3766138106584549, "epoch": 0.00112, "grad_norm": 0.20301929116249084, "kl": 1.3508930206298828, "learning_rate": 9.999997326869412e-06, "loss": 0.0028, "step": 112, "step_time": 2.054332682986569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 222.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.47412626072764397, "epoch": 0.00113, "frac_reward_zero_std": 0.5, "grad_norm": 0.09813446551561356, "kl": 0.5798238180577755, "learning_rate": 9.99999725606108e-06, "loss": -0.008, "num_tokens": 1109370.0, "reward": 0.7972356081008911, "reward_std": 0.06270335614681244, "rewards/rollout_reward_func/mean": 0.7972356081008911, "rewards/rollout_reward_func/std": 0.4464150071144104, "sampling/importance_sampling_ratio/max": 1.0015076398849487, "sampling/importance_sampling_ratio/mean": 0.8779930472373962, "sampling/importance_sampling_ratio/min": 0.036356884986162186, "sampling/sampling_logp_difference/max": 1.642662525177002, "sampling/sampling_logp_difference/mean": 0.05378018319606781, "step": 113, "step_time": 4.946420093001507 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.47480133548378944, "epoch": 0.00114, "grad_norm": 0.06914636492729187, "kl": 0.5566131509840488, "learning_rate": 9.999997184327149e-06, "loss": -0.0083, "step": 114, "step_time": 2.0107204860032653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2819363921880722, "epoch": 0.00115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012708213180303574, "kl": 0.48289668560028076, "learning_rate": 9.999997111667619e-06, "loss": 0.0018, "num_tokens": 1129610.0, "reward": 0.9381923079490662, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9381923079490662, "rewards/rollout_reward_func/std": 0.3043191432952881, "sampling/importance_sampling_ratio/max": 1.0283199548721313, "sampling/importance_sampling_ratio/mean": 0.9460251927375793, "sampling/importance_sampling_ratio/min": 0.9001151919364929, "sampling/sampling_logp_difference/max": 0.0630621612071991, "sampling/sampling_logp_difference/mean": 0.01728997379541397, "step": 115, "step_time": 4.379003516987723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28190871328115463, "epoch": 0.00116, "grad_norm": 0.0013680134434252977, "kl": 0.4824964441359043, "learning_rate": 9.999997038082489e-06, "loss": 0.0018, "step": 116, "step_time": 2.0460987710030167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2728460915386677, "epoch": 0.00117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007397745503112674, "kl": 0.4727088212966919, "learning_rate": 9.999996963571762e-06, "loss": 0.0016, "num_tokens": 1149186.0, "reward": 1.013076901435852, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.013076901435852, "rewards/rollout_reward_func/std": 0.0662190392613411, "sampling/importance_sampling_ratio/max": 0.9962987303733826, "sampling/importance_sampling_ratio/mean": 0.9412692785263062, "sampling/importance_sampling_ratio/min": 0.9194550514221191, "sampling/sampling_logp_difference/max": 0.03471599519252777, "sampling/sampling_logp_difference/mean": 0.0157632939517498, "step": 117, "step_time": 4.679092300997581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2736970782279968, "epoch": 0.00118, "grad_norm": 0.0007739914581179619, "kl": 0.47249484062194824, "learning_rate": 9.999996888135438e-06, "loss": 0.0016, "step": 118, "step_time": 2.0180456550151575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 212.8125, "completions/mean_terminated_length": 212.8125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.512397401034832, "epoch": 0.00119, "frac_reward_zero_std": 0.75, "grad_norm": 0.2566003203392029, "kl": 0.4521557055413723, "learning_rate": 9.999996811773512e-06, "loss": -0.006, "num_tokens": 1170164.0, "reward": 0.7607860565185547, "reward_std": 0.028340544551610947, "rewards/rollout_reward_func/mean": 0.7607860565185547, "rewards/rollout_reward_func/std": 0.45660683512687683, "sampling/importance_sampling_ratio/max": 1.1244248151779175, "sampling/importance_sampling_ratio/mean": 0.8184391260147095, "sampling/importance_sampling_ratio/min": 0.025854595005512238, "sampling/sampling_logp_difference/max": 2.134464979171753, "sampling/sampling_logp_difference/mean": 0.06455962359905243, "step": 119, "step_time": 5.023383121981169 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02864583395421505, "entropy": 0.5003126822412014, "epoch": 0.0012, "grad_norm": 0.04881490021944046, "kl": 0.4478840231895447, "learning_rate": 9.999996734485989e-06, "loss": -0.0065, "step": 120, "step_time": 2.0445693470101105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 232.21875, "completions/mean_terminated_length": 232.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.45351241901516914, "epoch": 0.00121, "frac_reward_zero_std": 0.25, "grad_norm": 0.27817732095718384, "kl": 1.0669042877852917, "learning_rate": 9.999996656272867e-06, "loss": 0.0992, "num_tokens": 1191267.0, "reward": 0.2810577154159546, "reward_std": 0.09675372391939163, "rewards/rollout_reward_func/mean": 0.2810577154159546, "rewards/rollout_reward_func/std": 0.36419445276260376, "sampling/importance_sampling_ratio/max": 0.9445380568504333, "sampling/importance_sampling_ratio/mean": 0.8021700978279114, "sampling/importance_sampling_ratio/min": 0.06572501361370087, "sampling/sampling_logp_difference/max": 1.4142560958862305, "sampling/sampling_logp_difference/mean": 0.0558214969933033, "step": 121, "step_time": 4.77907095100818 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0096726194024086, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0096726194024086, "entropy": 0.45768024027347565, "epoch": 0.00122, "grad_norm": 0.2350851148366928, "kl": 0.9869546517729759, "learning_rate": 9.999996577134147e-06, "loss": 0.0985, "step": 122, "step_time": 2.0492757960018935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 264.0, "completions/mean_terminated_length": 264.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.3218678832054138, "epoch": 0.00123, "frac_reward_zero_std": 0.75, "grad_norm": 0.6779500842094421, "kl": 0.43379848077893257, "learning_rate": 9.999996497069828e-06, "loss": 0.0071, "num_tokens": 1213171.0, "reward": 0.471115380525589, "reward_std": 0.017804231494665146, "rewards/rollout_reward_func/mean": 0.471115380525589, "rewards/rollout_reward_func/std": 0.295259952545166, "sampling/importance_sampling_ratio/max": 1.0691646337509155, "sampling/importance_sampling_ratio/mean": 0.9071656465530396, "sampling/importance_sampling_ratio/min": 0.6794978380203247, "sampling/sampling_logp_difference/max": 0.29879677295684814, "sampling/sampling_logp_difference/mean": 0.017632484436035156, "step": 123, "step_time": 5.268046334997052 }, { "clip_ratio/high_max": 0.0193452388048172, "clip_ratio/high_mean": 0.0096726194024086, "clip_ratio/low_mean": 0.0290178582072258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0386904776096344, "entropy": 0.3500271327793598, "epoch": 0.00124, "grad_norm": 0.02333991602063179, "kl": 0.43045319989323616, "learning_rate": 9.99999641607991e-06, "loss": 0.0047, "step": 124, "step_time": 2.4743334329978097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.42692895978689194, "epoch": 0.00125, "frac_reward_zero_std": 0.5, "grad_norm": 0.4999599754810333, "kl": 0.4503517113626003, "learning_rate": 9.999996334164396e-06, "loss": -0.0206, "num_tokens": 1232691.0, "reward": 0.7733653783798218, "reward_std": 0.2651650309562683, "rewards/rollout_reward_func/mean": 0.7733653783798218, "rewards/rollout_reward_func/std": 0.5875195264816284, "sampling/importance_sampling_ratio/max": 1.0081208944320679, "sampling/importance_sampling_ratio/mean": 0.9217312335968018, "sampling/importance_sampling_ratio/min": 0.2804102897644043, "sampling/sampling_logp_difference/max": 0.744787335395813, "sampling/sampling_logp_difference/mean": 0.025926943868398666, "step": 125, "step_time": 4.477612863011018 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "entropy": 0.4356772042810917, "epoch": 0.00126, "grad_norm": 0.14146706461906433, "kl": 0.45405688136816025, "learning_rate": 9.999996251323281e-06, "loss": -0.0226, "step": 126, "step_time": 2.043730356002925 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5481557995080948, "epoch": 0.00127, "frac_reward_zero_std": 0.5, "grad_norm": 0.3707337975502014, "kl": 0.5045122839510441, "learning_rate": 9.99999616755657e-06, "loss": 0.0432, "num_tokens": 1252611.0, "reward": 0.49551922082901, "reward_std": 0.24945825338363647, "rewards/rollout_reward_func/mean": 0.49551922082901, "rewards/rollout_reward_func/std": 0.7359138131141663, "sampling/importance_sampling_ratio/max": 1.071092128753662, "sampling/importance_sampling_ratio/mean": 0.8959816098213196, "sampling/importance_sampling_ratio/min": 0.29515978693962097, "sampling/sampling_logp_difference/max": 0.8074710965156555, "sampling/sampling_logp_difference/mean": 0.04922895506024361, "step": 127, "step_time": 4.629729659020086 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5741940587759018, "epoch": 0.00128, "grad_norm": 0.427379310131073, "kl": 0.49634506925940514, "learning_rate": 9.999996082864259e-06, "loss": 0.0423, "step": 128, "step_time": 2.041510464005114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 149.78125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6609327346086502, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.23494519293308258, "kl": 0.5831779316067696, "learning_rate": 9.99999599724635e-06, "loss": -0.0208, "num_tokens": 1271332.0, "reward": 0.7279471158981323, "reward_std": 0.25706613063812256, "rewards/rollout_reward_func/mean": 0.7279471158981323, "rewards/rollout_reward_func/std": 0.513251543045044, "sampling/importance_sampling_ratio/max": 1.0057820081710815, "sampling/importance_sampling_ratio/mean": 0.8082960844039917, "sampling/importance_sampling_ratio/min": 0.1452922374010086, "sampling/sampling_logp_difference/max": 1.296034574508667, "sampling/sampling_logp_difference/mean": 0.07588562369346619, "step": 129, "step_time": 4.694458776000829 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.687989242374897, "epoch": 0.0013, "grad_norm": 0.24075686931610107, "kl": 0.5582081452012062, "learning_rate": 9.999995910702842e-06, "loss": -0.0211, "step": 130, "step_time": 2.524534219002817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7302071303129196, "epoch": 0.00131, "frac_reward_zero_std": 0.25, "grad_norm": 0.1257123202085495, "kl": 0.6189752742648125, "learning_rate": 9.999995823233738e-06, "loss": -0.0256, "num_tokens": 1291392.0, "reward": 0.45788463950157166, "reward_std": 0.26414793729782104, "rewards/rollout_reward_func/mean": 0.45788463950157166, "rewards/rollout_reward_func/std": 0.528126060962677, "sampling/importance_sampling_ratio/max": 0.949287474155426, "sampling/importance_sampling_ratio/mean": 0.8184092044830322, "sampling/importance_sampling_ratio/min": 0.17721177637577057, "sampling/sampling_logp_difference/max": 1.1926491260528564, "sampling/sampling_logp_difference/mean": 0.06649895757436752, "step": 131, "step_time": 4.155932039990148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.754389077425003, "epoch": 0.00132, "grad_norm": 0.1393846720457077, "kl": 0.5981959588825703, "learning_rate": 9.999995734839033e-06, "loss": -0.0258, "step": 132, "step_time": 2.0240524750115583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 215.6875, "completions/mean_terminated_length": 215.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7712134942412376, "epoch": 0.00133, "frac_reward_zero_std": 0.25, "grad_norm": 0.24594715237617493, "kl": 0.49211446568369865, "learning_rate": 9.999995645518729e-06, "loss": -0.0301, "num_tokens": 1312222.0, "reward": 0.636859118938446, "reward_std": 0.05868319422006607, "rewards/rollout_reward_func/mean": 0.636859118938446, "rewards/rollout_reward_func/std": 0.40322309732437134, "sampling/importance_sampling_ratio/max": 0.9578871726989746, "sampling/importance_sampling_ratio/mean": 0.7699468731880188, "sampling/importance_sampling_ratio/min": 0.11580881476402283, "sampling/sampling_logp_difference/max": 1.1318761110305786, "sampling/sampling_logp_difference/mean": 0.07021896541118622, "step": 133, "step_time": 4.602974660992913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7888321056962013, "epoch": 0.00134, "grad_norm": 0.2196965217590332, "kl": 0.48548898100852966, "learning_rate": 9.999995555272829e-06, "loss": -0.0309, "step": 134, "step_time": 2.0640504689799855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6513267494738102, "epoch": 0.00135, "frac_reward_zero_std": 0.75, "grad_norm": 0.1891738772392273, "kl": 0.4430716671049595, "learning_rate": 9.99999546410133e-06, "loss": -0.0028, "num_tokens": 1332198.0, "reward": 0.7671586275100708, "reward_std": 0.19627219438552856, "rewards/rollout_reward_func/mean": 0.7671586275100708, "rewards/rollout_reward_func/std": 0.4762621521949768, "sampling/importance_sampling_ratio/max": 1.0112464427947998, "sampling/importance_sampling_ratio/mean": 0.8341033458709717, "sampling/importance_sampling_ratio/min": 0.2246386557817459, "sampling/sampling_logp_difference/max": 0.7838411331176758, "sampling/sampling_logp_difference/mean": 0.04772007465362549, "step": 135, "step_time": 4.996164809999755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6556721776723862, "epoch": 0.00136, "grad_norm": 0.1809471845626831, "kl": 0.444995753467083, "learning_rate": 9.999995372004231e-06, "loss": -0.0028, "step": 136, "step_time": 2.5355993910125108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 177.53125, "completions/mean_terminated_length": 177.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8419390171766281, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.45837846398353577, "kl": 0.4910569526255131, "learning_rate": 9.999995278981537e-06, "loss": -0.019, "num_tokens": 1352263.0, "reward": 0.5626298189163208, "reward_std": 0.3070739507675171, "rewards/rollout_reward_func/mean": 0.5626298189163208, "rewards/rollout_reward_func/std": 0.4722348153591156, "sampling/importance_sampling_ratio/max": 1.0104562044143677, "sampling/importance_sampling_ratio/mean": 0.7761090993881226, "sampling/importance_sampling_ratio/min": 0.2682415843009949, "sampling/sampling_logp_difference/max": 0.7185558080673218, "sampling/sampling_logp_difference/mean": 0.07213786989450455, "step": 137, "step_time": 4.572832166995795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8488463088870049, "epoch": 0.00138, "grad_norm": 0.4253222346305847, "kl": 0.4996301122009754, "learning_rate": 9.999995185033245e-06, "loss": -0.0206, "step": 138, "step_time": 2.0346182369830785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 225.59375, "completions/mean_terminated_length": 225.59375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.699290819466114, "epoch": 0.00139, "frac_reward_zero_std": 0.5, "grad_norm": 0.1965785175561905, "kl": 0.5273459777235985, "learning_rate": 9.999995090159351e-06, "loss": 0.0014, "num_tokens": 1373786.0, "reward": 0.7027307748794556, "reward_std": 0.14828208088874817, "rewards/rollout_reward_func/mean": 0.7027307748794556, "rewards/rollout_reward_func/std": 0.3861351013183594, "sampling/importance_sampling_ratio/max": 0.9374274015426636, "sampling/importance_sampling_ratio/mean": 0.7445811033248901, "sampling/importance_sampling_ratio/min": 0.10102017968893051, "sampling/sampling_logp_difference/max": 1.6511006355285645, "sampling/sampling_logp_difference/mean": 0.06727895885705948, "step": 139, "step_time": 4.662857297000301 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.7061733528971672, "epoch": 0.0014, "grad_norm": 0.1778503656387329, "kl": 0.524321898818016, "learning_rate": 9.999994994359862e-06, "loss": 0.0008, "step": 140, "step_time": 2.5170528940070653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.684749111533165, "epoch": 0.00141, "frac_reward_zero_std": 0.5, "grad_norm": 0.5792955160140991, "kl": 0.44160402938723564, "learning_rate": 9.999994897634775e-06, "loss": 0.0157, "num_tokens": 1393906.0, "reward": 0.604442298412323, "reward_std": 0.0593813955783844, "rewards/rollout_reward_func/mean": 0.604442298412323, "rewards/rollout_reward_func/std": 0.38425788283348083, "sampling/importance_sampling_ratio/max": 0.9874537587165833, "sampling/importance_sampling_ratio/mean": 0.813355565071106, "sampling/importance_sampling_ratio/min": 0.3404487371444702, "sampling/sampling_logp_difference/max": 0.6618630886077881, "sampling/sampling_logp_difference/mean": 0.050198450684547424, "step": 141, "step_time": 5.105968815012602 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.035937500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04040178610011935, "entropy": 0.7233639135956764, "epoch": 0.00142, "grad_norm": 0.19693605601787567, "kl": 0.4301278702914715, "learning_rate": 9.999994799984088e-06, "loss": 0.0124, "step": 142, "step_time": 2.0658846519945655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 252.1875, "completions/mean_terminated_length": 252.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.8234691359102726, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.30000877380371094, "kl": 0.6108789965510368, "learning_rate": 9.999994701407805e-06, "loss": -0.0596, "num_tokens": 1415920.0, "reward": 0.8758317232131958, "reward_std": 0.3388770818710327, "rewards/rollout_reward_func/mean": 0.8758317232131958, "rewards/rollout_reward_func/std": 0.6580694317817688, "sampling/importance_sampling_ratio/max": 0.9229819178581238, "sampling/importance_sampling_ratio/mean": 0.7293728590011597, "sampling/importance_sampling_ratio/min": 0.013389226980507374, "sampling/sampling_logp_difference/max": 1.8118468523025513, "sampling/sampling_logp_difference/mean": 0.07523775100708008, "step": 143, "step_time": 4.86981294699217 }, { "clip_ratio/high_max": 0.02566964365541935, "clip_ratio/high_mean": 0.012834821827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012834821827709675, "entropy": 0.8289989233016968, "epoch": 0.00144, "grad_norm": 0.15021733939647675, "kl": 0.6073577925562859, "learning_rate": 9.999994601905921e-06, "loss": -0.0608, "step": 144, "step_time": 2.0816469009805587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.4375, "completions/mean_terminated_length": 216.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.647574283182621, "epoch": 0.00145, "frac_reward_zero_std": 0.5, "grad_norm": 0.3260931968688965, "kl": 0.6548956781625748, "learning_rate": 9.999994501478441e-06, "loss": -0.0103, "num_tokens": 1436662.0, "reward": 0.7637596130371094, "reward_std": 0.17936696112155914, "rewards/rollout_reward_func/mean": 0.7637596130371094, "rewards/rollout_reward_func/std": 0.4395952820777893, "sampling/importance_sampling_ratio/max": 0.9285555481910706, "sampling/importance_sampling_ratio/mean": 0.8065959215164185, "sampling/importance_sampling_ratio/min": 0.07960981130599976, "sampling/sampling_logp_difference/max": 1.0736305713653564, "sampling/sampling_logp_difference/mean": 0.048066116869449615, "step": 145, "step_time": 4.594964988005813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.649588443338871, "epoch": 0.00146, "grad_norm": 0.33694761991500854, "kl": 0.6861728094518185, "learning_rate": 9.999994400125363e-06, "loss": -0.0104, "step": 146, "step_time": 2.4872593590116594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.7206122279167175, "epoch": 0.00147, "frac_reward_zero_std": 0.5, "grad_norm": 0.3406941294670105, "kl": 0.4401327408850193, "learning_rate": 9.999994297846687e-06, "loss": -0.0034, "num_tokens": 1456574.0, "reward": 0.6487019062042236, "reward_std": 0.2752254009246826, "rewards/rollout_reward_func/mean": 0.6487019062042236, "rewards/rollout_reward_func/std": 0.6059682965278625, "sampling/importance_sampling_ratio/max": 1.0095070600509644, "sampling/importance_sampling_ratio/mean": 0.799188494682312, "sampling/importance_sampling_ratio/min": 0.07893651723861694, "sampling/sampling_logp_difference/max": 1.0615453720092773, "sampling/sampling_logp_difference/mean": 0.05821433663368225, "step": 147, "step_time": 5.014773844006413 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014322916977107525, "entropy": 0.7316708862781525, "epoch": 0.00148, "grad_norm": 0.10873699933290482, "kl": 0.43587783724069595, "learning_rate": 9.999994194642413e-06, "loss": -0.0052, "step": 148, "step_time": 2.03321829700144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 218.5625, "completions/mean_terminated_length": 218.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.8663435205817223, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.5704498887062073, "kl": 0.6059122048318386, "learning_rate": 9.99999409051254e-06, "loss": 0.0137, "num_tokens": 1477544.0, "reward": 0.6078639030456543, "reward_std": 0.17057368159294128, "rewards/rollout_reward_func/mean": 0.6078639030456543, "rewards/rollout_reward_func/std": 0.47632309794425964, "sampling/importance_sampling_ratio/max": 0.930121660232544, "sampling/importance_sampling_ratio/mean": 0.7689530253410339, "sampling/importance_sampling_ratio/min": 0.0006234173779375851, "sampling/sampling_logp_difference/max": 2.5743026733398438, "sampling/sampling_logp_difference/mean": 0.08864792436361313, "step": 149, "step_time": 4.625625674998446 }, { "clip_ratio/high_max": 0.05654762126505375, "clip_ratio/high_mean": 0.028273810632526875, "clip_ratio/low_mean": 0.016741071827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04501488246023655, "entropy": 0.8982884883880615, "epoch": 0.0015, "grad_norm": 0.35737839341163635, "kl": 0.5735831186175346, "learning_rate": 9.999993985457072e-06, "loss": 0.0103, "step": 150, "step_time": 2.0547205659895553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6875264756381512, "epoch": 0.00151, "frac_reward_zero_std": 0.25, "grad_norm": 0.43361929059028625, "kl": 0.42352350801229477, "learning_rate": 9.999993879476003e-06, "loss": -0.0251, "num_tokens": 1497484.0, "reward": 0.511346161365509, "reward_std": 0.07516682147979736, "rewards/rollout_reward_func/mean": 0.511346161365509, "rewards/rollout_reward_func/std": 0.3944481611251831, "sampling/importance_sampling_ratio/max": 0.9507820010185242, "sampling/importance_sampling_ratio/mean": 0.8266797065734863, "sampling/importance_sampling_ratio/min": 0.02416560985147953, "sampling/sampling_logp_difference/max": 2.578049421310425, "sampling/sampling_logp_difference/mean": 0.05102338641881943, "step": 151, "step_time": 4.723864033003338 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.01666666753590107, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022916667629033327, "entropy": 0.7423437386751175, "epoch": 0.00152, "grad_norm": 0.3712540864944458, "kl": 0.42100778594613075, "learning_rate": 9.999993772569339e-06, "loss": -0.0264, "step": 152, "step_time": 2.0846110790153034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.7353866100311279, "epoch": 0.00153, "frac_reward_zero_std": 0.25, "grad_norm": 0.575567901134491, "kl": 0.4535021632909775, "learning_rate": 9.999993664737076e-06, "loss": 0.0008, "num_tokens": 1516088.0, "reward": 0.8193268775939941, "reward_std": 0.06941214203834534, "rewards/rollout_reward_func/mean": 0.8193268775939941, "rewards/rollout_reward_func/std": 0.12403899431228638, "sampling/importance_sampling_ratio/max": 0.9917429089546204, "sampling/importance_sampling_ratio/mean": 0.8505151271820068, "sampling/importance_sampling_ratio/min": 0.29596826434135437, "sampling/sampling_logp_difference/max": 0.7507830858230591, "sampling/sampling_logp_difference/mean": 0.05599547177553177, "step": 153, "step_time": 4.793890402987017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.7272352203726768, "epoch": 0.00154, "grad_norm": 0.4695049822330475, "kl": 0.46193474158644676, "learning_rate": 9.999993555979215e-06, "loss": -0.0004, "step": 154, "step_time": 2.0506472829874838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7448698580265045, "epoch": 0.00155, "frac_reward_zero_std": 0.25, "grad_norm": 0.08445879817008972, "kl": 1.002717524766922, "learning_rate": 9.999993446295754e-06, "loss": -0.0145, "num_tokens": 1534626.0, "reward": 0.6361105442047119, "reward_std": 0.14019274711608887, "rewards/rollout_reward_func/mean": 0.6361105442047119, "rewards/rollout_reward_func/std": 0.3009372353553772, "sampling/importance_sampling_ratio/max": 0.9405027627944946, "sampling/importance_sampling_ratio/mean": 0.7972164154052734, "sampling/importance_sampling_ratio/min": 0.055187586694955826, "sampling/sampling_logp_difference/max": 1.9447739124298096, "sampling/sampling_logp_difference/mean": 0.0768483430147171, "step": 155, "step_time": 4.251241916012077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.7304767891764641, "epoch": 0.00156, "grad_norm": 0.08169475942850113, "kl": 1.0930012539029121, "learning_rate": 9.999993335686697e-06, "loss": -0.0146, "step": 156, "step_time": 2.0273106199965696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.7061441913247108, "epoch": 0.00157, "frac_reward_zero_std": 0.25, "grad_norm": 0.7083742618560791, "kl": 0.41956802085042, "learning_rate": 9.999993224152043e-06, "loss": -0.0211, "num_tokens": 1554682.0, "reward": 0.5907163619995117, "reward_std": 0.12547175586223602, "rewards/rollout_reward_func/mean": 0.5907163619995117, "rewards/rollout_reward_func/std": 0.39336398243904114, "sampling/importance_sampling_ratio/max": 1.1162008047103882, "sampling/importance_sampling_ratio/mean": 0.8719906806945801, "sampling/importance_sampling_ratio/min": 0.0029009219724684954, "sampling/sampling_logp_difference/max": 1.8252474069595337, "sampling/sampling_logp_difference/mean": 0.06561589241027832, "step": 157, "step_time": 5.5390508029886405 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.04985119169577956, "clip_ratio/low_min": 0.0386904776096344, "clip_ratio/region_mean": 0.06547619169577956, "entropy": 0.6873180791735649, "epoch": 0.00158, "grad_norm": 0.18927226960659027, "kl": 0.41184067726135254, "learning_rate": 9.999993111691792e-06, "loss": -0.0252, "step": 158, "step_time": 2.076540167981875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.701908778399229, "epoch": 0.00159, "frac_reward_zero_std": 0.25, "grad_norm": 1.3011517524719238, "kl": 0.5219414457678795, "learning_rate": 9.999992998305941e-06, "loss": -0.047, "num_tokens": 1576014.0, "reward": 0.9826827049255371, "reward_std": 0.09501535445451736, "rewards/rollout_reward_func/mean": 0.9826827049255371, "rewards/rollout_reward_func/std": 0.2405303567647934, "sampling/importance_sampling_ratio/max": 0.9873376488685608, "sampling/importance_sampling_ratio/mean": 0.7861340045928955, "sampling/importance_sampling_ratio/min": 0.08071480691432953, "sampling/sampling_logp_difference/max": 1.8456822633743286, "sampling/sampling_logp_difference/mean": 0.05923395976424217, "step": 159, "step_time": 5.397980513997027 }, { "clip_ratio/high_max": 0.06324404943734407, "clip_ratio/high_mean": 0.04308035736903548, "clip_ratio/low_mean": 0.08809523889794946, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.13117559580132365, "entropy": 0.6871124841272831, "epoch": 0.0016, "grad_norm": 0.13591617345809937, "kl": 0.6085598468780518, "learning_rate": 9.999992883994494e-06, "loss": -0.0538, "step": 160, "step_time": 2.0471578639990184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.3125, "completions/mean_terminated_length": 216.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6255942992866039, "epoch": 0.00161, "frac_reward_zero_std": 0.5, "grad_norm": 0.12944456934928894, "kl": 0.7876702286303043, "learning_rate": 9.999992768757449e-06, "loss": -0.0023, "num_tokens": 1597072.0, "reward": 0.6469038724899292, "reward_std": 0.07522162795066833, "rewards/rollout_reward_func/mean": 0.6469038724899292, "rewards/rollout_reward_func/std": 0.36303627490997314, "sampling/importance_sampling_ratio/max": 0.9743785858154297, "sampling/importance_sampling_ratio/mean": 0.7579578161239624, "sampling/importance_sampling_ratio/min": 0.0494304783642292, "sampling/sampling_logp_difference/max": 2.216143846511841, "sampling/sampling_logp_difference/mean": 0.07042735815048218, "step": 161, "step_time": 4.6982889230203 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03050595335662365, "entropy": 0.6464008577167988, "epoch": 0.00162, "grad_norm": 0.44611185789108276, "kl": 0.9110904149711132, "learning_rate": 9.999992652594807e-06, "loss": -0.0006, "step": 162, "step_time": 2.082813339002314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3570263050496578, "epoch": 0.00163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009318491211161017, "kl": 0.552034929394722, "learning_rate": 9.999992535506568e-06, "loss": 0.0016, "num_tokens": 1615392.0, "reward": 0.7130768895149231, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7130768895149231, "rewards/rollout_reward_func/std": 0.39166656136512756, "sampling/importance_sampling_ratio/max": 0.9508215188980103, "sampling/importance_sampling_ratio/mean": 0.9228674173355103, "sampling/importance_sampling_ratio/min": 0.8820335268974304, "sampling/sampling_logp_difference/max": 0.08439537137746811, "sampling/sampling_logp_difference/mean": 0.019341936334967613, "step": 163, "step_time": 4.534401345990773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35062895342707634, "epoch": 0.00164, "grad_norm": 0.0008634374244138598, "kl": 0.5525282397866249, "learning_rate": 9.99999241749273e-06, "loss": 0.0016, "step": 164, "step_time": 2.4684702830199967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 219.4375, "completions/mean_terminated_length": 219.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.48554346710443497, "epoch": 0.00165, "frac_reward_zero_std": 0.5, "grad_norm": 0.1306200474500656, "kl": 1.7252511717379093, "learning_rate": 9.999992298553295e-06, "loss": -0.0522, "num_tokens": 1636830.0, "reward": 0.9830529093742371, "reward_std": 0.18305987119674683, "rewards/rollout_reward_func/mean": 0.9830529093742371, "rewards/rollout_reward_func/std": 0.5063679218292236, "sampling/importance_sampling_ratio/max": 0.9284272789955139, "sampling/importance_sampling_ratio/mean": 0.8270502686500549, "sampling/importance_sampling_ratio/min": 0.018336143344640732, "sampling/sampling_logp_difference/max": 3.1804521083831787, "sampling/sampling_logp_difference/mean": 0.06758012622594833, "step": 165, "step_time": 4.651601577999827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48403075709939003, "epoch": 0.00166, "grad_norm": 0.1331038773059845, "kl": 1.7347681485116482, "learning_rate": 9.999992178688262e-06, "loss": -0.0522, "step": 166, "step_time": 2.0506485809892183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 274.6875, "completions/mean_terminated_length": 274.6875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.8682501427829266, "epoch": 0.00167, "frac_reward_zero_std": 0.5, "grad_norm": 0.01885235868394375, "kl": 0.41433175653219223, "learning_rate": 9.999992057897633e-06, "loss": -0.0595, "num_tokens": 1660116.0, "reward": 0.9952437877655029, "reward_std": 0.019109569489955902, "rewards/rollout_reward_func/mean": 0.9952437877655029, "rewards/rollout_reward_func/std": 0.35324567556381226, "sampling/importance_sampling_ratio/max": 0.9754359722137451, "sampling/importance_sampling_ratio/mean": 0.8640503883361816, "sampling/importance_sampling_ratio/min": 1.4846269066965018e-17, "sampling/sampling_logp_difference/max": 3.0720055103302, "sampling/sampling_logp_difference/mean": 0.2530848979949951, "step": 167, "step_time": 5.160785936001048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8690757267177105, "epoch": 0.00168, "grad_norm": 0.01904449611902237, "kl": 0.4140298254787922, "learning_rate": 9.999991936181406e-06, "loss": -0.0595, "step": 168, "step_time": 2.1266296550020343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5766736529767513, "epoch": 0.00169, "frac_reward_zero_std": 0.25, "grad_norm": 0.3397831618785858, "kl": 0.7836117297410965, "learning_rate": 9.999991813539582e-06, "loss": -0.022, "num_tokens": 1681948.0, "reward": 0.576471209526062, "reward_std": 0.24862347543239594, "rewards/rollout_reward_func/mean": 0.576471209526062, "rewards/rollout_reward_func/std": 0.6121658682823181, "sampling/importance_sampling_ratio/max": 1.0242105722427368, "sampling/importance_sampling_ratio/mean": 0.8163847327232361, "sampling/importance_sampling_ratio/min": 0.004432046785950661, "sampling/sampling_logp_difference/max": 2.676398992538452, "sampling/sampling_logp_difference/mean": 0.07703980058431625, "step": 169, "step_time": 5.313434961986786 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.5686189271509647, "epoch": 0.0017, "grad_norm": 0.25804463028907776, "kl": 0.7491049654781818, "learning_rate": 9.999991689972159e-06, "loss": -0.0224, "step": 170, "step_time": 2.513204699011112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2986747808754444, "epoch": 0.00171, "frac_reward_zero_std": 1.0, "grad_norm": 0.002224245108664036, "kl": 0.46086184680461884, "learning_rate": 9.999991565479141e-06, "loss": 0.0018, "num_tokens": 1702388.0, "reward": 0.6837692260742188, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6837692260742188, "rewards/rollout_reward_func/std": 0.5975348949432373, "sampling/importance_sampling_ratio/max": 1.0210628509521484, "sampling/importance_sampling_ratio/mean": 0.9489431381225586, "sampling/importance_sampling_ratio/min": 0.8955190777778625, "sampling/sampling_logp_difference/max": 0.1751788854598999, "sampling/sampling_logp_difference/mean": 0.01916920393705368, "step": 171, "step_time": 4.574438079005631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3028823137283325, "epoch": 0.00172, "grad_norm": 0.0024803997948765755, "kl": 0.4596707336604595, "learning_rate": 9.999991440060524e-06, "loss": 0.0018, "step": 172, "step_time": 2.0558304650039645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.42419765144586563, "epoch": 0.00173, "frac_reward_zero_std": 0.5, "grad_norm": 0.0789625495672226, "kl": 0.9798491150140762, "learning_rate": 9.99999131371631e-06, "loss": 0.0051, "num_tokens": 1722042.0, "reward": 0.5525480508804321, "reward_std": 0.06051202490925789, "rewards/rollout_reward_func/mean": 0.5525480508804321, "rewards/rollout_reward_func/std": 0.32639655470848083, "sampling/importance_sampling_ratio/max": 0.9678473472595215, "sampling/importance_sampling_ratio/mean": 0.8858781456947327, "sampling/importance_sampling_ratio/min": 0.047336697578430176, "sampling/sampling_logp_difference/max": 2.6366028785705566, "sampling/sampling_logp_difference/mean": 0.044398751109838486, "step": 173, "step_time": 4.2432437379829935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4328473135828972, "epoch": 0.00174, "grad_norm": 0.07855504006147385, "kl": 0.941165916621685, "learning_rate": 9.999991186446498e-06, "loss": 0.0051, "step": 174, "step_time": 2.462651938003546 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 172.03125, "completions/mean_terminated_length": 172.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4164646677672863, "epoch": 0.00175, "frac_reward_zero_std": 0.75, "grad_norm": 0.04490351676940918, "kl": 0.640511654317379, "learning_rate": 9.99999105825109e-06, "loss": 0.0329, "num_tokens": 1741139.0, "reward": 0.3309471011161804, "reward_std": 0.028868885710835457, "rewards/rollout_reward_func/mean": 0.3309471011161804, "rewards/rollout_reward_func/std": 0.1906709522008896, "sampling/importance_sampling_ratio/max": 0.9785029292106628, "sampling/importance_sampling_ratio/mean": 0.8790173530578613, "sampling/importance_sampling_ratio/min": 0.046762894839048386, "sampling/sampling_logp_difference/max": 2.483062505722046, "sampling/sampling_logp_difference/mean": 0.03952150046825409, "step": 175, "step_time": 4.56749034600216 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.4253019914031029, "epoch": 0.00176, "grad_norm": 0.05878414958715439, "kl": 0.6470754891633987, "learning_rate": 9.999990929130086e-06, "loss": 0.033, "step": 176, "step_time": 2.476150262002193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 230.6875, "completions/mean_terminated_length": 230.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1465958543121815, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.6719321012496948, "kl": 0.7472339645028114, "learning_rate": 9.999990799083483e-06, "loss": -0.0206, "num_tokens": 1761737.0, "reward": 0.690561056137085, "reward_std": 0.09158394485712051, "rewards/rollout_reward_func/mean": 0.690561056137085, "rewards/rollout_reward_func/std": 0.3917922079563141, "sampling/importance_sampling_ratio/max": 0.961934506893158, "sampling/importance_sampling_ratio/mean": 0.7783572673797607, "sampling/importance_sampling_ratio/min": 1.701696377165461e-19, "sampling/sampling_logp_difference/max": 3.8246779441833496, "sampling/sampling_logp_difference/mean": 0.3572084307670593, "step": 177, "step_time": 4.845252660008555 }, { "clip_ratio/high_max": 0.026785715483129025, "clip_ratio/high_mean": 0.013392857741564512, "clip_ratio/low_mean": 0.012276785913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02566964365541935, "entropy": 1.1816947385668755, "epoch": 0.00178, "grad_norm": 0.13917289674282074, "kl": 0.7392969503998756, "learning_rate": 9.999990668111284e-06, "loss": -0.0233, "step": 178, "step_time": 2.0470369459872018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 212.96875, "completions/mean_terminated_length": 212.96875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5440964736044407, "epoch": 0.00179, "frac_reward_zero_std": 0.75, "grad_norm": 0.620272696018219, "kl": 0.5418501496315002, "learning_rate": 9.999990536213489e-06, "loss": 0.0142, "num_tokens": 1782392.0, "reward": 0.6598029136657715, "reward_std": 0.04214044660329819, "rewards/rollout_reward_func/mean": 0.6598029136657715, "rewards/rollout_reward_func/std": 0.4251657724380493, "sampling/importance_sampling_ratio/max": 0.9823781251907349, "sampling/importance_sampling_ratio/mean": 0.8468610644340515, "sampling/importance_sampling_ratio/min": 0.03340761736035347, "sampling/sampling_logp_difference/max": 2.5198962688446045, "sampling/sampling_logp_difference/mean": 0.04339824616909027, "step": 179, "step_time": 4.51067532900197 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.015625000465661287, "clip_ratio/low_mean": 0.02566964365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04129464412108064, "entropy": 0.6138604693114758, "epoch": 0.0018, "grad_norm": 0.12239273637533188, "kl": 0.5281165726482868, "learning_rate": 9.999990403390095e-06, "loss": 0.0109, "step": 180, "step_time": 2.5066333529975964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.8737248666584492, "epoch": 0.00181, "frac_reward_zero_std": 0.5, "grad_norm": 0.1874532848596573, "kl": 0.45906462147831917, "learning_rate": 9.999990269641104e-06, "loss": -0.0389, "num_tokens": 1802964.0, "reward": 0.5425707101821899, "reward_std": 0.04155208542943001, "rewards/rollout_reward_func/mean": 0.5425707101821899, "rewards/rollout_reward_func/std": 0.25123709440231323, "sampling/importance_sampling_ratio/max": 0.941186785697937, "sampling/importance_sampling_ratio/mean": 0.7883158922195435, "sampling/importance_sampling_ratio/min": 1.6417001493049949e-13, "sampling/sampling_logp_difference/max": 3.573953151702881, "sampling/sampling_logp_difference/mean": 0.16331179440021515, "step": 181, "step_time": 5.45216344401706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8984296470880508, "epoch": 0.00182, "grad_norm": 0.1454722285270691, "kl": 0.46343305706977844, "learning_rate": 9.999990134966518e-06, "loss": -0.0379, "step": 182, "step_time": 2.1098652299988316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.9619230702519417, "epoch": 0.00183, "frac_reward_zero_std": 0.5, "grad_norm": 0.21483922004699707, "kl": 0.4928646832704544, "learning_rate": 9.999989999366333e-06, "loss": -0.0217, "num_tokens": 1822812.0, "reward": 0.7738447189331055, "reward_std": 0.051015011966228485, "rewards/rollout_reward_func/mean": 0.7738447189331055, "rewards/rollout_reward_func/std": 0.27267226576805115, "sampling/importance_sampling_ratio/max": 0.9608039855957031, "sampling/importance_sampling_ratio/mean": 0.7767164707183838, "sampling/importance_sampling_ratio/min": 2.898047843098283e-15, "sampling/sampling_logp_difference/max": 3.8185441493988037, "sampling/sampling_logp_difference/mean": 0.1908988356590271, "step": 183, "step_time": 4.902608147000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9774146787822247, "epoch": 0.00184, "grad_norm": 0.2041073590517044, "kl": 0.49369123578071594, "learning_rate": 9.999989862840553e-06, "loss": -0.0216, "step": 184, "step_time": 2.0963400850159815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6218284331262112, "epoch": 0.00185, "frac_reward_zero_std": 0.5, "grad_norm": 0.15096430480480194, "kl": 0.5023536942899227, "learning_rate": 9.999989725389174e-06, "loss": 0.0515, "num_tokens": 1843504.0, "reward": 0.3414423167705536, "reward_std": 0.058472298085689545, "rewards/rollout_reward_func/mean": 0.3414423167705536, "rewards/rollout_reward_func/std": 0.33901646733283997, "sampling/importance_sampling_ratio/max": 0.9175445437431335, "sampling/importance_sampling_ratio/mean": 0.801535427570343, "sampling/importance_sampling_ratio/min": 0.09411811828613281, "sampling/sampling_logp_difference/max": 1.963516116142273, "sampling/sampling_logp_difference/mean": 0.049242258071899414, "step": 185, "step_time": 4.55286692998925 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 0.662644162774086, "epoch": 0.00186, "grad_norm": 0.1456974744796753, "kl": 0.4865363836288452, "learning_rate": 9.9999895870122e-06, "loss": 0.0508, "step": 186, "step_time": 2.5494712850122596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.696576252579689, "epoch": 0.00187, "frac_reward_zero_std": 0.75, "grad_norm": 0.03706799075007439, "kl": 0.4627971835434437, "learning_rate": 9.999989447709628e-06, "loss": -0.0216, "num_tokens": 1863676.0, "reward": 0.5216057300567627, "reward_std": 0.0088388342410326, "rewards/rollout_reward_func/mean": 0.5216057300567627, "rewards/rollout_reward_func/std": 0.5043528079986572, "sampling/importance_sampling_ratio/max": 0.9166255593299866, "sampling/importance_sampling_ratio/mean": 0.8204658031463623, "sampling/importance_sampling_ratio/min": 9.367532038595527e-05, "sampling/sampling_logp_difference/max": 3.836186408996582, "sampling/sampling_logp_difference/mean": 0.07404587417840958, "step": 187, "step_time": 4.871653264002816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7222027480602264, "epoch": 0.00188, "grad_norm": 0.03886116296052933, "kl": 0.4607066288590431, "learning_rate": 9.99998930748146e-06, "loss": -0.0216, "step": 188, "step_time": 2.025939131999621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 176.28125, "completions/mean_terminated_length": 176.28125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 1.4271502643823624, "epoch": 0.00189, "frac_reward_zero_std": 0.25, "grad_norm": 0.07378535717725754, "kl": 0.6327951662242413, "learning_rate": 9.999989166327695e-06, "loss": 0.0327, "num_tokens": 1883157.0, "reward": 0.056216344237327576, "reward_std": 0.11831381916999817, "rewards/rollout_reward_func/mean": 0.056216344237327576, "rewards/rollout_reward_func/std": 0.5764862298965454, "sampling/importance_sampling_ratio/max": 0.8893072009086609, "sampling/importance_sampling_ratio/mean": 0.7047967910766602, "sampling/importance_sampling_ratio/min": 4.113783839595164e-18, "sampling/sampling_logp_difference/max": 3.0606486797332764, "sampling/sampling_logp_difference/mean": 0.30594784021377563, "step": 189, "step_time": 4.376190965012938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4345425069332123, "epoch": 0.0019, "grad_norm": 0.07486236095428467, "kl": 0.61868517100811, "learning_rate": 9.999989024248333e-06, "loss": 0.0326, "step": 190, "step_time": 2.0684155609997106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 206.65625, "completions/mean_terminated_length": 206.65625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 1.2281187325716019, "epoch": 0.00191, "frac_reward_zero_std": 0.25, "grad_norm": 0.14240151643753052, "kl": 0.5368041098117828, "learning_rate": 9.999988881243376e-06, "loss": -0.061, "num_tokens": 1903994.0, "reward": 0.5552479028701782, "reward_std": 0.07191074639558792, "rewards/rollout_reward_func/mean": 0.5552479028701782, "rewards/rollout_reward_func/std": 0.22044754028320312, "sampling/importance_sampling_ratio/max": 0.9010228514671326, "sampling/importance_sampling_ratio/mean": 0.7028883695602417, "sampling/importance_sampling_ratio/min": 3.1412704788551906e-21, "sampling/sampling_logp_difference/max": 3.948270797729492, "sampling/sampling_logp_difference/mean": 0.2789801061153412, "step": 191, "step_time": 5.378220628001145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2372251823544502, "epoch": 0.00192, "grad_norm": 0.1435721516609192, "kl": 0.5341755002737045, "learning_rate": 9.99998873731282e-06, "loss": -0.0612, "step": 192, "step_time": 2.1466313879936934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.9439372718334198, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 0.8069420456886292, "kl": 0.4052179716527462, "learning_rate": 9.99998859245667e-06, "loss": -0.0447, "num_tokens": 1927006.0, "reward": 1.1958086490631104, "reward_std": 0.09973421692848206, "rewards/rollout_reward_func/mean": 1.1958086490631104, "rewards/rollout_reward_func/std": 0.1581934690475464, "sampling/importance_sampling_ratio/max": 0.8715280294418335, "sampling/importance_sampling_ratio/mean": 0.672217845916748, "sampling/importance_sampling_ratio/min": 0.05949447676539421, "sampling/sampling_logp_difference/max": 2.257462501525879, "sampling/sampling_logp_difference/mean": 0.06661685556173325, "step": 193, "step_time": 5.333767142008583 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.9038095846772194, "epoch": 0.00194, "grad_norm": 0.6182759404182434, "kl": 0.4086563438177109, "learning_rate": 9.999988446674922e-06, "loss": -0.0495, "step": 194, "step_time": 2.0754467180013307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 240.5625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.8006806299090385, "epoch": 0.00195, "frac_reward_zero_std": 0.25, "grad_norm": 0.18158534169197083, "kl": 0.44362054020166397, "learning_rate": 9.999988299967575e-06, "loss": -0.0666, "num_tokens": 1948976.0, "reward": 0.837415874004364, "reward_std": 0.0479348748922348, "rewards/rollout_reward_func/mean": 0.837415874004364, "rewards/rollout_reward_func/std": 0.41134268045425415, "sampling/importance_sampling_ratio/max": 0.9396589398384094, "sampling/importance_sampling_ratio/mean": 0.7288820743560791, "sampling/importance_sampling_ratio/min": 0.027741456404328346, "sampling/sampling_logp_difference/max": 2.4265427589416504, "sampling/sampling_logp_difference/mean": 0.06979021430015564, "step": 195, "step_time": 4.6889400579893845 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.7372021153569221, "epoch": 0.00196, "grad_norm": 0.21147528290748596, "kl": 0.45048097148537636, "learning_rate": 9.999988152334635e-06, "loss": -0.0669, "step": 196, "step_time": 2.100951188003819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.7904528565704823, "epoch": 0.00197, "frac_reward_zero_std": 0.25, "grad_norm": 0.2016495019197464, "kl": 0.4971098080277443, "learning_rate": 9.999988003776098e-06, "loss": -0.061, "num_tokens": 1969544.0, "reward": 0.5095047950744629, "reward_std": 0.1403839886188507, "rewards/rollout_reward_func/mean": 0.5095047950744629, "rewards/rollout_reward_func/std": 0.46805068850517273, "sampling/importance_sampling_ratio/max": 0.960533857345581, "sampling/importance_sampling_ratio/mean": 0.7625528573989868, "sampling/importance_sampling_ratio/min": 0.013144024647772312, "sampling/sampling_logp_difference/max": 2.282623767852783, "sampling/sampling_logp_difference/mean": 0.07993334531784058, "step": 197, "step_time": 5.20812436799315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7320220209658146, "epoch": 0.00198, "grad_norm": 0.19221213459968567, "kl": 0.5141875259578228, "learning_rate": 9.999987854291966e-06, "loss": -0.0617, "step": 198, "step_time": 2.50564483099879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.7295905016362667, "epoch": 0.00199, "frac_reward_zero_std": 0.25, "grad_norm": 0.13809426128864288, "kl": 0.5199013240635395, "learning_rate": 9.999987703882235e-06, "loss": -0.0757, "num_tokens": 1990312.0, "reward": 0.6920274496078491, "reward_std": 0.1382773220539093, "rewards/rollout_reward_func/mean": 0.6920274496078491, "rewards/rollout_reward_func/std": 0.3540177643299103, "sampling/importance_sampling_ratio/max": 0.9487156867980957, "sampling/importance_sampling_ratio/mean": 0.8113619685173035, "sampling/importance_sampling_ratio/min": 0.002604670822620392, "sampling/sampling_logp_difference/max": 2.397345781326294, "sampling/sampling_logp_difference/mean": 0.08888885378837585, "step": 199, "step_time": 4.747259955998743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.6828344352543354, "epoch": 0.002, "grad_norm": 0.1519465148448944, "kl": 0.5439648889005184, "learning_rate": 9.999987552546909e-06, "loss": -0.0759, "step": 200, "step_time": 2.0976748219909496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.5394121408462524, "epoch": 0.00201, "frac_reward_zero_std": 0.25, "grad_norm": 0.8468708395957947, "kl": 0.5888229757547379, "learning_rate": 9.999987400285985e-06, "loss": 0.0259, "num_tokens": 2010988.0, "reward": 0.7241587042808533, "reward_std": 0.15539607405662537, "rewards/rollout_reward_func/mean": 0.7241587042808533, "rewards/rollout_reward_func/std": 0.45020973682403564, "sampling/importance_sampling_ratio/max": 1.389906644821167, "sampling/importance_sampling_ratio/mean": 0.8456813097000122, "sampling/importance_sampling_ratio/min": 0.0640234649181366, "sampling/sampling_logp_difference/max": 2.098507881164551, "sampling/sampling_logp_difference/mean": 0.06825882941484451, "step": 201, "step_time": 4.611790675997327 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.014136905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0319940485060215, "entropy": 0.4939466342329979, "epoch": 0.00202, "grad_norm": 0.1548696756362915, "kl": 0.6081962324678898, "learning_rate": 9.999987247099467e-06, "loss": 0.0234, "step": 202, "step_time": 2.0792033469988382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 189.46875, "completions/mean_terminated_length": 189.46875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.35538471676409245, "epoch": 0.00203, "frac_reward_zero_std": 0.5, "grad_norm": 0.024869926273822784, "kl": 0.6739046052098274, "learning_rate": 9.999987092987352e-06, "loss": -0.049, "num_tokens": 2030531.0, "reward": 0.6378990411758423, "reward_std": 0.21263517439365387, "rewards/rollout_reward_func/mean": 0.6378990411758423, "rewards/rollout_reward_func/std": 0.5500175356864929, "sampling/importance_sampling_ratio/max": 0.9887805581092834, "sampling/importance_sampling_ratio/mean": 0.8907865881919861, "sampling/importance_sampling_ratio/min": 0.042732369154691696, "sampling/sampling_logp_difference/max": 2.52138090133667, "sampling/sampling_logp_difference/mean": 0.03932056948542595, "step": 203, "step_time": 5.081753992002632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3391709718853235, "epoch": 0.00204, "grad_norm": 0.02155803143978119, "kl": 0.6857802011072636, "learning_rate": 9.999986937949641e-06, "loss": -0.0491, "step": 204, "step_time": 2.460397273003764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5816357713192701, "epoch": 0.00205, "frac_reward_zero_std": 0.75, "grad_norm": 0.2778152823448181, "kl": 0.6934394240379333, "learning_rate": 9.999986781986334e-06, "loss": 0.0009, "num_tokens": 2049095.0, "reward": 0.8363173007965088, "reward_std": 0.23535454273223877, "rewards/rollout_reward_func/mean": 0.8363173007965088, "rewards/rollout_reward_func/std": 0.6048397421836853, "sampling/importance_sampling_ratio/max": 0.9935559034347534, "sampling/importance_sampling_ratio/mean": 0.9206627011299133, "sampling/importance_sampling_ratio/min": 5.565502760873642e-07, "sampling/sampling_logp_difference/max": 3.9334845542907715, "sampling/sampling_logp_difference/mean": 0.11834601312875748, "step": 205, "step_time": 4.259822145984799 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5774738937616348, "epoch": 0.00206, "grad_norm": 0.1937899887561798, "kl": 0.6004196107387543, "learning_rate": 9.999986625097431e-06, "loss": 0.0003, "step": 206, "step_time": 2.0175745009910315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.22287440858781338, "epoch": 0.00207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006832257495261729, "kl": 0.5447422415018082, "learning_rate": 9.999986467282931e-06, "loss": 0.0017, "num_tokens": 2067919.0, "reward": 0.15280768275260925, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.15280768275260925, "rewards/rollout_reward_func/std": 0.08351969718933105, "sampling/importance_sampling_ratio/max": 0.982151985168457, "sampling/importance_sampling_ratio/mean": 0.9589859247207642, "sampling/importance_sampling_ratio/min": 0.9266688823699951, "sampling/sampling_logp_difference/max": 0.05188522860407829, "sampling/sampling_logp_difference/mean": 0.011395075358450413, "step": 207, "step_time": 4.300959250991582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2200380228459835, "epoch": 0.00208, "grad_norm": 0.0006528455414809287, "kl": 0.5450977832078934, "learning_rate": 9.999986308542834e-06, "loss": 0.0017, "step": 208, "step_time": 2.5050313400133746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3463075812906027, "epoch": 0.00209, "frac_reward_zero_std": 0.5, "grad_norm": 0.26966869831085205, "kl": 0.6586992777884007, "learning_rate": 9.999986148877143e-06, "loss": -0.0566, "num_tokens": 2087911.0, "reward": 0.6343509554862976, "reward_std": 0.23965206742286682, "rewards/rollout_reward_func/mean": 0.6343509554862976, "rewards/rollout_reward_func/std": 0.4458661675453186, "sampling/importance_sampling_ratio/max": 1.0238475799560547, "sampling/importance_sampling_ratio/mean": 0.906256377696991, "sampling/importance_sampling_ratio/min": 0.037747323513031006, "sampling/sampling_logp_difference/max": 2.1383323669433594, "sampling/sampling_logp_difference/mean": 0.039105020463466644, "step": 209, "step_time": 4.759565813001245 }, { "clip_ratio/high_max": 0.016741071827709675, "clip_ratio/high_mean": 0.008370535913854837, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012834821827709675, "entropy": 0.33876704797148705, "epoch": 0.0021, "grad_norm": 0.040093034505844116, "kl": 0.6278291344642639, "learning_rate": 9.999985988285857e-06, "loss": -0.0572, "step": 210, "step_time": 2.4996835830024793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5269973631948233, "epoch": 0.00211, "frac_reward_zero_std": 0.5, "grad_norm": 0.4274274408817291, "kl": 0.6032895147800446, "learning_rate": 9.999985826768975e-06, "loss": -0.0043, "num_tokens": 2108959.0, "reward": 0.41615867614746094, "reward_std": 0.0623258501291275, "rewards/rollout_reward_func/mean": 0.41615867614746094, "rewards/rollout_reward_func/std": 0.283931702375412, "sampling/importance_sampling_ratio/max": 0.982300341129303, "sampling/importance_sampling_ratio/mean": 0.8950153589248657, "sampling/importance_sampling_ratio/min": 9.998691439250251e-07, "sampling/sampling_logp_difference/max": 3.835904359817505, "sampling/sampling_logp_difference/mean": 0.09280946850776672, "step": 211, "step_time": 4.533358218999638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5256770253181458, "epoch": 0.00212, "grad_norm": 0.4630410075187683, "kl": 0.5838859863579273, "learning_rate": 9.999985664326495e-06, "loss": -0.0046, "step": 212, "step_time": 2.059260086993163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.16436471045017242, "epoch": 0.00213, "frac_reward_zero_std": 0.75, "grad_norm": 0.22562657296657562, "kl": 0.46412403881549835, "learning_rate": 9.99998550095842e-06, "loss": -0.0114, "num_tokens": 2129795.0, "reward": 1.173471212387085, "reward_std": 0.015721136704087257, "rewards/rollout_reward_func/mean": 1.173471212387085, "rewards/rollout_reward_func/std": 0.18796560168266296, "sampling/importance_sampling_ratio/max": 1.0395857095718384, "sampling/importance_sampling_ratio/mean": 0.9705314636230469, "sampling/importance_sampling_ratio/min": 0.9457858204841614, "sampling/sampling_logp_difference/max": 0.09239654242992401, "sampling/sampling_logp_difference/mean": 0.010313913226127625, "step": 213, "step_time": 4.711728218004282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15919248759746552, "epoch": 0.00214, "grad_norm": 0.13408398628234863, "kl": 0.46542900800704956, "learning_rate": 9.999985336664749e-06, "loss": -0.0118, "step": 214, "step_time": 2.545933021006931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.15608727000653744, "epoch": 0.00215, "frac_reward_zero_std": 1.0, "grad_norm": 0.000512382946908474, "kl": 0.5246022418141365, "learning_rate": 9.999985171445482e-06, "loss": 0.002, "num_tokens": 2150163.0, "reward": 0.1603846251964569, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.1603846251964569, "rewards/rollout_reward_func/std": 0.10663572698831558, "sampling/importance_sampling_ratio/max": 0.9814488887786865, "sampling/importance_sampling_ratio/mean": 0.9665554761886597, "sampling/importance_sampling_ratio/min": 0.9563882350921631, "sampling/sampling_logp_difference/max": 0.025375869125127792, "sampling/sampling_logp_difference/mean": 0.00872244592756033, "step": 215, "step_time": 4.313772187990253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15342451632022858, "epoch": 0.00216, "grad_norm": 0.000488189427414909, "kl": 0.5247949361801147, "learning_rate": 9.99998500530062e-06, "loss": 0.002, "step": 216, "step_time": 2.465021465999598 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00837053544819355, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.36836712434887886, "epoch": 0.00217, "frac_reward_zero_std": 0.5, "grad_norm": 0.16117314994335175, "kl": 0.4887763150036335, "learning_rate": 9.999984838230163e-06, "loss": 0.0203, "num_tokens": 2172143.0, "reward": 0.8686730861663818, "reward_std": 0.026431426405906677, "rewards/rollout_reward_func/mean": 0.8686730861663818, "rewards/rollout_reward_func/std": 0.2137853354215622, "sampling/importance_sampling_ratio/max": 1.1709622144699097, "sampling/importance_sampling_ratio/mean": 0.8605372905731201, "sampling/importance_sampling_ratio/min": 0.03743837773799896, "sampling/sampling_logp_difference/max": 2.587860345840454, "sampling/sampling_logp_difference/mean": 0.06172894686460495, "step": 217, "step_time": 4.749560869000561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36929964274168015, "epoch": 0.00218, "grad_norm": 0.21529018878936768, "kl": 0.49125149846076965, "learning_rate": 9.999984670234109e-06, "loss": 0.0203, "step": 218, "step_time": 2.0939969540049788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4100234657526016, "epoch": 0.00219, "frac_reward_zero_std": 0.75, "grad_norm": 0.10547854006290436, "kl": 0.9231857806444168, "learning_rate": 9.99998450131246e-06, "loss": -0.0127, "num_tokens": 2191289.0, "reward": 0.35576921701431274, "reward_std": 0.03807498514652252, "rewards/rollout_reward_func/mean": 0.35576921701431274, "rewards/rollout_reward_func/std": 0.3469662368297577, "sampling/importance_sampling_ratio/max": 0.9723277688026428, "sampling/importance_sampling_ratio/mean": 0.8960054516792297, "sampling/importance_sampling_ratio/min": 0.014519168995320797, "sampling/sampling_logp_difference/max": 1.9109420776367188, "sampling/sampling_logp_difference/mean": 0.042582470923662186, "step": 219, "step_time": 4.357866947997536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4014459066092968, "epoch": 0.0022, "grad_norm": 0.09942269325256348, "kl": 0.8695190027356148, "learning_rate": 9.999984331465216e-06, "loss": -0.0128, "step": 220, "step_time": 2.544869199002278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.30176197923719883, "epoch": 0.00221, "frac_reward_zero_std": 0.75, "grad_norm": 0.08799642324447632, "kl": 0.7797574959695339, "learning_rate": 9.999984160692378e-06, "loss": 0.0279, "num_tokens": 2211329.0, "reward": 0.256538450717926, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.256538450717926, "rewards/rollout_reward_func/std": 0.1815994381904602, "sampling/importance_sampling_ratio/max": 0.9938454627990723, "sampling/importance_sampling_ratio/mean": 0.9165062308311462, "sampling/importance_sampling_ratio/min": 0.055260252207517624, "sampling/sampling_logp_difference/max": 1.2928045988082886, "sampling/sampling_logp_difference/mean": 0.03398607671260834, "step": 221, "step_time": 4.274700904999918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29946769028902054, "epoch": 0.00222, "grad_norm": 0.08525743335485458, "kl": 0.766645148396492, "learning_rate": 9.999983988993942e-06, "loss": 0.0278, "step": 222, "step_time": 2.4755612640074105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21632562763988972, "epoch": 0.00223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009730628225952387, "kl": 0.546472929418087, "learning_rate": 9.99998381636991e-06, "loss": 0.0017, "num_tokens": 2229529.0, "reward": 0.5996153354644775, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5996153354644775, "rewards/rollout_reward_func/std": 0.42743319272994995, "sampling/importance_sampling_ratio/max": 1.1132183074951172, "sampling/importance_sampling_ratio/mean": 0.9861915707588196, "sampling/importance_sampling_ratio/min": 0.8969372510910034, "sampling/sampling_logp_difference/max": 0.14998146891593933, "sampling/sampling_logp_difference/mean": 0.017467334866523743, "step": 223, "step_time": 4.286464274999162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21307706460356712, "epoch": 0.00224, "grad_norm": 0.0010097362101078033, "kl": 0.5461630448698997, "learning_rate": 9.999983642820286e-06, "loss": 0.0017, "step": 224, "step_time": 2.0381641119893175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.22587956860661507, "epoch": 0.00225, "frac_reward_zero_std": 0.75, "grad_norm": 0.06549027562141418, "kl": 0.48881887272000313, "learning_rate": 9.999983468345063e-06, "loss": 0.0239, "num_tokens": 2251985.0, "reward": 0.47763457894325256, "reward_std": 0.006799104157835245, "rewards/rollout_reward_func/mean": 0.47763457894325256, "rewards/rollout_reward_func/std": 0.477651447057724, "sampling/importance_sampling_ratio/max": 0.9778825044631958, "sampling/importance_sampling_ratio/mean": 0.9228157997131348, "sampling/importance_sampling_ratio/min": 0.2103714793920517, "sampling/sampling_logp_difference/max": 1.4787771701812744, "sampling/sampling_logp_difference/mean": 0.01663869246840477, "step": 225, "step_time": 4.809860580011446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22949658334255219, "epoch": 0.00226, "grad_norm": 0.06667757779359818, "kl": 0.4835104010999203, "learning_rate": 9.999983292944247e-06, "loss": 0.0238, "step": 226, "step_time": 2.544962867992581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2817759495228529, "epoch": 0.00227, "frac_reward_zero_std": 0.75, "grad_norm": 0.0257228035479784, "kl": 0.6115407571196556, "learning_rate": 9.999983116617835e-06, "loss": -0.0144, "num_tokens": 2272557.0, "reward": 0.5541346073150635, "reward_std": 0.028071090579032898, "rewards/rollout_reward_func/mean": 0.5541346073150635, "rewards/rollout_reward_func/std": 0.39673134684562683, "sampling/importance_sampling_ratio/max": 1.0549193620681763, "sampling/importance_sampling_ratio/mean": 0.9115352630615234, "sampling/importance_sampling_ratio/min": 0.018261022865772247, "sampling/sampling_logp_difference/max": 2.395953416824341, "sampling/sampling_logp_difference/mean": 0.042597509920597076, "step": 227, "step_time": 5.0826678579906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2824087142944336, "epoch": 0.00228, "grad_norm": 0.02103164605796337, "kl": 0.5937375500798225, "learning_rate": 9.999982939365828e-06, "loss": -0.0144, "step": 228, "step_time": 2.0893807920001564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3320293966680765, "epoch": 0.00229, "frac_reward_zero_std": 0.5, "grad_norm": 0.12658746540546417, "kl": 0.7297212816774845, "learning_rate": 9.999982761188226e-06, "loss": 0.0008, "num_tokens": 2292191.0, "reward": 0.6984951496124268, "reward_std": 0.03558650612831116, "rewards/rollout_reward_func/mean": 0.6984951496124268, "rewards/rollout_reward_func/std": 0.37543928623199463, "sampling/importance_sampling_ratio/max": 1.1395678520202637, "sampling/importance_sampling_ratio/mean": 0.9344795942306519, "sampling/importance_sampling_ratio/min": 0.21406127512454987, "sampling/sampling_logp_difference/max": 1.1761265993118286, "sampling/sampling_logp_difference/mean": 0.02986838109791279, "step": 229, "step_time": 4.737592865982151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3312939237803221, "epoch": 0.0023, "grad_norm": 0.13152208924293518, "kl": 0.7140150889754295, "learning_rate": 9.999982582085029e-06, "loss": 0.0006, "step": 230, "step_time": 2.047354821006593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 222.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2255735881626606, "epoch": 0.00231, "frac_reward_zero_std": 0.75, "grad_norm": 0.1558540314435959, "kl": 0.48027053847908974, "learning_rate": 9.999982402056237e-06, "loss": -0.0183, "num_tokens": 2313405.0, "reward": 0.6739615797996521, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.6739615797996521, "rewards/rollout_reward_func/std": 0.41228950023651123, "sampling/importance_sampling_ratio/max": 0.9629499912261963, "sampling/importance_sampling_ratio/mean": 0.9234582185745239, "sampling/importance_sampling_ratio/min": 0.19418421387672424, "sampling/sampling_logp_difference/max": 0.9132423400878906, "sampling/sampling_logp_difference/mean": 0.017246980220079422, "step": 231, "step_time": 4.604370095003105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23303298093378544, "epoch": 0.00232, "grad_norm": 0.1407303810119629, "kl": 0.4806394465267658, "learning_rate": 9.999982221101849e-06, "loss": -0.0184, "step": 232, "step_time": 2.528648970008362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 210.59375, "completions/mean_terminated_length": 210.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5170358046889305, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 2.1360504627227783, "kl": 0.6248220205307007, "learning_rate": 9.999982039221867e-06, "loss": -0.0399, "num_tokens": 2334176.0, "reward": 0.4184807538986206, "reward_std": 0.23767954111099243, "rewards/rollout_reward_func/mean": 0.4184807538986206, "rewards/rollout_reward_func/std": 0.499830037355423, "sampling/importance_sampling_ratio/max": 1.662024974822998, "sampling/importance_sampling_ratio/mean": 0.8629190921783447, "sampling/importance_sampling_ratio/min": 0.03931980952620506, "sampling/sampling_logp_difference/max": 2.4584407806396484, "sampling/sampling_logp_difference/mean": 0.06382361054420471, "step": 233, "step_time": 5.2076552169965 }, { "clip_ratio/high_max": 0.030357143841683865, "clip_ratio/high_mean": 0.019084821455180645, "clip_ratio/low_mean": 0.04061260027810931, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.05969742266461253, "entropy": 0.5900847278535366, "epoch": 0.00234, "grad_norm": 0.42420753836631775, "kl": 0.6081907786428928, "learning_rate": 9.99998185641629e-06, "loss": -0.0464, "step": 234, "step_time": 2.0944004880002467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.21673661842942238, "epoch": 0.00235, "frac_reward_zero_std": 0.75, "grad_norm": 0.5043757557868958, "kl": 0.45044081658124924, "learning_rate": 9.999981672685119e-06, "loss": -0.0195, "num_tokens": 2353894.0, "reward": 0.6882451772689819, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.6882451772689819, "rewards/rollout_reward_func/std": 0.3798646330833435, "sampling/importance_sampling_ratio/max": 1.0315970182418823, "sampling/importance_sampling_ratio/mean": 0.9638964533805847, "sampling/importance_sampling_ratio/min": 0.5445245504379272, "sampling/sampling_logp_difference/max": 0.35381507873535156, "sampling/sampling_logp_difference/mean": 0.014755446463823318, "step": 235, "step_time": 4.5696282949938904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22274428233504295, "epoch": 0.00236, "grad_norm": 0.7129935622215271, "kl": 0.4459890387952328, "learning_rate": 9.999981488028352e-06, "loss": -0.0186, "step": 236, "step_time": 2.057414474991674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 220.6875, "completions/mean_terminated_length": 220.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40892100520431995, "epoch": 0.00237, "frac_reward_zero_std": 0.25, "grad_norm": 0.20972906053066254, "kl": 0.49186496809124947, "learning_rate": 9.99998130244599e-06, "loss": -0.0833, "num_tokens": 2375172.0, "reward": 0.7014840841293335, "reward_std": 0.027701135724782944, "rewards/rollout_reward_func/mean": 0.7014840841293335, "rewards/rollout_reward_func/std": 0.3952644169330597, "sampling/importance_sampling_ratio/max": 1.1601073741912842, "sampling/importance_sampling_ratio/mean": 0.855615496635437, "sampling/importance_sampling_ratio/min": 0.06652221083641052, "sampling/sampling_logp_difference/max": 2.303511381149292, "sampling/sampling_logp_difference/mean": 0.05305309221148491, "step": 237, "step_time": 4.817237536000903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41071304120123386, "epoch": 0.00238, "grad_norm": 0.20853091776371002, "kl": 0.4925862103700638, "learning_rate": 9.999981115938033e-06, "loss": -0.0835, "step": 238, "step_time": 2.5659214360057376 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 189.59375, "completions/mean_terminated_length": 189.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.47674281895160675, "epoch": 0.00239, "frac_reward_zero_std": 0.25, "grad_norm": 1.595680832862854, "kl": 0.5763115100562572, "learning_rate": 9.999980928504482e-06, "loss": 0.0268, "num_tokens": 2394967.0, "reward": 0.5185913443565369, "reward_std": 0.08188735693693161, "rewards/rollout_reward_func/mean": 0.5185913443565369, "rewards/rollout_reward_func/std": 0.3085022568702698, "sampling/importance_sampling_ratio/max": 0.9939354658126831, "sampling/importance_sampling_ratio/mean": 0.8466328382492065, "sampling/importance_sampling_ratio/min": 0.1085224449634552, "sampling/sampling_logp_difference/max": 2.035637378692627, "sampling/sampling_logp_difference/mean": 0.05717436969280243, "step": 239, "step_time": 5.056551490997663 }, { "clip_ratio/high_max": 0.06562500074505806, "clip_ratio/high_mean": 0.04248511930927634, "clip_ratio/low_mean": 0.04196428647264838, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0844494067132473, "entropy": 0.4026017375290394, "epoch": 0.0024, "grad_norm": 0.1733778715133667, "kl": 0.5664763562381268, "learning_rate": 9.999980740145336e-06, "loss": 0.021, "step": 240, "step_time": 2.0725459790119203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.416548827663064, "epoch": 0.00241, "frac_reward_zero_std": 0.25, "grad_norm": 0.3058980107307434, "kl": 0.48846491426229477, "learning_rate": 9.999980550860597e-06, "loss": 0.0175, "num_tokens": 2415763.0, "reward": 0.5267196893692017, "reward_std": 0.0686151310801506, "rewards/rollout_reward_func/mean": 0.5267196893692017, "rewards/rollout_reward_func/std": 0.5538945198059082, "sampling/importance_sampling_ratio/max": 1.0288485288619995, "sampling/importance_sampling_ratio/mean": 0.8725289106369019, "sampling/importance_sampling_ratio/min": 0.030778639018535614, "sampling/sampling_logp_difference/max": 2.420903444290161, "sampling/sampling_logp_difference/mean": 0.05195777863264084, "step": 241, "step_time": 4.724262934985745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.4054089691489935, "epoch": 0.00242, "grad_norm": 1.224478840827942, "kl": 0.47535523027181625, "learning_rate": 9.999980360650262e-06, "loss": 0.0199, "step": 242, "step_time": 2.0594026839899016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3158396780490875, "epoch": 0.00243, "frac_reward_zero_std": 0.5, "grad_norm": 0.5765506029129028, "kl": 0.4718017503619194, "learning_rate": 9.999980169514331e-06, "loss": 0.0012, "num_tokens": 2435119.0, "reward": 0.38682693243026733, "reward_std": 0.020397311076521873, "rewards/rollout_reward_func/mean": 0.38682693243026733, "rewards/rollout_reward_func/std": 0.46480122208595276, "sampling/importance_sampling_ratio/max": 1.0345184803009033, "sampling/importance_sampling_ratio/mean": 0.9173318147659302, "sampling/importance_sampling_ratio/min": 0.3565710186958313, "sampling/sampling_logp_difference/max": 0.6276495456695557, "sampling/sampling_logp_difference/mean": 0.02944820187985897, "step": 243, "step_time": 4.846021142002428 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 0.32716807536780834, "epoch": 0.00244, "grad_norm": 0.24067412316799164, "kl": 0.46641552075743675, "learning_rate": 9.999979977452809e-06, "loss": -0.001, "step": 244, "step_time": 2.027379906990973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 234.71875, "completions/mean_terminated_length": 234.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3380514457821846, "epoch": 0.00245, "frac_reward_zero_std": 0.75, "grad_norm": 0.19026337563991547, "kl": 0.43626438826322556, "learning_rate": 9.999979784465691e-06, "loss": -0.0226, "num_tokens": 2456166.0, "reward": 0.4783894121646881, "reward_std": 0.028053322806954384, "rewards/rollout_reward_func/mean": 0.4783894121646881, "rewards/rollout_reward_func/std": 0.30842843651771545, "sampling/importance_sampling_ratio/max": 0.924182653427124, "sampling/importance_sampling_ratio/mean": 0.8605262041091919, "sampling/importance_sampling_ratio/min": 0.042493633925914764, "sampling/sampling_logp_difference/max": 2.2973246574401855, "sampling/sampling_logp_difference/mean": 0.039000727236270905, "step": 245, "step_time": 5.12235055299243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3396170996129513, "epoch": 0.00246, "grad_norm": 0.19600346684455872, "kl": 0.43506598845124245, "learning_rate": 9.999979590552979e-06, "loss": -0.0217, "step": 246, "step_time": 2.0793030690037995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31486460752785206, "epoch": 0.00247, "frac_reward_zero_std": 0.75, "grad_norm": 0.04455508291721344, "kl": 0.5158842466771603, "learning_rate": 9.999979395714672e-06, "loss": -0.0215, "num_tokens": 2475358.0, "reward": 0.369567334651947, "reward_std": 0.12014421820640564, "rewards/rollout_reward_func/mean": 0.369567334651947, "rewards/rollout_reward_func/std": 0.4750381112098694, "sampling/importance_sampling_ratio/max": 1.0859401226043701, "sampling/importance_sampling_ratio/mean": 0.8814703822135925, "sampling/importance_sampling_ratio/min": 0.12772288918495178, "sampling/sampling_logp_difference/max": 1.7293624877929688, "sampling/sampling_logp_difference/mean": 0.036700498312711716, "step": 247, "step_time": 4.277090771996882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31320938281714916, "epoch": 0.00248, "grad_norm": 0.04253658279776573, "kl": 0.5154221691191196, "learning_rate": 9.99997919995077e-06, "loss": -0.0214, "step": 248, "step_time": 2.0111258719916805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5815050899982452, "epoch": 0.00249, "frac_reward_zero_std": 0.25, "grad_norm": 0.6283370852470398, "kl": 0.43893900886178017, "learning_rate": 9.999979003261275e-06, "loss": 0.0248, "num_tokens": 2493550.0, "reward": 0.43389421701431274, "reward_std": 0.08666396141052246, "rewards/rollout_reward_func/mean": 0.43389421701431274, "rewards/rollout_reward_func/std": 0.17113365232944489, "sampling/importance_sampling_ratio/max": 1.2856403589248657, "sampling/importance_sampling_ratio/mean": 0.8588056564331055, "sampling/importance_sampling_ratio/min": 0.16264751553535461, "sampling/sampling_logp_difference/max": 1.1924521923065186, "sampling/sampling_logp_difference/mean": 0.08787184208631516, "step": 249, "step_time": 4.721802212996408 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.5949395261704922, "epoch": 0.0025, "grad_norm": 0.47349876165390015, "kl": 0.4356229528784752, "learning_rate": 9.999978805646186e-06, "loss": 0.0241, "step": 250, "step_time": 2.0334463750041323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 228.4375, "completions/mean_terminated_length": 229.29031372070312, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.8282675594091415, "epoch": 0.00251, "frac_reward_zero_std": 0.25, "grad_norm": 0.3462763726711273, "kl": 0.4098556861281395, "learning_rate": 9.999978607105502e-06, "loss": -0.0469, "num_tokens": 2514316.0, "reward": 0.6783076524734497, "reward_std": 0.07415859401226044, "rewards/rollout_reward_func/mean": 0.6783076524734497, "rewards/rollout_reward_func/std": 0.3682922124862671, "sampling/importance_sampling_ratio/max": 1.5505878925323486, "sampling/importance_sampling_ratio/mean": 0.755170464515686, "sampling/importance_sampling_ratio/min": 4.080400070843443e-18, "sampling/sampling_logp_difference/max": 10.550016403198242, "sampling/sampling_logp_difference/mean": 0.26191994547843933, "step": 251, "step_time": 5.59938231799606 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.8227101545780897, "epoch": 0.00252, "grad_norm": 0.1326359510421753, "kl": 0.40849602594971657, "learning_rate": 9.999978407639225e-06, "loss": -0.0477, "step": 252, "step_time": 2.0589838640225935 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5004583038389683, "epoch": 0.00253, "frac_reward_zero_std": 0.75, "grad_norm": 0.2925935685634613, "kl": 0.5579732283949852, "learning_rate": 9.999978207247353e-06, "loss": -0.0225, "num_tokens": 2532856.0, "reward": 0.4429807662963867, "reward_std": 0.10512878745794296, "rewards/rollout_reward_func/mean": 0.4429807662963867, "rewards/rollout_reward_func/std": 0.3973429501056671, "sampling/importance_sampling_ratio/max": 1.7852503061294556, "sampling/importance_sampling_ratio/mean": 0.8663730025291443, "sampling/importance_sampling_ratio/min": 0.1100713312625885, "sampling/sampling_logp_difference/max": 2.2607719898223877, "sampling/sampling_logp_difference/mean": 0.0694337710738182, "step": 253, "step_time": 4.649059256997134 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.48843202739953995, "epoch": 0.00254, "grad_norm": 0.24892230331897736, "kl": 0.560722604393959, "learning_rate": 9.999978005929887e-06, "loss": -0.0229, "step": 254, "step_time": 2.0336972570003127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.46875, "completions/mean_terminated_length": 216.46875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.25983165204524994, "epoch": 0.00255, "frac_reward_zero_std": 0.75, "grad_norm": 0.2792050242424011, "kl": 0.4272737056016922, "learning_rate": 9.999977803686829e-06, "loss": -0.0039, "num_tokens": 2553455.0, "reward": 0.5702115297317505, "reward_std": 0.006799104157835245, "rewards/rollout_reward_func/mean": 0.5702115297317505, "rewards/rollout_reward_func/std": 0.4130892753601074, "sampling/importance_sampling_ratio/max": 1.254645586013794, "sampling/importance_sampling_ratio/mean": 0.9212988615036011, "sampling/importance_sampling_ratio/min": 0.8490272164344788, "sampling/sampling_logp_difference/max": 0.559842586517334, "sampling/sampling_logp_difference/mean": 0.024460500106215477, "step": 255, "step_time": 5.427092450998316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2610969878733158, "epoch": 0.00256, "grad_norm": 0.3301933705806732, "kl": 0.4242855980992317, "learning_rate": 9.999977600518175e-06, "loss": -0.0049, "step": 256, "step_time": 2.044404863998352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 231.3125, "completions/mean_terminated_length": 231.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3439766652882099, "epoch": 0.00257, "frac_reward_zero_std": 0.25, "grad_norm": 0.5980075001716614, "kl": 0.42379941046237946, "learning_rate": 9.999977396423928e-06, "loss": 0.0151, "num_tokens": 2574641.0, "reward": 0.4825913608074188, "reward_std": 0.036251604557037354, "rewards/rollout_reward_func/mean": 0.4825913608074188, "rewards/rollout_reward_func/std": 0.40530678629875183, "sampling/importance_sampling_ratio/max": 1.6445951461791992, "sampling/importance_sampling_ratio/mean": 0.8860589265823364, "sampling/importance_sampling_ratio/min": 0.7294396758079529, "sampling/sampling_logp_difference/max": 0.8011393547058105, "sampling/sampling_logp_difference/mean": 0.04896543174982071, "step": 257, "step_time": 5.430934875003004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02708333358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02708333358168602, "entropy": 0.3491726126521826, "epoch": 0.00258, "grad_norm": 0.2591632008552551, "kl": 0.420858658850193, "learning_rate": 9.999977191404087e-06, "loss": 0.013, "step": 258, "step_time": 2.0623156199944788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 245.9375, "completions/mean_terminated_length": 243.5806427001953, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.6136895027011633, "epoch": 0.00259, "frac_reward_zero_std": 0.5, "grad_norm": 0.2056535929441452, "kl": 0.4772682301700115, "learning_rate": 9.999976985458653e-06, "loss": -0.0707, "num_tokens": 2596375.0, "reward": 0.7751827239990234, "reward_std": 0.08691175282001495, "rewards/rollout_reward_func/mean": 0.7751827239990234, "rewards/rollout_reward_func/std": 0.5230498313903809, "sampling/importance_sampling_ratio/max": 1.351101279258728, "sampling/importance_sampling_ratio/mean": 0.806816577911377, "sampling/importance_sampling_ratio/min": 4.3005199455970714e-16, "sampling/sampling_logp_difference/max": 4.440119743347168, "sampling/sampling_logp_difference/mean": 0.1870102882385254, "step": 259, "step_time": 4.847037398998509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6135431621223688, "epoch": 0.0026, "grad_norm": 0.25916191935539246, "kl": 0.47572876140475273, "learning_rate": 9.999976778587625e-06, "loss": -0.0718, "step": 260, "step_time": 2.071694875987305 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 142.53125, "completions/mean_terminated_length": 142.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4058641418814659, "epoch": 0.00261, "frac_reward_zero_std": 0.5, "grad_norm": 0.25377514958381653, "kl": 0.41521595045924187, "learning_rate": 9.999976570791002e-06, "loss": 0.0078, "num_tokens": 2614152.0, "reward": 0.7902404069900513, "reward_std": 0.05159291997551918, "rewards/rollout_reward_func/mean": 0.7902404069900513, "rewards/rollout_reward_func/std": 0.34003034234046936, "sampling/importance_sampling_ratio/max": 1.4220991134643555, "sampling/importance_sampling_ratio/mean": 0.849725067615509, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5998696088790894, "sampling/sampling_logp_difference/mean": 0.06789401918649673, "step": 261, "step_time": 4.634398679008882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.407322533428669, "epoch": 0.00262, "grad_norm": 0.27427688241004944, "kl": 0.4141157381236553, "learning_rate": 9.999976362068785e-06, "loss": 0.009, "step": 262, "step_time": 2.487557079999533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 195.78125, "completions/mean_terminated_length": 195.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6929796356707811, "epoch": 0.00263, "frac_reward_zero_std": 0.25, "grad_norm": 0.826948344707489, "kl": 0.4107682816684246, "learning_rate": 9.999976152420979e-06, "loss": -0.0286, "num_tokens": 2634177.0, "reward": 0.8013172745704651, "reward_std": 0.06330564618110657, "rewards/rollout_reward_func/mean": 0.8013172745704651, "rewards/rollout_reward_func/std": 0.2952149510383606, "sampling/importance_sampling_ratio/max": 1.5318504571914673, "sampling/importance_sampling_ratio/mean": 0.8940542936325073, "sampling/importance_sampling_ratio/min": 3.0826179824895716e-13, "sampling/sampling_logp_difference/max": 2.6310372352600098, "sampling/sampling_logp_difference/mean": 0.18105831742286682, "step": 263, "step_time": 4.760345470989705 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020089286379516125, "entropy": 0.6995731331408024, "epoch": 0.00264, "grad_norm": 0.31516993045806885, "kl": 0.421981617808342, "learning_rate": 9.999975941847575e-06, "loss": -0.0325, "step": 264, "step_time": 2.0434324359885068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.03125, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 212.53125, "completions/mean_terminated_length": 212.7096710205078, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.8175917379558086, "epoch": 0.00265, "frac_reward_zero_std": 0.5, "grad_norm": 0.3697964549064636, "kl": 0.42729051411151886, "learning_rate": 9.99997573034858e-06, "loss": -0.0515, "num_tokens": 2655090.0, "reward": 0.5417773723602295, "reward_std": 0.052132897078990936, "rewards/rollout_reward_func/mean": 0.5417773723602295, "rewards/rollout_reward_func/std": 0.37549909949302673, "sampling/importance_sampling_ratio/max": 1.1495530605316162, "sampling/importance_sampling_ratio/mean": 0.8034113645553589, "sampling/importance_sampling_ratio/min": 3.193730316084051e-16, "sampling/sampling_logp_difference/max": 11.981232643127441, "sampling/sampling_logp_difference/mean": 0.2773318290710449, "step": 265, "step_time": 4.685838147997856 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.015224359463900328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0464743603952229, "entropy": 0.8681580312550068, "epoch": 0.00266, "grad_norm": 0.2551434338092804, "kl": 0.43408336117863655, "learning_rate": 9.99997551792399e-06, "loss": -0.0528, "step": 266, "step_time": 2.0731832299934467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 232.09375, "completions/mean_terminated_length": 232.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.363807400688529, "epoch": 0.00267, "frac_reward_zero_std": 0.75, "grad_norm": 0.16730216145515442, "kl": 0.45416004583239555, "learning_rate": 9.999975304573807e-06, "loss": -0.0199, "num_tokens": 2676949.0, "reward": 0.8860336542129517, "reward_std": 0.04273567348718643, "rewards/rollout_reward_func/mean": 0.8860336542129517, "rewards/rollout_reward_func/std": 0.4214213788509369, "sampling/importance_sampling_ratio/max": 1.3072501420974731, "sampling/importance_sampling_ratio/mean": 0.9095979332923889, "sampling/importance_sampling_ratio/min": 0.37367939949035645, "sampling/sampling_logp_difference/max": 0.6005685925483704, "sampling/sampling_logp_difference/mean": 0.039855509996414185, "step": 267, "step_time": 5.15784620600607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.37841809168457985, "epoch": 0.00268, "grad_norm": 0.09885615110397339, "kl": 0.46318985521793365, "learning_rate": 9.999975090298031e-06, "loss": -0.0207, "step": 268, "step_time": 2.501680972003669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 135.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3405720517039299, "epoch": 0.00269, "frac_reward_zero_std": 0.5, "grad_norm": 0.050510283559560776, "kl": 0.5144818127155304, "learning_rate": 9.999974875096663e-06, "loss": 0.0331, "num_tokens": 2694779.0, "reward": 0.7108414173126221, "reward_std": 0.05655108392238617, "rewards/rollout_reward_func/mean": 0.7108414173126221, "rewards/rollout_reward_func/std": 0.3348528742790222, "sampling/importance_sampling_ratio/max": 0.9839990139007568, "sampling/importance_sampling_ratio/mean": 0.9306102991104126, "sampling/importance_sampling_ratio/min": 0.03181327134370804, "sampling/sampling_logp_difference/max": 1.5676344633102417, "sampling/sampling_logp_difference/mean": 0.030965978279709816, "step": 269, "step_time": 4.117915871007426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3319592960178852, "epoch": 0.0027, "grad_norm": 0.048587970435619354, "kl": 0.5323914662003517, "learning_rate": 9.999974658969701e-06, "loss": 0.0333, "step": 270, "step_time": 2.046543181008019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 208.6875, "completions/mean_terminated_length": 208.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.32238749973475933, "epoch": 0.00271, "frac_reward_zero_std": 0.25, "grad_norm": 1.6710381507873535, "kl": 0.4593672715127468, "learning_rate": 9.999974441917146e-06, "loss": -0.0139, "num_tokens": 2715385.0, "reward": 0.542721152305603, "reward_std": 0.04079461470246315, "rewards/rollout_reward_func/mean": 0.542721152305603, "rewards/rollout_reward_func/std": 0.26833376288414, "sampling/importance_sampling_ratio/max": 1.823501706123352, "sampling/importance_sampling_ratio/mean": 0.9243454933166504, "sampling/importance_sampling_ratio/min": 0.19776783883571625, "sampling/sampling_logp_difference/max": 1.3054265975952148, "sampling/sampling_logp_difference/mean": 0.03372277319431305, "step": 271, "step_time": 4.563495848015009 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000102445483, "entropy": 0.38179031386971474, "epoch": 0.00272, "grad_norm": 0.07445284724235535, "kl": 0.4607417993247509, "learning_rate": 9.999974223938997e-06, "loss": -0.0169, "step": 272, "step_time": 2.5408094100057497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6761605925858021, "epoch": 0.00273, "frac_reward_zero_std": 0.5, "grad_norm": 0.27248626947402954, "kl": 0.47180017828941345, "learning_rate": 9.999974005035256e-06, "loss": -0.0064, "num_tokens": 2733605.0, "reward": 0.4457211494445801, "reward_std": 0.06637858599424362, "rewards/rollout_reward_func/mean": 0.4457211494445801, "rewards/rollout_reward_func/std": 0.1914680302143097, "sampling/importance_sampling_ratio/max": 1.1131505966186523, "sampling/importance_sampling_ratio/mean": 0.78502357006073, "sampling/importance_sampling_ratio/min": 0.04519260674715042, "sampling/sampling_logp_difference/max": 2.462200164794922, "sampling/sampling_logp_difference/mean": 0.09516891837120056, "step": 273, "step_time": 4.277697216006345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.973343588411808, "epoch": 0.00274, "grad_norm": 0.18800745904445648, "kl": 0.488353718072176, "learning_rate": 9.999973785205922e-06, "loss": -0.0075, "step": 274, "step_time": 2.4813515639980324 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5582486353814602, "epoch": 0.00275, "frac_reward_zero_std": 0.25, "grad_norm": 0.6790653467178345, "kl": 0.5446730218827724, "learning_rate": 9.999973564450996e-06, "loss": 0.0195, "num_tokens": 2751359.0, "reward": 0.44600963592529297, "reward_std": 0.054161496460437775, "rewards/rollout_reward_func/mean": 0.44600963592529297, "rewards/rollout_reward_func/std": 0.10217288881540298, "sampling/importance_sampling_ratio/max": 1.043146014213562, "sampling/importance_sampling_ratio/mean": 0.8662926554679871, "sampling/importance_sampling_ratio/min": 0.06774134933948517, "sampling/sampling_logp_difference/max": 1.340148687362671, "sampling/sampling_logp_difference/mean": 0.05763626471161842, "step": 275, "step_time": 4.129934238000715 }, { "clip_ratio/high_max": 0.078125, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.5413355957716703, "epoch": 0.00276, "grad_norm": 0.11569704860448837, "kl": 0.5494094491004944, "learning_rate": 9.999973342770475e-06, "loss": 0.0185, "step": 276, "step_time": 2.045845790002204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.8342353608459234, "epoch": 0.00277, "frac_reward_zero_std": 0.5, "grad_norm": 0.39826059341430664, "kl": 0.7613859958946705, "learning_rate": 9.999973120164363e-06, "loss": -0.011, "num_tokens": 2770779.0, "reward": 0.580673098564148, "reward_std": 0.05868225544691086, "rewards/rollout_reward_func/mean": 0.580673098564148, "rewards/rollout_reward_func/std": 0.3911104202270508, "sampling/importance_sampling_ratio/max": 1.0517688989639282, "sampling/importance_sampling_ratio/mean": 0.8402824401855469, "sampling/importance_sampling_ratio/min": 1.8510474039783017e-18, "sampling/sampling_logp_difference/max": 19.27903175354004, "sampling/sampling_logp_difference/mean": 0.31456851959228516, "step": 277, "step_time": 4.560406417986087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 0.8480640146881342, "epoch": 0.00278, "grad_norm": 0.4501500129699707, "kl": 0.7514111623167992, "learning_rate": 9.999972896632658e-06, "loss": -0.0108, "step": 278, "step_time": 2.4816028159912094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 151.71875, "completions/mean_terminated_length": 151.71875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.7444561291486025, "epoch": 0.00279, "frac_reward_zero_std": 0.5, "grad_norm": 0.037740081548690796, "kl": 0.5860645584762096, "learning_rate": 9.99997267217536e-06, "loss": -0.0006, "num_tokens": 2789450.0, "reward": 0.7996442317962646, "reward_std": 0.03422653302550316, "rewards/rollout_reward_func/mean": 0.7996442317962646, "rewards/rollout_reward_func/std": 0.3055110573768616, "sampling/importance_sampling_ratio/max": 0.9637537002563477, "sampling/importance_sampling_ratio/mean": 0.875331461429596, "sampling/importance_sampling_ratio/min": 6.2658802153921135e-19, "sampling/sampling_logp_difference/max": 4.294112205505371, "sampling/sampling_logp_difference/mean": 0.256076455116272, "step": 279, "step_time": 4.241699404003157 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.746001722291112, "epoch": 0.0028, "grad_norm": 0.0571722649037838, "kl": 0.5748339295387268, "learning_rate": 9.999972446792469e-06, "loss": -0.0005, "step": 280, "step_time": 2.5059944019885734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 185.96875, "completions/mean_terminated_length": 185.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5174035020172596, "epoch": 0.00281, "frac_reward_zero_std": 0.5, "grad_norm": 0.26053664088249207, "kl": 0.5272726863622665, "learning_rate": 9.999972220483987e-06, "loss": -0.0056, "num_tokens": 2809649.0, "reward": 0.6365096569061279, "reward_std": 0.026178881525993347, "rewards/rollout_reward_func/mean": 0.6365096569061279, "rewards/rollout_reward_func/std": 0.1482645571231842, "sampling/importance_sampling_ratio/max": 1.1051816940307617, "sampling/importance_sampling_ratio/mean": 0.8551535606384277, "sampling/importance_sampling_ratio/min": 2.631990979329313e-12, "sampling/sampling_logp_difference/max": 19.39380645751953, "sampling/sampling_logp_difference/mean": 0.17944759130477905, "step": 281, "step_time": 4.367998811001598 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5367389656603336, "epoch": 0.00282, "grad_norm": 0.3079833984375, "kl": 0.5207283683121204, "learning_rate": 9.99997199324991e-06, "loss": -0.0064, "step": 282, "step_time": 2.07461932899605 }, { "clip_ratio/high_max": 0.013095238711684942, "clip_ratio/high_mean": 0.006547619355842471, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006547619355842471, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 162.3125, "completions/mean_terminated_length": 162.3125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.8107543438673019, "epoch": 0.00283, "frac_reward_zero_std": 0.25, "grad_norm": 0.34115660190582275, "kl": 0.7562456727027893, "learning_rate": 9.999971765090241e-06, "loss": -0.0263, "num_tokens": 2828299.0, "reward": 0.4357451796531677, "reward_std": 0.21095629036426544, "rewards/rollout_reward_func/mean": 0.4357451796531677, "rewards/rollout_reward_func/std": 0.33878740668296814, "sampling/importance_sampling_ratio/max": 1.275262475013733, "sampling/importance_sampling_ratio/mean": 0.7606402635574341, "sampling/importance_sampling_ratio/min": 4.741321450001926e-16, "sampling/sampling_logp_difference/max": 9.366693496704102, "sampling/sampling_logp_difference/mean": 0.2364710122346878, "step": 283, "step_time": 4.6220107010158245 }, { "clip_ratio/high_max": 0.044047621078789234, "clip_ratio/high_mean": 0.022023810539394617, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022023810539394617, "entropy": 0.7816305682063103, "epoch": 0.00284, "grad_norm": 0.13455212116241455, "kl": 0.7479385249316692, "learning_rate": 9.999971536004981e-06, "loss": -0.0259, "step": 284, "step_time": 2.5288067770015914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.921180721372366, "epoch": 0.00285, "frac_reward_zero_std": 0.0, "grad_norm": 0.6836562752723694, "kl": 0.6751976534724236, "learning_rate": 9.99997130599413e-06, "loss": -0.0686, "num_tokens": 2848192.0, "reward": 0.6781139373779297, "reward_std": 0.11429239064455032, "rewards/rollout_reward_func/mean": 0.6781139373779297, "rewards/rollout_reward_func/std": 0.34543493390083313, "sampling/importance_sampling_ratio/max": 1.051634430885315, "sampling/importance_sampling_ratio/mean": 0.8189393281936646, "sampling/importance_sampling_ratio/min": 5.8232996316052955e-16, "sampling/sampling_logp_difference/max": 2.942800521850586, "sampling/sampling_logp_difference/mean": 0.26359352469444275, "step": 285, "step_time": 4.801949104992673 }, { "clip_ratio/high_max": 0.02678571455180645, "clip_ratio/high_mean": 0.013392857275903225, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034226191230118275, "entropy": 0.9085240792483091, "epoch": 0.00286, "grad_norm": 0.1842985898256302, "kl": 0.5486458837985992, "learning_rate": 9.999971075057683e-06, "loss": -0.0715, "step": 286, "step_time": 2.56646783999895 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2511461991816759, "epoch": 0.00287, "frac_reward_zero_std": 0.75, "grad_norm": 0.2646215558052063, "kl": 0.43809688463807106, "learning_rate": 9.999970843195648e-06, "loss": 0.0129, "num_tokens": 2867938.0, "reward": 0.8159807920455933, "reward_std": 0.0492306649684906, "rewards/rollout_reward_func/mean": 0.8159807920455933, "rewards/rollout_reward_func/std": 0.4788428843021393, "sampling/importance_sampling_ratio/max": 1.0226762294769287, "sampling/importance_sampling_ratio/mean": 0.9499472975730896, "sampling/importance_sampling_ratio/min": 0.8693240284919739, "sampling/sampling_logp_difference/max": 0.4701666235923767, "sampling/sampling_logp_difference/mean": 0.01862506940960884, "step": 287, "step_time": 4.6384795470075915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25512945652008057, "epoch": 0.00288, "grad_norm": 0.33088672161102295, "kl": 0.4395143799483776, "learning_rate": 9.999970610408019e-06, "loss": 0.013, "step": 288, "step_time": 2.0801978789968416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 173.09375, "completions/mean_terminated_length": 173.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7250636648386717, "epoch": 0.00289, "frac_reward_zero_std": 0.25, "grad_norm": 0.1138448566198349, "kl": 0.5772934034466743, "learning_rate": 9.999970376694797e-06, "loss": -0.0519, "num_tokens": 2886413.0, "reward": 0.5857836604118347, "reward_std": 0.0664895549416542, "rewards/rollout_reward_func/mean": 0.5857836604118347, "rewards/rollout_reward_func/std": 0.2000821828842163, "sampling/importance_sampling_ratio/max": 1.168127179145813, "sampling/importance_sampling_ratio/mean": 0.8220936059951782, "sampling/importance_sampling_ratio/min": 0.003496652701869607, "sampling/sampling_logp_difference/max": 2.4518582820892334, "sampling/sampling_logp_difference/mean": 0.15453001856803894, "step": 289, "step_time": 4.7321555060188984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7520652450621128, "epoch": 0.0029, "grad_norm": 0.18277394771575928, "kl": 0.5908286347985268, "learning_rate": 9.999970142055984e-06, "loss": -0.0524, "step": 290, "step_time": 2.0292284619936254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2254362665116787, "epoch": 0.00291, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020851674489676952, "kl": 0.6317577138543129, "learning_rate": 9.999969906491578e-06, "loss": 0.0018, "num_tokens": 2904907.0, "reward": 0.49423080682754517, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.49423080682754517, "rewards/rollout_reward_func/std": 0.11491218954324722, "sampling/importance_sampling_ratio/max": 0.9688228368759155, "sampling/importance_sampling_ratio/mean": 0.9384762048721313, "sampling/importance_sampling_ratio/min": 0.6284907460212708, "sampling/sampling_logp_difference/max": 0.40645283460617065, "sampling/sampling_logp_difference/mean": 0.018196314573287964, "step": 291, "step_time": 4.483533421996981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2313978709280491, "epoch": 0.00292, "grad_norm": 0.0034295660443603992, "kl": 0.6291234791278839, "learning_rate": 9.99996967000158e-06, "loss": 0.0018, "step": 292, "step_time": 2.045386443998723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 185.71875, "completions/mean_terminated_length": 188.03225708007812, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.271086934953928, "epoch": 0.00293, "frac_reward_zero_std": 0.25, "grad_norm": 0.29511138796806335, "kl": 0.821888379752636, "learning_rate": 9.999969432585992e-06, "loss": -0.0692, "num_tokens": 2924418.0, "reward": 0.5313990116119385, "reward_std": 0.06820490211248398, "rewards/rollout_reward_func/mean": 0.5313990116119385, "rewards/rollout_reward_func/std": 0.19713257253170013, "sampling/importance_sampling_ratio/max": 0.9638105034828186, "sampling/importance_sampling_ratio/mean": 0.7127524018287659, "sampling/importance_sampling_ratio/min": 1.9863339776839866e-15, "sampling/sampling_logp_difference/max": 2.91776180267334, "sampling/sampling_logp_difference/mean": 0.328949511051178, "step": 293, "step_time": 4.4540282230009325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.270740620791912, "epoch": 0.00294, "grad_norm": 0.2685367166996002, "kl": 0.7572478353977203, "learning_rate": 9.99996919424481e-06, "loss": -0.0696, "step": 294, "step_time": 2.070951748995867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 158.28125, "completions/mean_terminated_length": 158.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.46821741946041584, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 0.08995012938976288, "kl": 0.561047725379467, "learning_rate": 9.999968954978038e-06, "loss": -0.0342, "num_tokens": 2942803.0, "reward": 0.7435529232025146, "reward_std": 0.07510975003242493, "rewards/rollout_reward_func/mean": 0.7435529232025146, "rewards/rollout_reward_func/std": 0.45401453971862793, "sampling/importance_sampling_ratio/max": 1.047354817390442, "sampling/importance_sampling_ratio/mean": 0.9072803258895874, "sampling/importance_sampling_ratio/min": 0.00019038439495489, "sampling/sampling_logp_difference/max": 3.321640968322754, "sampling/sampling_logp_difference/mean": 0.06708551943302155, "step": 295, "step_time": 4.9288614410106675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46371590718626976, "epoch": 0.00296, "grad_norm": 0.08828054368495941, "kl": 0.5554317571222782, "learning_rate": 9.999968714785673e-06, "loss": -0.0342, "step": 296, "step_time": 2.075323685996409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 165.65625, "completions/mean_terminated_length": 165.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.1869757827371359, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.4415089190006256, "kl": 0.7959621101617813, "learning_rate": 9.999968473667719e-06, "loss": 0.0285, "num_tokens": 2961504.0, "reward": 0.22283655405044556, "reward_std": 0.10672253370285034, "rewards/rollout_reward_func/mean": 0.22283655405044556, "rewards/rollout_reward_func/std": 0.15517398715019226, "sampling/importance_sampling_ratio/max": 0.9882184863090515, "sampling/importance_sampling_ratio/mean": 0.7369487285614014, "sampling/importance_sampling_ratio/min": 2.3298876716815187e-10, "sampling/sampling_logp_difference/max": 13.253283500671387, "sampling/sampling_logp_difference/mean": 0.24903517961502075, "step": 297, "step_time": 4.829437420994509 }, { "clip_ratio/high_max": 0.043750000186264515, "clip_ratio/high_mean": 0.021875000093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125000186264515, "entropy": 1.1134373154491186, "epoch": 0.00298, "grad_norm": 0.3738849461078644, "kl": 0.7173946239054203, "learning_rate": 9.99996823162417e-06, "loss": 0.0274, "step": 298, "step_time": 2.0318731520019355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 183.34375, "completions/mean_terminated_length": 183.34375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 1.0432641487568617, "epoch": 0.00299, "frac_reward_zero_std": 0.25, "grad_norm": 0.10452854633331299, "kl": 0.9819755703210831, "learning_rate": 9.99996798865503e-06, "loss": -0.0999, "num_tokens": 2981395.0, "reward": 0.7216874957084656, "reward_std": 0.245949387550354, "rewards/rollout_reward_func/mean": 0.7216874957084656, "rewards/rollout_reward_func/std": 0.5682616233825684, "sampling/importance_sampling_ratio/max": 0.9631291627883911, "sampling/importance_sampling_ratio/mean": 0.7514681816101074, "sampling/importance_sampling_ratio/min": 2.7358285024092766e-06, "sampling/sampling_logp_difference/max": 3.677426815032959, "sampling/sampling_logp_difference/mean": 0.17325523495674133, "step": 299, "step_time": 5.049723846983397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0406101550906897, "epoch": 0.003, "grad_norm": 0.11225631833076477, "kl": 0.9261497855186462, "learning_rate": 9.9999677447603e-06, "loss": -0.0999, "step": 300, "step_time": 2.5642366020038025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.447393324226141, "epoch": 0.00301, "frac_reward_zero_std": 0.5, "grad_norm": 1.7239680290222168, "kl": 0.5156215094029903, "learning_rate": 9.99996749993998e-06, "loss": -0.0471, "num_tokens": 3000599.0, "reward": 0.2831668257713318, "reward_std": 0.19041766226291656, "rewards/rollout_reward_func/mean": 0.2831668257713318, "rewards/rollout_reward_func/std": 0.5435440540313721, "sampling/importance_sampling_ratio/max": 2.174715518951416, "sampling/importance_sampling_ratio/mean": 1.0185121297836304, "sampling/importance_sampling_ratio/min": 0.0001766055211191997, "sampling/sampling_logp_difference/max": 2.989189863204956, "sampling/sampling_logp_difference/mean": 0.0879870057106018, "step": 301, "step_time": 4.337918107994483 }, { "clip_ratio/high_max": 0.04062500037252903, "clip_ratio/high_mean": 0.020312500186264515, "clip_ratio/low_mean": 0.043750000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06406250037252903, "entropy": 0.5288319885730743, "epoch": 0.00302, "grad_norm": 0.09480395913124084, "kl": 0.5049236603081226, "learning_rate": 9.999967254194065e-06, "loss": -0.0511, "step": 302, "step_time": 2.486441445995297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 213.3125, "completions/mean_terminated_length": 213.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5483262483030558, "epoch": 0.00303, "frac_reward_zero_std": 0.5, "grad_norm": 0.15301859378814697, "kl": 0.4967644587159157, "learning_rate": 9.999967007522561e-06, "loss": -0.0155, "num_tokens": 3020641.0, "reward": 0.9829086065292358, "reward_std": 0.07706069201231003, "rewards/rollout_reward_func/mean": 0.9829086065292358, "rewards/rollout_reward_func/std": 0.31047219038009644, "sampling/importance_sampling_ratio/max": 1.10426664352417, "sampling/importance_sampling_ratio/mean": 0.8763829469680786, "sampling/importance_sampling_ratio/min": 0.006093596573919058, "sampling/sampling_logp_difference/max": 2.2236812114715576, "sampling/sampling_logp_difference/mean": 0.06348315626382828, "step": 303, "step_time": 4.637195577975945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5428981240838766, "epoch": 0.00304, "grad_norm": 0.17870526015758514, "kl": 0.4945276714861393, "learning_rate": 9.999966759925464e-06, "loss": -0.015, "step": 304, "step_time": 2.055511729013233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3490499462932348, "epoch": 0.00305, "frac_reward_zero_std": 0.5, "grad_norm": 0.10995347797870636, "kl": 0.5246476195752621, "learning_rate": 9.999966511402779e-06, "loss": -0.0193, "num_tokens": 3040113.0, "reward": 0.5272692441940308, "reward_std": 0.06435341387987137, "rewards/rollout_reward_func/mean": 0.5272692441940308, "rewards/rollout_reward_func/std": 0.23034214973449707, "sampling/importance_sampling_ratio/max": 0.9936104416847229, "sampling/importance_sampling_ratio/mean": 0.9123176336288452, "sampling/importance_sampling_ratio/min": 0.13091923296451569, "sampling/sampling_logp_difference/max": 1.888425350189209, "sampling/sampling_logp_difference/mean": 0.030056817457079887, "step": 305, "step_time": 4.366126685999916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.3541003167629242, "epoch": 0.00306, "grad_norm": 0.12976601719856262, "kl": 0.5311304591596127, "learning_rate": 9.9999662619545e-06, "loss": -0.0196, "step": 306, "step_time": 2.5589457579990267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 223.84375, "completions/mean_terminated_length": 223.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.41247621923685074, "epoch": 0.00307, "frac_reward_zero_std": 0.5, "grad_norm": 0.03218119591474533, "kl": 0.5627606213092804, "learning_rate": 9.999966011580632e-06, "loss": -0.0705, "num_tokens": 3061548.0, "reward": 0.6788591742515564, "reward_std": 0.019109565764665604, "rewards/rollout_reward_func/mean": 0.6788591742515564, "rewards/rollout_reward_func/std": 0.3649403750896454, "sampling/importance_sampling_ratio/max": 0.9788950085639954, "sampling/importance_sampling_ratio/mean": 0.8636770248413086, "sampling/importance_sampling_ratio/min": 0.002835402265191078, "sampling/sampling_logp_difference/max": 2.9959559440612793, "sampling/sampling_logp_difference/mean": 0.06911805272102356, "step": 307, "step_time": 4.992148479992466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 0.4295904729515314, "epoch": 0.00308, "grad_norm": 0.037353891879320145, "kl": 0.579680573195219, "learning_rate": 9.999965760281171e-06, "loss": -0.0705, "step": 308, "step_time": 2.534841860004235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8881994541734457, "epoch": 0.00309, "frac_reward_zero_std": 0.25, "grad_norm": 2.2027080059051514, "kl": 0.6779132038354874, "learning_rate": 9.999965508056122e-06, "loss": -0.0349, "num_tokens": 3080064.0, "reward": 0.5567307472229004, "reward_std": 0.2842753529548645, "rewards/rollout_reward_func/mean": 0.5567307472229004, "rewards/rollout_reward_func/std": 0.5661587715148926, "sampling/importance_sampling_ratio/max": 1.426544427871704, "sampling/importance_sampling_ratio/mean": 0.8783824443817139, "sampling/importance_sampling_ratio/min": 6.159038716183193e-15, "sampling/sampling_logp_difference/max": 3.8378889560699463, "sampling/sampling_logp_difference/mean": 0.24204565584659576, "step": 309, "step_time": 4.447675675997743 }, { "clip_ratio/high_max": 0.07187500037252903, "clip_ratio/high_mean": 0.035937500186264515, "clip_ratio/low_mean": 0.023958334233611822, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05989583395421505, "entropy": 0.9479062017053366, "epoch": 0.0031, "grad_norm": 0.2372370809316635, "kl": 0.7001015357673168, "learning_rate": 9.999965254905479e-06, "loss": -0.0387, "step": 310, "step_time": 2.078139224991901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 210.34375, "completions/mean_terminated_length": 210.34375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.8643033914268017, "epoch": 0.00311, "frac_reward_zero_std": 0.25, "grad_norm": 0.10401681810617447, "kl": 0.9439365416765213, "learning_rate": 9.999965000829247e-06, "loss": -0.0292, "num_tokens": 3100499.0, "reward": 0.34319227933883667, "reward_std": 0.19859202206134796, "rewards/rollout_reward_func/mean": 0.34319227933883667, "rewards/rollout_reward_func/std": 0.29370415210723877, "sampling/importance_sampling_ratio/max": 1.2907549142837524, "sampling/importance_sampling_ratio/mean": 0.86146080493927, "sampling/importance_sampling_ratio/min": 5.577447882387787e-05, "sampling/sampling_logp_difference/max": 2.676870822906494, "sampling/sampling_logp_difference/mean": 0.14679954946041107, "step": 311, "step_time": 4.599120039005356 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010714286006987095, "entropy": 0.8713584654033184, "epoch": 0.00312, "grad_norm": 0.07656508684158325, "kl": 0.9218144565820694, "learning_rate": 9.999964745827424e-06, "loss": -0.0295, "step": 312, "step_time": 2.5394176010086085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 133.46875, "completions/mean_terminated_length": 133.46875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.6418043170124292, "epoch": 0.00313, "frac_reward_zero_std": 0.25, "grad_norm": 0.32083913683891296, "kl": 0.5259632766246796, "learning_rate": 9.99996448990001e-06, "loss": -0.056, "num_tokens": 3118690.0, "reward": 0.6322596073150635, "reward_std": 0.06004173308610916, "rewards/rollout_reward_func/mean": 0.6322596073150635, "rewards/rollout_reward_func/std": 0.39438578486442566, "sampling/importance_sampling_ratio/max": 0.9804874062538147, "sampling/importance_sampling_ratio/mean": 0.824046790599823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.732093334197998, "sampling/sampling_logp_difference/mean": 0.11564968526363373, "step": 313, "step_time": 4.1921450220179395 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.01875000074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02656250074505806, "entropy": 0.6614012829959393, "epoch": 0.00314, "grad_norm": 0.12119797617197037, "kl": 0.5215758010745049, "learning_rate": 9.999964233047006e-06, "loss": -0.0561, "step": 314, "step_time": 2.495642874004261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 203.6875, "completions/mean_terminated_length": 200.6774139404297, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.8859888352453709, "epoch": 0.00315, "frac_reward_zero_std": 0.5, "grad_norm": 0.38910460472106934, "kl": 0.590745959430933, "learning_rate": 9.999963975268412e-06, "loss": -0.0473, "num_tokens": 3139072.0, "reward": 0.5579759478569031, "reward_std": 0.07015484571456909, "rewards/rollout_reward_func/mean": 0.5579759478569031, "rewards/rollout_reward_func/std": 0.30041149258613586, "sampling/importance_sampling_ratio/max": 1.9535542726516724, "sampling/importance_sampling_ratio/mean": 0.8894011378288269, "sampling/importance_sampling_ratio/min": 1.876365604402963e-05, "sampling/sampling_logp_difference/max": 2.8572916984558105, "sampling/sampling_logp_difference/mean": 0.15551593899726868, "step": 315, "step_time": 4.6383457480114885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.9458200316876173, "epoch": 0.00316, "grad_norm": 0.13051944971084595, "kl": 0.607596293091774, "learning_rate": 9.999963716564226e-06, "loss": -0.0479, "step": 316, "step_time": 2.058922377000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6224020086228848, "epoch": 0.00317, "frac_reward_zero_std": 0.25, "grad_norm": 0.2963046431541443, "kl": 0.705622959882021, "learning_rate": 9.99996345693445e-06, "loss": -0.0033, "num_tokens": 3157868.0, "reward": 0.7991764545440674, "reward_std": 0.05222113057971001, "rewards/rollout_reward_func/mean": 0.7991764545440674, "rewards/rollout_reward_func/std": 0.27937057614326477, "sampling/importance_sampling_ratio/max": 1.367920994758606, "sampling/importance_sampling_ratio/mean": 0.909225583076477, "sampling/importance_sampling_ratio/min": 2.7542094471755263e-07, "sampling/sampling_logp_difference/max": 2.8838813304901123, "sampling/sampling_logp_difference/mean": 0.11150113493204117, "step": 317, "step_time": 5.019367985005374 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.6148369871079922, "epoch": 0.00318, "grad_norm": 0.38688910007476807, "kl": 0.6724494323134422, "learning_rate": 9.999963196379084e-06, "loss": -0.0047, "step": 318, "step_time": 2.076730228989618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5099157132208347, "epoch": 0.00319, "frac_reward_zero_std": 0.25, "grad_norm": 0.13512331247329712, "kl": 0.5290064178407192, "learning_rate": 9.999962934898128e-06, "loss": -0.0713, "num_tokens": 3177957.0, "reward": 0.7484807968139648, "reward_std": 0.06554335355758667, "rewards/rollout_reward_func/mean": 0.7484807968139648, "rewards/rollout_reward_func/std": 0.35086867213249207, "sampling/importance_sampling_ratio/max": 1.6543822288513184, "sampling/importance_sampling_ratio/mean": 0.9153742790222168, "sampling/importance_sampling_ratio/min": 0.0006425418541766703, "sampling/sampling_logp_difference/max": 2.4923923015594482, "sampling/sampling_logp_difference/mean": 0.09407559037208557, "step": 319, "step_time": 4.541334409994306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49186512641608715, "epoch": 0.0032, "grad_norm": 0.2242000848054886, "kl": 0.5003214552998543, "learning_rate": 9.999962672491582e-06, "loss": -0.0708, "step": 320, "step_time": 2.6074250760138966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 213.78125, "completions/mean_terminated_length": 213.78125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5585581958293915, "epoch": 0.00321, "frac_reward_zero_std": 0.25, "grad_norm": 0.05965598300099373, "kl": 0.5568382702767849, "learning_rate": 9.999962409159445e-06, "loss": -0.0275, "num_tokens": 3198694.0, "reward": 0.6956778764724731, "reward_std": 0.05394408851861954, "rewards/rollout_reward_func/mean": 0.6956778764724731, "rewards/rollout_reward_func/std": 0.37008988857269287, "sampling/importance_sampling_ratio/max": 1.05362069606781, "sampling/importance_sampling_ratio/mean": 0.8999157547950745, "sampling/importance_sampling_ratio/min": 0.0032290583476424217, "sampling/sampling_logp_difference/max": 2.304732322692871, "sampling/sampling_logp_difference/mean": 0.07258275896310806, "step": 321, "step_time": 4.620273998014454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5568768493831158, "epoch": 0.00322, "grad_norm": 0.07779958099126816, "kl": 0.5508229918777943, "learning_rate": 9.999962144901718e-06, "loss": -0.0273, "step": 322, "step_time": 2.07340973400278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2591012492775917, "epoch": 0.00323, "frac_reward_zero_std": 1.0, "grad_norm": 0.013209616765379906, "kl": 0.51106371358037, "learning_rate": 9.999961879718401e-06, "loss": 0.0014, "num_tokens": 3216598.0, "reward": 0.8220384120941162, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8220384120941162, "rewards/rollout_reward_func/std": 0.1978919357061386, "sampling/importance_sampling_ratio/max": 0.9929260015487671, "sampling/importance_sampling_ratio/mean": 0.9528557062149048, "sampling/importance_sampling_ratio/min": 0.030604638159275055, "sampling/sampling_logp_difference/max": 2.6812567710876465, "sampling/sampling_logp_difference/mean": 0.0276450514793396, "step": 323, "step_time": 4.602136318018893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2568465918302536, "epoch": 0.00324, "grad_norm": 0.012525921687483788, "kl": 0.5044820122420788, "learning_rate": 9.999961613609494e-06, "loss": 0.0014, "step": 324, "step_time": 2.0278415689899703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8911610022187233, "epoch": 0.00325, "frac_reward_zero_std": 0.25, "grad_norm": 0.06559976190328598, "kl": 0.5807005241513252, "learning_rate": 9.999961346574998e-06, "loss": -0.0831, "num_tokens": 3235742.0, "reward": 0.9141154289245605, "reward_std": 0.10785907506942749, "rewards/rollout_reward_func/mean": 0.9141154289245605, "rewards/rollout_reward_func/std": 0.3909735381603241, "sampling/importance_sampling_ratio/max": 1.0070236921310425, "sampling/importance_sampling_ratio/mean": 0.8589205145835876, "sampling/importance_sampling_ratio/min": 1.7871952197314367e-28, "sampling/sampling_logp_difference/max": 3.3258047103881836, "sampling/sampling_logp_difference/mean": 0.35477328300476074, "step": 325, "step_time": 5.397279982993496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.8928408790379763, "epoch": 0.00326, "grad_norm": 0.05676126480102539, "kl": 0.5867515131831169, "learning_rate": 9.999961078614912e-06, "loss": -0.0832, "step": 326, "step_time": 2.0496260349900695 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.03125, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 200.4516143798828, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.8028276581317186, "epoch": 0.00327, "frac_reward_zero_std": 0.25, "grad_norm": 1.6953325271606445, "kl": 0.6451777778565884, "learning_rate": 9.999960809729237e-06, "loss": -0.0272, "num_tokens": 3256022.0, "reward": 0.6998845934867859, "reward_std": 0.25563645362854004, "rewards/rollout_reward_func/mean": 0.6998845934867859, "rewards/rollout_reward_func/std": 0.5016269683837891, "sampling/importance_sampling_ratio/max": 1.1030216217041016, "sampling/importance_sampling_ratio/mean": 0.8657159805297852, "sampling/importance_sampling_ratio/min": 5.510803236979174e-16, "sampling/sampling_logp_difference/max": 2.8652448654174805, "sampling/sampling_logp_difference/mean": 0.2476574331521988, "step": 327, "step_time": 4.719894776986621 }, { "clip_ratio/high_max": 0.09151785913854837, "clip_ratio/high_mean": 0.04575892956927419, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05357142956927419, "entropy": 0.729648208245635, "epoch": 0.00328, "grad_norm": 0.3753415048122406, "kl": 0.6856517419219017, "learning_rate": 9.99996053991797e-06, "loss": -0.0309, "step": 328, "step_time": 2.0480044490032014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3979774434119463, "epoch": 0.00329, "frac_reward_zero_std": 0.75, "grad_norm": 0.022655155509710312, "kl": 0.5171357020735741, "learning_rate": 9.999960269181116e-06, "loss": -0.035, "num_tokens": 3276198.0, "reward": 0.9614807963371277, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.9614807963371277, "rewards/rollout_reward_func/std": 0.3332526981830597, "sampling/importance_sampling_ratio/max": 1.0262800455093384, "sampling/importance_sampling_ratio/mean": 0.915996789932251, "sampling/importance_sampling_ratio/min": 0.003064705990254879, "sampling/sampling_logp_difference/max": 2.3134963512420654, "sampling/sampling_logp_difference/mean": 0.05797967687249184, "step": 329, "step_time": 5.2825737800158095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41598671954125166, "epoch": 0.0033, "grad_norm": 0.021610360592603683, "kl": 0.5193303376436234, "learning_rate": 9.999959997518671e-06, "loss": -0.035, "step": 330, "step_time": 2.052881868010445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 214.03125, "completions/mean_terminated_length": 211.258056640625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.8778189476579428, "epoch": 0.00331, "frac_reward_zero_std": 0.25, "grad_norm": 0.12344569712877274, "kl": 0.7925139330327511, "learning_rate": 9.999959724930638e-06, "loss": -0.0581, "num_tokens": 3296991.0, "reward": 0.7274024486541748, "reward_std": 0.07705644518136978, "rewards/rollout_reward_func/mean": 0.7274024486541748, "rewards/rollout_reward_func/std": 0.3739074766635895, "sampling/importance_sampling_ratio/max": 0.9868254661560059, "sampling/importance_sampling_ratio/mean": 0.8244869112968445, "sampling/importance_sampling_ratio/min": 4.3596988454428914e-18, "sampling/sampling_logp_difference/max": 3.981567621231079, "sampling/sampling_logp_difference/mean": 0.25481730699539185, "step": 331, "step_time": 5.170779076004692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8646531868726015, "epoch": 0.00332, "grad_norm": 0.13950806856155396, "kl": 0.796257559210062, "learning_rate": 9.999959451417012e-06, "loss": -0.058, "step": 332, "step_time": 2.0842006180027965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 148.84375, "completions/mean_terminated_length": 148.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4742706585675478, "epoch": 0.00333, "frac_reward_zero_std": 0.75, "grad_norm": 0.017850270494818687, "kl": 0.6358435302972794, "learning_rate": 9.9999591769778e-06, "loss": -0.0334, "num_tokens": 3314970.0, "reward": 0.7359134554862976, "reward_std": 0.01924855448305607, "rewards/rollout_reward_func/mean": 0.7359134554862976, "rewards/rollout_reward_func/std": 0.3531670868396759, "sampling/importance_sampling_ratio/max": 1.049041986465454, "sampling/importance_sampling_ratio/mean": 0.9321046471595764, "sampling/importance_sampling_ratio/min": 5.136221079737879e-05, "sampling/sampling_logp_difference/max": 3.1882636547088623, "sampling/sampling_logp_difference/mean": 0.08599327504634857, "step": 333, "step_time": 4.214165080993553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.47021032497286797, "epoch": 0.00334, "grad_norm": 0.019245408475399017, "kl": 0.6508124209940434, "learning_rate": 9.999958901612997e-06, "loss": -0.0334, "step": 334, "step_time": 2.4839636410033563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 153.21875, "completions/mean_terminated_length": 153.21875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8634353969246149, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.07575427740812302, "kl": 0.7253914996981621, "learning_rate": 9.999958625322606e-06, "loss": -0.0769, "num_tokens": 3333481.0, "reward": 0.9040144681930542, "reward_std": 0.14238515496253967, "rewards/rollout_reward_func/mean": 0.9040144681930542, "rewards/rollout_reward_func/std": 0.4150696098804474, "sampling/importance_sampling_ratio/max": 1.0793277025222778, "sampling/importance_sampling_ratio/mean": 0.7951005697250366, "sampling/importance_sampling_ratio/min": 9.668282291386276e-05, "sampling/sampling_logp_difference/max": 3.731553316116333, "sampling/sampling_logp_difference/mean": 0.15598437190055847, "step": 335, "step_time": 4.178057021999848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.862995246425271, "epoch": 0.00336, "grad_norm": 0.09037654101848602, "kl": 0.7353184521198273, "learning_rate": 9.999958348106625e-06, "loss": -0.0769, "step": 336, "step_time": 2.042047597009514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 123.21875, "completions/mean_terminated_length": 123.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8346072062849998, "epoch": 0.00337, "frac_reward_zero_std": 0.5, "grad_norm": 1.7703040838241577, "kl": 0.5801205672323704, "learning_rate": 9.999958069965056e-06, "loss": -0.0564, "num_tokens": 3350808.0, "reward": 0.5209615230560303, "reward_std": 0.05043834447860718, "rewards/rollout_reward_func/mean": 0.5209615230560303, "rewards/rollout_reward_func/std": 0.1713763177394867, "sampling/importance_sampling_ratio/max": 1.4526257514953613, "sampling/importance_sampling_ratio/mean": 0.9752525091171265, "sampling/importance_sampling_ratio/min": 3.514083815603364e-11, "sampling/sampling_logp_difference/max": 4.298625946044922, "sampling/sampling_logp_difference/mean": 0.21831487119197845, "step": 337, "step_time": 4.636138853005832 }, { "clip_ratio/high_max": 0.125, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.7199696954339743, "epoch": 0.00338, "grad_norm": 0.2372298538684845, "kl": 0.6084044650197029, "learning_rate": 9.999957790897897e-06, "loss": -0.0609, "step": 338, "step_time": 2.041665654993267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 118.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 98.9375, "completions/mean_terminated_length": 98.32257843017578, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5498980116099119, "epoch": 0.00339, "frac_reward_zero_std": 0.75, "grad_norm": 0.011039630509912968, "kl": 0.5093022994697094, "learning_rate": 9.999957510905149e-06, "loss": -0.018, "num_tokens": 3367462.0, "reward": 0.7433172464370728, "reward_std": 0.026516500860452652, "rewards/rollout_reward_func/mean": 0.7433172464370728, "rewards/rollout_reward_func/std": 0.17952199280261993, "sampling/importance_sampling_ratio/max": 1.0440831184387207, "sampling/importance_sampling_ratio/mean": 0.9662851095199585, "sampling/importance_sampling_ratio/min": 1.5556263002137658e-17, "sampling/sampling_logp_difference/max": 3.7980165481567383, "sampling/sampling_logp_difference/mean": 0.2543046176433563, "step": 339, "step_time": 3.8509960630108253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5430313860997558, "epoch": 0.0034, "grad_norm": 0.01066172868013382, "kl": 0.5098210498690605, "learning_rate": 9.999957229986813e-06, "loss": -0.018, "step": 340, "step_time": 2.4632724770053755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6151407845318317, "epoch": 0.00341, "frac_reward_zero_std": 0.5, "grad_norm": 0.6807675957679749, "kl": 0.6224066503345966, "learning_rate": 9.999956948142888e-06, "loss": 0.0135, "num_tokens": 3384546.0, "reward": 0.5195192098617554, "reward_std": 0.07239300012588501, "rewards/rollout_reward_func/mean": 0.5195192098617554, "rewards/rollout_reward_func/std": 0.31135791540145874, "sampling/importance_sampling_ratio/max": 1.2159572839736938, "sampling/importance_sampling_ratio/mean": 0.859732449054718, "sampling/importance_sampling_ratio/min": 0.014235743321478367, "sampling/sampling_logp_difference/max": 2.2960586547851562, "sampling/sampling_logp_difference/mean": 0.1171109676361084, "step": 341, "step_time": 4.05364149199886 }, { "clip_ratio/high_max": 0.05625000037252903, "clip_ratio/high_mean": 0.028125000186264515, "clip_ratio/low_mean": 0.044791667722165585, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0729166679084301, "entropy": 0.618890505284071, "epoch": 0.00342, "grad_norm": 0.06882821768522263, "kl": 0.6675473935902119, "learning_rate": 9.999956665373374e-06, "loss": 0.0118, "step": 342, "step_time": 2.0298717399928137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 230.5625, "completions/mean_terminated_length": 230.5625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.31097185146063566, "epoch": 0.00343, "frac_reward_zero_std": 0.75, "grad_norm": 0.5147904753684998, "kl": 0.525695376098156, "learning_rate": 9.999956381678271e-06, "loss": -0.0306, "num_tokens": 3405676.0, "reward": 0.650120198726654, "reward_std": 0.16060680150985718, "rewards/rollout_reward_func/mean": 0.650120198726654, "rewards/rollout_reward_func/std": 0.3377905488014221, "sampling/importance_sampling_ratio/max": 1.2143200635910034, "sampling/importance_sampling_ratio/mean": 0.8395602703094482, "sampling/importance_sampling_ratio/min": 0.020316895097494125, "sampling/sampling_logp_difference/max": 2.4550564289093018, "sampling/sampling_logp_difference/mean": 0.056489743292331696, "step": 343, "step_time": 5.250592604010308 }, { "clip_ratio/high_max": 0.016741071827709675, "clip_ratio/high_mean": 0.008370535913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008370535913854837, "entropy": 0.3426098134368658, "epoch": 0.00344, "grad_norm": 0.09980545938014984, "kl": 0.5321155935525894, "learning_rate": 9.99995609705758e-06, "loss": -0.0316, "step": 344, "step_time": 2.090704230002302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12070713844150305, "epoch": 0.00345, "frac_reward_zero_std": 0.75, "grad_norm": 0.8485684990882874, "kl": 0.5049353241920471, "learning_rate": 9.999955811511302e-06, "loss": 0.0038, "num_tokens": 3423816.0, "reward": 0.7653557658195496, "reward_std": 0.017317991703748703, "rewards/rollout_reward_func/mean": 0.7653557658195496, "rewards/rollout_reward_func/std": 0.3388753831386566, "sampling/importance_sampling_ratio/max": 1.0667636394500732, "sampling/importance_sampling_ratio/mean": 0.9967477917671204, "sampling/importance_sampling_ratio/min": 0.9506382942199707, "sampling/sampling_logp_difference/max": 0.05789109319448471, "sampling/sampling_logp_difference/mean": 0.006703739054501057, "step": 345, "step_time": 4.443866628003889 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.015625000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 0.11256680265069008, "epoch": 0.00346, "grad_norm": 0.1960476189851761, "kl": 0.5092373713850975, "learning_rate": 9.999955525039433e-06, "loss": 0.001, "step": 346, "step_time": 2.5424589949980145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24622727558016777, "epoch": 0.00347, "frac_reward_zero_std": 0.75, "grad_norm": 0.008443483151495457, "kl": 0.5336944870650768, "learning_rate": 9.999955237641976e-06, "loss": -0.0267, "num_tokens": 3441540.0, "reward": 0.712740421295166, "reward_std": 0.011830446310341358, "rewards/rollout_reward_func/mean": 0.712740421295166, "rewards/rollout_reward_func/std": 0.33572664856910706, "sampling/importance_sampling_ratio/max": 1.0446609258651733, "sampling/importance_sampling_ratio/mean": 0.966600775718689, "sampling/importance_sampling_ratio/min": 0.011640057899057865, "sampling/sampling_logp_difference/max": 1.9136409759521484, "sampling/sampling_logp_difference/mean": 0.033149030059576035, "step": 347, "step_time": 4.103589223996096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2432181602343917, "epoch": 0.00348, "grad_norm": 0.008947855792939663, "kl": 0.5414164364337921, "learning_rate": 9.999954949318932e-06, "loss": -0.0267, "step": 348, "step_time": 2.027216023001529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5131063908338547, "epoch": 0.00349, "frac_reward_zero_std": 0.5, "grad_norm": 0.039223577827215195, "kl": 0.6770050153136253, "learning_rate": 9.999954660070299e-06, "loss": -0.0172, "num_tokens": 3460770.0, "reward": 0.6405225992202759, "reward_std": 0.23328132927417755, "rewards/rollout_reward_func/mean": 0.6405225992202759, "rewards/rollout_reward_func/std": 0.5262656807899475, "sampling/importance_sampling_ratio/max": 1.008788824081421, "sampling/importance_sampling_ratio/mean": 0.9274492263793945, "sampling/importance_sampling_ratio/min": 3.2054176699602976e-06, "sampling/sampling_logp_difference/max": 2.817164421081543, "sampling/sampling_logp_difference/mean": 0.08696402609348297, "step": 349, "step_time": 5.330817031994229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5065736556425691, "epoch": 0.0035, "grad_norm": 0.04048660770058632, "kl": 0.6738890931010246, "learning_rate": 9.999954369896076e-06, "loss": -0.0173, "step": 350, "step_time": 2.119944434998615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 195.9354705810547, "completions/min_length": 16.0, "completions/min_terminated_length": 104.0, "entropy": 0.7707962244749069, "epoch": 0.00351, "frac_reward_zero_std": 0.25, "grad_norm": 0.03808164969086647, "kl": 0.7819023430347443, "learning_rate": 9.999954078796268e-06, "loss": -0.0732, "num_tokens": 3480212.0, "reward": 0.5574904084205627, "reward_std": 0.22843638062477112, "rewards/rollout_reward_func/mean": 0.5574904084205627, "rewards/rollout_reward_func/std": 0.42963460087776184, "sampling/importance_sampling_ratio/max": 1.0012000799179077, "sampling/importance_sampling_ratio/mean": 0.8735616207122803, "sampling/importance_sampling_ratio/min": 1.1835490786821852e-16, "sampling/sampling_logp_difference/max": 4.3493828773498535, "sampling/sampling_logp_difference/mean": 0.214617058634758, "step": 351, "step_time": 4.571628422985668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.7672058269381523, "epoch": 0.00352, "grad_norm": 0.04879825562238693, "kl": 0.7682327292859554, "learning_rate": 9.99995378677087e-06, "loss": -0.0732, "step": 352, "step_time": 2.5542297379943193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 257.09375, "completions/mean_terminated_length": 257.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2553304545581341, "epoch": 0.00353, "frac_reward_zero_std": 0.75, "grad_norm": 0.033635519444942474, "kl": 0.5575128793716431, "learning_rate": 9.999953493819885e-06, "loss": 0.0404, "num_tokens": 3501975.0, "reward": 0.6584999561309814, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.6584999561309814, "rewards/rollout_reward_func/std": 0.25511932373046875, "sampling/importance_sampling_ratio/max": 1.0324245691299438, "sampling/importance_sampling_ratio/mean": 0.9336526989936829, "sampling/importance_sampling_ratio/min": 0.016247134655714035, "sampling/sampling_logp_difference/max": 1.7354545593261719, "sampling/sampling_logp_difference/mean": 0.027594471350312233, "step": 353, "step_time": 4.87906572400243 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.25343842525035143, "epoch": 0.00354, "grad_norm": 0.027169516310095787, "kl": 0.5494293048977852, "learning_rate": 9.999953199943314e-06, "loss": 0.0403, "step": 354, "step_time": 2.5116168479944463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.11025442369282246, "epoch": 0.00355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010303981835022569, "kl": 0.4810371734201908, "learning_rate": 9.999952905141152e-06, "loss": 0.0018, "num_tokens": 3520935.0, "reward": 0.8722307682037354, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8722307682037354, "rewards/rollout_reward_func/std": 0.4485711455345154, "sampling/importance_sampling_ratio/max": 1.1136386394500732, "sampling/importance_sampling_ratio/mean": 1.0039036273956299, "sampling/importance_sampling_ratio/min": 0.9740779995918274, "sampling/sampling_logp_difference/max": 0.11547493934631348, "sampling/sampling_logp_difference/mean": 0.0060632661916315556, "step": 355, "step_time": 4.430845836002845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11102683562785387, "epoch": 0.00356, "grad_norm": 0.001045991899445653, "kl": 0.4809154123067856, "learning_rate": 9.999952609413403e-06, "loss": 0.0018, "step": 356, "step_time": 2.0676968230036437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5138017535209656, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 3.815866470336914, "kl": 0.6501386798918247, "learning_rate": 9.999952312760068e-06, "loss": -0.0062, "num_tokens": 3539185.0, "reward": 0.46974998712539673, "reward_std": 0.17142897844314575, "rewards/rollout_reward_func/mean": 0.46974998712539673, "rewards/rollout_reward_func/std": 0.3414653241634369, "sampling/importance_sampling_ratio/max": 1.0698184967041016, "sampling/importance_sampling_ratio/mean": 0.9004001617431641, "sampling/importance_sampling_ratio/min": 0.003268908942118287, "sampling/sampling_logp_difference/max": 2.5160984992980957, "sampling/sampling_logp_difference/mean": 0.07763555645942688, "step": 357, "step_time": 4.3371874389995355 }, { "clip_ratio/high_max": 0.05267857201397419, "clip_ratio/high_mean": 0.030803571920841932, "clip_ratio/low_mean": 0.04739583423361182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07819940615445375, "entropy": 0.6025256756693125, "epoch": 0.00358, "grad_norm": 0.2857136130332947, "kl": 0.6558709256350994, "learning_rate": 9.999952015181144e-06, "loss": -0.0127, "step": 358, "step_time": 2.528990418999456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2560152290388942, "epoch": 0.00359, "frac_reward_zero_std": 0.5, "grad_norm": 0.07516457885503769, "kl": 0.7382850125432014, "learning_rate": 9.999951716676632e-06, "loss": 0.0083, "num_tokens": 3556089.0, "reward": 0.5550481081008911, "reward_std": 0.06867095828056335, "rewards/rollout_reward_func/mean": 0.5550481081008911, "rewards/rollout_reward_func/std": 0.18968544900417328, "sampling/importance_sampling_ratio/max": 0.9961239099502563, "sampling/importance_sampling_ratio/mean": 0.9328306913375854, "sampling/importance_sampling_ratio/min": 0.09169001132249832, "sampling/sampling_logp_difference/max": 2.1208853721618652, "sampling/sampling_logp_difference/mean": 0.029706979170441628, "step": 359, "step_time": 4.140768067998579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2608974501490593, "epoch": 0.0036, "grad_norm": 0.0905667096376419, "kl": 0.7147044911980629, "learning_rate": 9.999951417246534e-06, "loss": 0.0081, "step": 360, "step_time": 2.4819871630097623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 186.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7832365036010742, "epoch": 0.00361, "frac_reward_zero_std": 0.0, "grad_norm": 0.9220073223114014, "kl": 0.5425079017877579, "learning_rate": 9.999951116890847e-06, "loss": -0.0247, "num_tokens": 3574511.0, "reward": 0.5068173408508301, "reward_std": 0.095094695687294, "rewards/rollout_reward_func/mean": 0.5068173408508301, "rewards/rollout_reward_func/std": 0.24079307913780212, "sampling/importance_sampling_ratio/max": 1.18942391872406, "sampling/importance_sampling_ratio/mean": 0.8347518444061279, "sampling/importance_sampling_ratio/min": 0.004068057984113693, "sampling/sampling_logp_difference/max": 2.6722354888916016, "sampling/sampling_logp_difference/mean": 0.10537862777709961, "step": 361, "step_time": 4.516486822009028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03794642956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03794642956927419, "entropy": 0.8990448098629713, "epoch": 0.00362, "grad_norm": 0.2224353700876236, "kl": 0.7356040216982365, "learning_rate": 9.999950815609574e-06, "loss": -0.0262, "step": 362, "step_time": 2.053724115001387 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.27876582741737366, "epoch": 0.00363, "frac_reward_zero_std": 0.75, "grad_norm": 0.05453196540474892, "kl": 0.5521597266197205, "learning_rate": 9.999950513402715e-06, "loss": -0.0195, "num_tokens": 3591991.0, "reward": 0.531004786491394, "reward_std": 0.176736518740654, "rewards/rollout_reward_func/mean": 0.531004786491394, "rewards/rollout_reward_func/std": 0.34664762020111084, "sampling/importance_sampling_ratio/max": 2.0389671325683594, "sampling/importance_sampling_ratio/mean": 1.0077464580535889, "sampling/importance_sampling_ratio/min": 0.0723240002989769, "sampling/sampling_logp_difference/max": 2.560798168182373, "sampling/sampling_logp_difference/mean": 0.047558482736349106, "step": 363, "step_time": 4.5633487299928674 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.31444150768220425, "epoch": 0.00364, "grad_norm": 0.2778615951538086, "kl": 0.5499042421579361, "learning_rate": 9.999950210270267e-06, "loss": -0.0185, "step": 364, "step_time": 2.0228387029856094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5617540348321199, "epoch": 0.00365, "frac_reward_zero_std": 0.5, "grad_norm": 0.11486855894327164, "kl": 0.42306357994675636, "learning_rate": 9.999949906212232e-06, "loss": -0.039, "num_tokens": 3610703.0, "reward": 0.9095745086669922, "reward_std": 0.021655144169926643, "rewards/rollout_reward_func/mean": 0.9095745086669922, "rewards/rollout_reward_func/std": 0.1253441721200943, "sampling/importance_sampling_ratio/max": 1.6284016370773315, "sampling/importance_sampling_ratio/mean": 0.7910870909690857, "sampling/importance_sampling_ratio/min": 0.005813502706587315, "sampling/sampling_logp_difference/max": 2.560795307159424, "sampling/sampling_logp_difference/mean": 0.11469726264476776, "step": 365, "step_time": 4.367729164005141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.6443508081138134, "epoch": 0.00366, "grad_norm": 0.110932357609272, "kl": 0.4152439497411251, "learning_rate": 9.999949601228609e-06, "loss": -0.0387, "step": 366, "step_time": 2.4919644510082435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 206.78125, "completions/mean_terminated_length": 206.78125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6707478575408459, "epoch": 0.00367, "frac_reward_zero_std": 0.5, "grad_norm": 1.5042632818222046, "kl": 0.4245556928217411, "learning_rate": 9.9999492953194e-06, "loss": 0.0618, "num_tokens": 3629848.0, "reward": 1.0904231071472168, "reward_std": 0.049234941601753235, "rewards/rollout_reward_func/mean": 1.0904231071472168, "rewards/rollout_reward_func/std": 0.2267238199710846, "sampling/importance_sampling_ratio/max": 1.8018447160720825, "sampling/importance_sampling_ratio/mean": 0.933875560760498, "sampling/importance_sampling_ratio/min": 1.098951777208315e-22, "sampling/sampling_logp_difference/max": 4.033266067504883, "sampling/sampling_logp_difference/mean": 0.23473438620567322, "step": 367, "step_time": 4.752902368985815 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.011904762359336019, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029761906014755368, "entropy": 0.6064002988860011, "epoch": 0.00368, "grad_norm": 2.008445978164673, "kl": 0.432943731546402, "learning_rate": 9.999948988484605e-06, "loss": 0.0546, "step": 368, "step_time": 2.0562495429985574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 187.84375, "completions/mean_terminated_length": 187.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.7575978580862284, "epoch": 0.00369, "frac_reward_zero_std": 0.5, "grad_norm": 0.06465933471918106, "kl": 0.5948529653251171, "learning_rate": 9.999948680724223e-06, "loss": 0.0132, "num_tokens": 3648523.0, "reward": 0.7984327077865601, "reward_std": 0.05045434832572937, "rewards/rollout_reward_func/mean": 0.7984327077865601, "rewards/rollout_reward_func/std": 0.42333778738975525, "sampling/importance_sampling_ratio/max": 1.0328707695007324, "sampling/importance_sampling_ratio/mean": 0.8961896300315857, "sampling/importance_sampling_ratio/min": 2.4347472907297824e-15, "sampling/sampling_logp_difference/max": 4.099143981933594, "sampling/sampling_logp_difference/mean": 0.19884008169174194, "step": 369, "step_time": 4.955990073991416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7608414720743895, "epoch": 0.0037, "grad_norm": 0.06163695827126503, "kl": 0.580732349306345, "learning_rate": 9.999948372038253e-06, "loss": 0.013, "step": 370, "step_time": 2.0466027299844427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.553016415797174, "epoch": 0.00371, "frac_reward_zero_std": 0.75, "grad_norm": 0.02344849705696106, "kl": 0.4498259536921978, "learning_rate": 9.9999480624267e-06, "loss": -0.0199, "num_tokens": 3667059.0, "reward": 0.6490048170089722, "reward_std": 0.028256431221961975, "rewards/rollout_reward_func/mean": 0.6490048170089722, "rewards/rollout_reward_func/std": 0.17538857460021973, "sampling/importance_sampling_ratio/max": 1.7719955444335938, "sampling/importance_sampling_ratio/mean": 0.9621078968048096, "sampling/importance_sampling_ratio/min": 2.1142298051130734e-17, "sampling/sampling_logp_difference/max": 4.070335388183594, "sampling/sampling_logp_difference/mean": 0.21735413372516632, "step": 371, "step_time": 4.848866492997331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5510708503425121, "epoch": 0.00372, "grad_norm": 0.021095294505357742, "kl": 0.44665486365556717, "learning_rate": 9.999947751889557e-06, "loss": -0.0199, "step": 372, "step_time": 2.0436040299973683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.9512163233011961, "epoch": 0.00373, "frac_reward_zero_std": 0.0, "grad_norm": 0.12257733196020126, "kl": 0.5784791372716427, "learning_rate": 9.99994744042683e-06, "loss": -0.0539, "num_tokens": 3685921.0, "reward": 0.5147452354431152, "reward_std": 0.15744924545288086, "rewards/rollout_reward_func/mean": 0.5147452354431152, "rewards/rollout_reward_func/std": 0.38563525676727295, "sampling/importance_sampling_ratio/max": 1.004136323928833, "sampling/importance_sampling_ratio/mean": 0.859372615814209, "sampling/importance_sampling_ratio/min": 8.07905565364253e-12, "sampling/sampling_logp_difference/max": 3.0370328426361084, "sampling/sampling_logp_difference/mean": 0.2503660321235657, "step": 373, "step_time": 4.505817845005367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9341335631906986, "epoch": 0.00374, "grad_norm": 0.12820503115653992, "kl": 0.5893316678702831, "learning_rate": 9.999947128038514e-06, "loss": -0.0536, "step": 374, "step_time": 2.5490535140197608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 1.0636362861841917, "epoch": 0.00375, "frac_reward_zero_std": 0.25, "grad_norm": 1.0102208852767944, "kl": 0.5263872779905796, "learning_rate": 9.999946814724613e-06, "loss": -0.0604, "num_tokens": 3704481.0, "reward": 0.5758557915687561, "reward_std": 0.24167382717132568, "rewards/rollout_reward_func/mean": 0.5758557915687561, "rewards/rollout_reward_func/std": 0.42072445154190063, "sampling/importance_sampling_ratio/max": 1.0047811269760132, "sampling/importance_sampling_ratio/mean": 0.8178613185882568, "sampling/importance_sampling_ratio/min": 3.5435015564945646e-27, "sampling/sampling_logp_difference/max": 10.87964916229248, "sampling/sampling_logp_difference/mean": 0.3445640802383423, "step": 375, "step_time": 4.728223218015046 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.055803571827709675, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.06101190531626344, "entropy": 1.1944007892161608, "epoch": 0.00376, "grad_norm": 0.1672048270702362, "kl": 0.6509051248431206, "learning_rate": 9.999946500485126e-06, "loss": -0.0629, "step": 376, "step_time": 2.065167148015462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 190.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9006984382867813, "epoch": 0.00377, "frac_reward_zero_std": 0.5, "grad_norm": 0.045834265649318695, "kl": 0.5458194985985756, "learning_rate": 9.999946185320051e-06, "loss": -0.0721, "num_tokens": 3723430.0, "reward": 0.8222211599349976, "reward_std": 0.08357458561658859, "rewards/rollout_reward_func/mean": 0.8222211599349976, "rewards/rollout_reward_func/std": 0.3677361309528351, "sampling/importance_sampling_ratio/max": 1.8658509254455566, "sampling/importance_sampling_ratio/mean": 0.8458514213562012, "sampling/importance_sampling_ratio/min": 1.611516059485396e-10, "sampling/sampling_logp_difference/max": 13.152851104736328, "sampling/sampling_logp_difference/mean": 0.18990664184093475, "step": 377, "step_time": 4.967137896994245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9954017959535122, "epoch": 0.00378, "grad_norm": 0.045233968645334244, "kl": 0.5490649305284023, "learning_rate": 9.999945869229393e-06, "loss": -0.0721, "step": 378, "step_time": 2.070532584999455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.4331169500946999, "epoch": 0.00379, "frac_reward_zero_std": 0.25, "grad_norm": 0.8777669072151184, "kl": 0.6247304752469063, "learning_rate": 9.999945552213145e-06, "loss": -0.0428, "num_tokens": 3742888.0, "reward": 0.7096202373504639, "reward_std": 0.0296848826110363, "rewards/rollout_reward_func/mean": 0.7096202373504639, "rewards/rollout_reward_func/std": 0.23202887177467346, "sampling/importance_sampling_ratio/max": 1.2075401544570923, "sampling/importance_sampling_ratio/mean": 0.784309983253479, "sampling/importance_sampling_ratio/min": 6.6727625380735844e-06, "sampling/sampling_logp_difference/max": 3.370732307434082, "sampling/sampling_logp_difference/mean": 0.2948785424232483, "step": 379, "step_time": 4.574923327003489 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.03571428591385484, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.03962053591385484, "entropy": 1.485826661810279, "epoch": 0.0038, "grad_norm": 0.06049887463450432, "kl": 0.6760434582829475, "learning_rate": 9.999945234271316e-06, "loss": -0.0434, "step": 380, "step_time": 2.5389804590013227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 134.9375, "completions/mean_terminated_length": 135.61289978027344, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.289469864219427, "epoch": 0.00381, "frac_reward_zero_std": 0.5, "grad_norm": 0.0627988874912262, "kl": 0.545707855373621, "learning_rate": 9.9999449154039e-06, "loss": -0.0499, "num_tokens": 3760422.0, "reward": 0.5696201920509338, "reward_std": 0.09351402521133423, "rewards/rollout_reward_func/mean": 0.5696201920509338, "rewards/rollout_reward_func/std": 0.40014874935150146, "sampling/importance_sampling_ratio/max": 1.02996826171875, "sampling/importance_sampling_ratio/mean": 0.86138516664505, "sampling/importance_sampling_ratio/min": 2.2108946420215546e-20, "sampling/sampling_logp_difference/max": 4.1850385665893555, "sampling/sampling_logp_difference/mean": 0.3977704644203186, "step": 381, "step_time": 4.634778951993212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2827199790626764, "epoch": 0.00382, "grad_norm": 0.05931645259261131, "kl": 0.5499317012727261, "learning_rate": 9.999944595610896e-06, "loss": -0.05, "step": 382, "step_time": 2.0715083990071435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 159.78125, "completions/mean_terminated_length": 159.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3333843443542719, "epoch": 0.00383, "frac_reward_zero_std": 0.75, "grad_norm": 0.009421635419130325, "kl": 0.4870615676045418, "learning_rate": 9.999944274892308e-06, "loss": -0.0264, "num_tokens": 3778607.0, "reward": 0.5937981009483337, "reward_std": 0.029508110135793686, "rewards/rollout_reward_func/mean": 0.5937981009483337, "rewards/rollout_reward_func/std": 0.3346112072467804, "sampling/importance_sampling_ratio/max": 1.0529965162277222, "sampling/importance_sampling_ratio/mean": 0.9575717449188232, "sampling/importance_sampling_ratio/min": 0.011232119053602219, "sampling/sampling_logp_difference/max": 2.46077823638916, "sampling/sampling_logp_difference/mean": 0.035660240799188614, "step": 383, "step_time": 4.926484084993717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32705435901880264, "epoch": 0.00384, "grad_norm": 0.008639384061098099, "kl": 0.4888637661933899, "learning_rate": 9.999943953248133e-06, "loss": -0.0264, "step": 384, "step_time": 2.0324698469921714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5681317858397961, "epoch": 0.00385, "frac_reward_zero_std": 0.5, "grad_norm": 0.2977074086666107, "kl": 0.5938740111887455, "learning_rate": 9.999943630678372e-06, "loss": -0.0523, "num_tokens": 3798401.0, "reward": 0.7061827182769775, "reward_std": 0.2909940183162689, "rewards/rollout_reward_func/mean": 0.7061827182769775, "rewards/rollout_reward_func/std": 0.5523344874382019, "sampling/importance_sampling_ratio/max": 1.8443191051483154, "sampling/importance_sampling_ratio/mean": 0.9069644212722778, "sampling/importance_sampling_ratio/min": 0.00012854760279878974, "sampling/sampling_logp_difference/max": 3.0106170177459717, "sampling/sampling_logp_difference/mean": 0.0835283026099205, "step": 385, "step_time": 4.687626239006931 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.013392857275903225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02589285746216774, "entropy": 0.5618646424263716, "epoch": 0.00386, "grad_norm": 0.09820078313350677, "kl": 0.6007045097649097, "learning_rate": 9.999943307183029e-06, "loss": -0.0534, "step": 386, "step_time": 2.539936667009897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.1736626662313938, "epoch": 0.00387, "frac_reward_zero_std": 0.75, "grad_norm": 0.9311319589614868, "kl": 0.6098137721419334, "learning_rate": 9.999942982762097e-06, "loss": -0.0207, "num_tokens": 3815277.0, "reward": 0.629355788230896, "reward_std": 0.016520008444786072, "rewards/rollout_reward_func/mean": 0.629355788230896, "rewards/rollout_reward_func/std": 0.1836216300725937, "sampling/importance_sampling_ratio/max": 1.5423133373260498, "sampling/importance_sampling_ratio/mean": 0.8467633128166199, "sampling/importance_sampling_ratio/min": 1.962215806505796e-11, "sampling/sampling_logp_difference/max": 19.518917083740234, "sampling/sampling_logp_difference/mean": 0.3720387816429138, "step": 387, "step_time": 4.146617551996314 }, { "clip_ratio/high_max": 0.11875000037252903, "clip_ratio/high_mean": 0.059375000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.059375000186264515, "entropy": 1.1288152197375894, "epoch": 0.00388, "grad_norm": 0.14302507042884827, "kl": 0.6249534860253334, "learning_rate": 9.999942657415583e-06, "loss": -0.0225, "step": 388, "step_time": 2.0607044199932716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 195.9375, "completions/mean_terminated_length": 195.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.24931235425174236, "epoch": 0.00389, "frac_reward_zero_std": 0.75, "grad_norm": 0.03634660691022873, "kl": 0.6884410567581654, "learning_rate": 9.99994233114348e-06, "loss": -0.0252, "num_tokens": 3834947.0, "reward": 0.671668291091919, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.671668291091919, "rewards/rollout_reward_func/std": 0.4549766182899475, "sampling/importance_sampling_ratio/max": 1.0300079584121704, "sampling/importance_sampling_ratio/mean": 0.9471379518508911, "sampling/importance_sampling_ratio/min": 0.005591553170233965, "sampling/sampling_logp_difference/max": 2.9618613719940186, "sampling/sampling_logp_difference/mean": 0.027497274801135063, "step": 389, "step_time": 4.97514919898822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24941842071712017, "epoch": 0.0039, "grad_norm": 0.036531414836645126, "kl": 0.6903882473707199, "learning_rate": 9.999942003945793e-06, "loss": -0.0252, "step": 390, "step_time": 2.0678137010036153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4392330180853605, "epoch": 0.00391, "frac_reward_zero_std": 0.75, "grad_norm": 0.674168586730957, "kl": 0.5974964536726475, "learning_rate": 9.999941675822523e-06, "loss": 0.0187, "num_tokens": 3852767.0, "reward": 0.5781345963478088, "reward_std": 0.03871176764369011, "rewards/rollout_reward_func/mean": 0.5781345963478088, "rewards/rollout_reward_func/std": 0.22934888303279877, "sampling/importance_sampling_ratio/max": 1.4875972270965576, "sampling/importance_sampling_ratio/mean": 0.9131360054016113, "sampling/importance_sampling_ratio/min": 0.018237091600894928, "sampling/sampling_logp_difference/max": 1.8528786897659302, "sampling/sampling_logp_difference/mean": 0.07141491025686264, "step": 391, "step_time": 4.6428565780079225 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 0.41290388256311417, "epoch": 0.00392, "grad_norm": 0.4353567361831665, "kl": 0.5875406973063946, "learning_rate": 9.999941346773667e-06, "loss": 0.0174, "step": 392, "step_time": 2.0446454740012996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 198.6875, "completions/mean_terminated_length": 198.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.0590973477810621, "epoch": 0.00393, "frac_reward_zero_std": 0.25, "grad_norm": 0.10459762066602707, "kl": 0.6603377759456635, "learning_rate": 9.999941016799226e-06, "loss": -0.0808, "num_tokens": 3872605.0, "reward": 0.7875192165374756, "reward_std": 0.22496342658996582, "rewards/rollout_reward_func/mean": 0.7875192165374756, "rewards/rollout_reward_func/std": 0.5019603371620178, "sampling/importance_sampling_ratio/max": 1.3511650562286377, "sampling/importance_sampling_ratio/mean": 0.8155274391174316, "sampling/importance_sampling_ratio/min": 1.7535031759940978e-10, "sampling/sampling_logp_difference/max": 11.692972183227539, "sampling/sampling_logp_difference/mean": 0.263958215713501, "step": 393, "step_time": 4.674588949987083 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 1.0216727908700705, "epoch": 0.00394, "grad_norm": 0.08455049991607666, "kl": 0.6405898332595825, "learning_rate": 9.9999406858992e-06, "loss": -0.0811, "step": 394, "step_time": 2.097014075006882 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 195.21875, "completions/mean_terminated_length": 195.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6922989767044783, "epoch": 0.00395, "frac_reward_zero_std": 0.25, "grad_norm": 3.5401711463928223, "kl": 0.8149512968957424, "learning_rate": 9.999940354073589e-06, "loss": -0.052, "num_tokens": 3892420.0, "reward": 0.6592115163803101, "reward_std": 0.06795129179954529, "rewards/rollout_reward_func/mean": 0.6592115163803101, "rewards/rollout_reward_func/std": 0.24958638846874237, "sampling/importance_sampling_ratio/max": 2.13049054145813, "sampling/importance_sampling_ratio/mean": 0.8472733497619629, "sampling/importance_sampling_ratio/min": 0.0019303271546959877, "sampling/sampling_logp_difference/max": 2.7965087890625, "sampling/sampling_logp_difference/mean": 0.14734014868736267, "step": 395, "step_time": 5.1306422220150125 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.11227678786963224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.13831845438107848, "entropy": 0.7944676205515862, "epoch": 0.00396, "grad_norm": 0.07435604929924011, "kl": 0.8516476005315781, "learning_rate": 9.999940021322394e-06, "loss": -0.0566, "step": 396, "step_time": 2.051253033998364 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40598942525684834, "epoch": 0.00397, "frac_reward_zero_std": 0.5, "grad_norm": 1.56301748752594, "kl": 0.5129614137113094, "learning_rate": 9.999939687645615e-06, "loss": -0.0482, "num_tokens": 3910508.0, "reward": 0.6832884550094604, "reward_std": 0.28651636838912964, "rewards/rollout_reward_func/mean": 0.6832884550094604, "rewards/rollout_reward_func/std": 0.5421920418739319, "sampling/importance_sampling_ratio/max": 2.2406508922576904, "sampling/importance_sampling_ratio/mean": 0.9492145776748657, "sampling/importance_sampling_ratio/min": 0.06653837859630585, "sampling/sampling_logp_difference/max": 1.7305076122283936, "sampling/sampling_logp_difference/mean": 0.0882243663072586, "step": 397, "step_time": 4.744535244011786 }, { "clip_ratio/high_max": 0.09166666865348816, "clip_ratio/high_mean": 0.04583333432674408, "clip_ratio/low_mean": 0.0885416679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.13437500037252903, "entropy": 0.5519272265955806, "epoch": 0.00398, "grad_norm": 0.12794087827205658, "kl": 0.8313946202397346, "learning_rate": 9.999939353043252e-06, "loss": -0.054, "step": 398, "step_time": 2.0513699160146643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4036705046892166, "epoch": 0.00399, "frac_reward_zero_std": 0.25, "grad_norm": 3.579836130142212, "kl": 0.5456324703991413, "learning_rate": 9.999939017515304e-06, "loss": 0.0515, "num_tokens": 3930108.0, "reward": 0.4396668076515198, "reward_std": 0.035626932978630066, "rewards/rollout_reward_func/mean": 0.4396668076515198, "rewards/rollout_reward_func/std": 0.13220973312854767, "sampling/importance_sampling_ratio/max": 2.349329948425293, "sampling/importance_sampling_ratio/mean": 0.9782435894012451, "sampling/importance_sampling_ratio/min": 0.04388779029250145, "sampling/sampling_logp_difference/max": 1.974292516708374, "sampling/sampling_logp_difference/mean": 0.06316154450178146, "step": 399, "step_time": 4.538098834003904 }, { "clip_ratio/high_max": 0.04017857275903225, "clip_ratio/high_mean": 0.020089286379516125, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0290178582072258, "entropy": 0.3907371535897255, "epoch": 0.004, "grad_norm": 0.6010165810585022, "kl": 0.5448498092591763, "learning_rate": 9.99993868106177e-06, "loss": 0.0461, "step": 400, "step_time": 2.555338813996059 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 158.34375, "completions/mean_terminated_length": 158.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5981812132522464, "epoch": 0.00401, "frac_reward_zero_std": 0.25, "grad_norm": 0.23634546995162964, "kl": 0.719678808003664, "learning_rate": 9.999938343682654e-06, "loss": -0.0398, "num_tokens": 3948615.0, "reward": 0.5070961713790894, "reward_std": 0.08754942566156387, "rewards/rollout_reward_func/mean": 0.5070961713790894, "rewards/rollout_reward_func/std": 0.188239187002182, "sampling/importance_sampling_ratio/max": 1.0146362781524658, "sampling/importance_sampling_ratio/mean": 0.8634461164474487, "sampling/importance_sampling_ratio/min": 0.016748853027820587, "sampling/sampling_logp_difference/max": 1.644752025604248, "sampling/sampling_logp_difference/mean": 0.069610096514225, "step": 401, "step_time": 4.337090903005446 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.6109833559021354, "epoch": 0.00402, "grad_norm": 0.32602715492248535, "kl": 0.7184202186763287, "learning_rate": 9.999938005377952e-06, "loss": -0.0401, "step": 402, "step_time": 2.54176929499954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 176.15625, "completions/mean_terminated_length": 176.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.38299495726823807, "epoch": 0.00403, "frac_reward_zero_std": 0.5, "grad_norm": 0.12083485722541809, "kl": 0.5280619524419308, "learning_rate": 9.999937666147667e-06, "loss": -0.0187, "num_tokens": 3967740.0, "reward": 0.8728365302085876, "reward_std": 0.04450950399041176, "rewards/rollout_reward_func/mean": 0.8728365302085876, "rewards/rollout_reward_func/std": 0.3493533730506897, "sampling/importance_sampling_ratio/max": 1.14118492603302, "sampling/importance_sampling_ratio/mean": 0.9814561605453491, "sampling/importance_sampling_ratio/min": 0.0009955186396837234, "sampling/sampling_logp_difference/max": 1.6845765113830566, "sampling/sampling_logp_difference/mean": 0.04887201264500618, "step": 403, "step_time": 4.3866473009984475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3870580140501261, "epoch": 0.00404, "grad_norm": 0.11436112225055695, "kl": 0.5283026099205017, "learning_rate": 9.999937325991797e-06, "loss": -0.0191, "step": 404, "step_time": 2.068981892996817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 144.71875, "completions/mean_terminated_length": 144.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8631370225921273, "epoch": 0.00405, "frac_reward_zero_std": 0.25, "grad_norm": 0.08406799286603928, "kl": 0.6583860218524933, "learning_rate": 9.999936984910345e-06, "loss": -0.085, "num_tokens": 3985755.0, "reward": 0.9620610475540161, "reward_std": 0.06819941103458405, "rewards/rollout_reward_func/mean": 0.9620610475540161, "rewards/rollout_reward_func/std": 0.22537435591220856, "sampling/importance_sampling_ratio/max": 1.0463416576385498, "sampling/importance_sampling_ratio/mean": 0.8685111999511719, "sampling/importance_sampling_ratio/min": 3.8964402207916896e-19, "sampling/sampling_logp_difference/max": 4.077892303466797, "sampling/sampling_logp_difference/mean": 0.2853989005088806, "step": 405, "step_time": 4.446925560005184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8545569814741611, "epoch": 0.00406, "grad_norm": 0.08667614310979843, "kl": 0.6619053483009338, "learning_rate": 9.999936642903308e-06, "loss": -0.0849, "step": 406, "step_time": 2.5673692040072638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19833338726311922, "epoch": 0.00407, "frac_reward_zero_std": 0.75, "grad_norm": 0.019175158813595772, "kl": 0.42872704192996025, "learning_rate": 9.999936299970686e-06, "loss": -0.0161, "num_tokens": 4006967.0, "reward": 0.9872692227363586, "reward_std": 0.027196412906050682, "rewards/rollout_reward_func/mean": 0.9872692227363586, "rewards/rollout_reward_func/std": 0.39396706223487854, "sampling/importance_sampling_ratio/max": 0.9939207434654236, "sampling/importance_sampling_ratio/mean": 0.9536734223365784, "sampling/importance_sampling_ratio/min": 0.02292618155479431, "sampling/sampling_logp_difference/max": 2.2490153312683105, "sampling/sampling_logp_difference/mean": 0.022687040269374847, "step": 407, "step_time": 4.855654311002581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20267290994524956, "epoch": 0.00408, "grad_norm": 0.01726018823683262, "kl": 0.43199121579527855, "learning_rate": 9.999935956112484e-06, "loss": -0.0161, "step": 408, "step_time": 2.531547889011563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 149.4193572998047, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 1.0044182147830725, "epoch": 0.00409, "frac_reward_zero_std": 0.5, "grad_norm": 0.8054972290992737, "kl": 0.4899189919233322, "learning_rate": 9.999935611328696e-06, "loss": -0.0222, "num_tokens": 4025169.0, "reward": 0.6935096383094788, "reward_std": 0.06567186117172241, "rewards/rollout_reward_func/mean": 0.6935096383094788, "rewards/rollout_reward_func/std": 0.29705244302749634, "sampling/importance_sampling_ratio/max": 0.9945728778839111, "sampling/importance_sampling_ratio/mean": 0.8371671438217163, "sampling/importance_sampling_ratio/min": 1.619229728078153e-08, "sampling/sampling_logp_difference/max": 3.0392589569091797, "sampling/sampling_logp_difference/mean": 0.21607191860675812, "step": 409, "step_time": 4.371248296985868 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.03750000009313226, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05000000027939677, "entropy": 1.0272411182522774, "epoch": 0.0041, "grad_norm": 0.07910996675491333, "kl": 0.49071475118398666, "learning_rate": 9.999935265619325e-06, "loss": -0.0234, "step": 410, "step_time": 2.057387439010199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.31722853519022465, "epoch": 0.00411, "frac_reward_zero_std": 0.5, "grad_norm": 1.221916913986206, "kl": 0.5196779035031796, "learning_rate": 9.999934918984371e-06, "loss": -0.0185, "num_tokens": 4044729.0, "reward": 0.7721322774887085, "reward_std": 0.13031314313411713, "rewards/rollout_reward_func/mean": 0.7721322774887085, "rewards/rollout_reward_func/std": 0.36878448724746704, "sampling/importance_sampling_ratio/max": 1.4603173732757568, "sampling/importance_sampling_ratio/mean": 0.9162493944168091, "sampling/importance_sampling_ratio/min": 0.026484452188014984, "sampling/sampling_logp_difference/max": 2.414534330368042, "sampling/sampling_logp_difference/mean": 0.04993163421750069, "step": 411, "step_time": 4.606131736996758 }, { "clip_ratio/high_max": 0.04062500037252903, "clip_ratio/high_mean": 0.020312500186264515, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035937500186264515, "entropy": 0.3279524929821491, "epoch": 0.00412, "grad_norm": 0.05734618753194809, "kl": 0.5213808082044125, "learning_rate": 9.999934571423834e-06, "loss": -0.0202, "step": 412, "step_time": 2.5145816349977395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 221.25, "completions/mean_terminated_length": 221.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.10833728034049273, "epoch": 0.00413, "frac_reward_zero_std": 1.0, "grad_norm": 0.001245069899596274, "kl": 0.4513351209461689, "learning_rate": 9.999934222937713e-06, "loss": 0.0018, "num_tokens": 4065353.0, "reward": 0.7544615268707275, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7544615268707275, "rewards/rollout_reward_func/std": 0.1959151178598404, "sampling/importance_sampling_ratio/max": 1.2169570922851562, "sampling/importance_sampling_ratio/mean": 1.0083091259002686, "sampling/importance_sampling_ratio/min": 0.9777642488479614, "sampling/sampling_logp_difference/max": 0.22857794165611267, "sampling/sampling_logp_difference/mean": 0.0063660768792033195, "step": 413, "step_time": 5.004834037012188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1071372888982296, "epoch": 0.00414, "grad_norm": 0.001252758433111012, "kl": 0.45147883519530296, "learning_rate": 9.99993387352601e-06, "loss": 0.0018, "step": 414, "step_time": 2.4917284690091037 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2484180424362421, "epoch": 0.00415, "frac_reward_zero_std": 0.5, "grad_norm": 0.14794103801250458, "kl": 0.7092150151729584, "learning_rate": 9.999933523188722e-06, "loss": -0.0132, "num_tokens": 4081225.0, "reward": 0.7299038767814636, "reward_std": 0.09730062633752823, "rewards/rollout_reward_func/mean": 0.7299038767814636, "rewards/rollout_reward_func/std": 0.3176495432853699, "sampling/importance_sampling_ratio/max": 1.0516279935836792, "sampling/importance_sampling_ratio/mean": 0.9447219967842102, "sampling/importance_sampling_ratio/min": 0.17807923257350922, "sampling/sampling_logp_difference/max": 1.4078807830810547, "sampling/sampling_logp_difference/mean": 0.03430997580289841, "step": 415, "step_time": 3.92427894899447 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.25265052542090416, "epoch": 0.00416, "grad_norm": 0.2946039140224457, "kl": 0.6826515831053257, "learning_rate": 9.999933171925851e-06, "loss": -0.0138, "step": 416, "step_time": 2.028208537005412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 116.40625, "completions/mean_terminated_length": 116.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2889519613236189, "epoch": 0.00417, "frac_reward_zero_std": 0.75, "grad_norm": 0.036722514778375626, "kl": 0.6198297962546349, "learning_rate": 9.999932819737398e-06, "loss": -0.0249, "num_tokens": 4098574.0, "reward": 0.7827885150909424, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.7827885150909424, "rewards/rollout_reward_func/std": 0.20904876291751862, "sampling/importance_sampling_ratio/max": 0.9956132173538208, "sampling/importance_sampling_ratio/mean": 0.9528743028640747, "sampling/importance_sampling_ratio/min": 0.07097340375185013, "sampling/sampling_logp_difference/max": 1.7514361143112183, "sampling/sampling_logp_difference/mean": 0.026373393833637238, "step": 417, "step_time": 4.586234243004583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28919689171016216, "epoch": 0.00418, "grad_norm": 0.042793385684490204, "kl": 0.6206811964511871, "learning_rate": 9.999932466623362e-06, "loss": -0.025, "step": 418, "step_time": 2.039196495010401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7102918932214379, "epoch": 0.00419, "frac_reward_zero_std": 0.5, "grad_norm": 0.04365886002779007, "kl": 0.5424728542566299, "learning_rate": 9.999932112583745e-06, "loss": -0.0604, "num_tokens": 4117902.0, "reward": 0.9795851111412048, "reward_std": 0.09924162179231644, "rewards/rollout_reward_func/mean": 0.9795851111412048, "rewards/rollout_reward_func/std": 0.3551478981971741, "sampling/importance_sampling_ratio/max": 0.9933720231056213, "sampling/importance_sampling_ratio/mean": 0.8965594172477722, "sampling/importance_sampling_ratio/min": 1.2659786017622415e-19, "sampling/sampling_logp_difference/max": 2.975247859954834, "sampling/sampling_logp_difference/mean": 0.20612022280693054, "step": 419, "step_time": 5.556064805998176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7097962684929371, "epoch": 0.0042, "grad_norm": 0.038702670484781265, "kl": 0.5535204820334911, "learning_rate": 9.999931757618544e-06, "loss": -0.0605, "step": 420, "step_time": 2.2067513480069465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 228.1875, "completions/mean_terminated_length": 228.1875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.5967962304130197, "epoch": 0.00421, "frac_reward_zero_std": 0.5, "grad_norm": 0.028269272297620773, "kl": 0.4904729910194874, "learning_rate": 9.999931401727761e-06, "loss": -0.0641, "num_tokens": 4138804.0, "reward": 0.8833783268928528, "reward_std": 0.08900434523820877, "rewards/rollout_reward_func/mean": 0.8833783268928528, "rewards/rollout_reward_func/std": 0.22970542311668396, "sampling/importance_sampling_ratio/max": 1.0396840572357178, "sampling/importance_sampling_ratio/mean": 0.929973304271698, "sampling/importance_sampling_ratio/min": 1.9632118542695734e-18, "sampling/sampling_logp_difference/max": 4.471541881561279, "sampling/sampling_logp_difference/mean": 0.21631264686584473, "step": 421, "step_time": 4.725498918007361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5970263304188848, "epoch": 0.00422, "grad_norm": 0.033155109733343124, "kl": 0.4968566931784153, "learning_rate": 9.999931044911395e-06, "loss": -0.0641, "step": 422, "step_time": 2.0993273379936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 204.3125, "completions/mean_terminated_length": 204.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5062640998512506, "epoch": 0.00423, "frac_reward_zero_std": 0.5, "grad_norm": 0.03449878841638565, "kl": 0.6596762165427208, "learning_rate": 9.999930687169449e-06, "loss": -0.0573, "num_tokens": 4158742.0, "reward": 0.8523317575454712, "reward_std": 0.19345729053020477, "rewards/rollout_reward_func/mean": 0.8523317575454712, "rewards/rollout_reward_func/std": 0.5409366488456726, "sampling/importance_sampling_ratio/max": 0.9946108460426331, "sampling/importance_sampling_ratio/mean": 0.8951168060302734, "sampling/importance_sampling_ratio/min": 0.013383992947638035, "sampling/sampling_logp_difference/max": 1.8645336627960205, "sampling/sampling_logp_difference/mean": 0.0566217415034771, "step": 423, "step_time": 5.003540265992342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5078693805262446, "epoch": 0.00424, "grad_norm": 0.03656001016497612, "kl": 0.67705949395895, "learning_rate": 9.999930328501917e-06, "loss": -0.0573, "step": 424, "step_time": 2.0591624369990313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 121.03125, "completions/mean_terminated_length": 121.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5148836290463805, "epoch": 0.00425, "frac_reward_zero_std": 0.5, "grad_norm": 2.7441155910491943, "kl": 0.545395765453577, "learning_rate": 9.999929968908805e-06, "loss": -0.0428, "num_tokens": 4175687.0, "reward": 0.6644231081008911, "reward_std": 0.03381932154297829, "rewards/rollout_reward_func/mean": 0.6644231081008911, "rewards/rollout_reward_func/std": 0.1665218472480774, "sampling/importance_sampling_ratio/max": 2.5514326095581055, "sampling/importance_sampling_ratio/mean": 0.9563944339752197, "sampling/importance_sampling_ratio/min": 8.6596259052385e-09, "sampling/sampling_logp_difference/max": 7.628231048583984, "sampling/sampling_logp_difference/mean": 0.1468207836151123, "step": 425, "step_time": 4.626509134992375 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.033333334140479565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.058333334513008595, "entropy": 0.5257120486348867, "epoch": 0.00426, "grad_norm": 0.06196746602654457, "kl": 0.5869138687849045, "learning_rate": 9.99992960839011e-06, "loss": -0.0453, "step": 426, "step_time": 2.01440685599664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 100.9375, "completions/mean_terminated_length": 100.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.41519937571138144, "epoch": 0.00427, "frac_reward_zero_std": 0.75, "grad_norm": 0.029724357649683952, "kl": 0.64027339220047, "learning_rate": 9.999929246945834e-06, "loss": -0.0026, "num_tokens": 4191845.0, "reward": 0.831081748008728, "reward_std": 0.03253094106912613, "rewards/rollout_reward_func/mean": 0.831081748008728, "rewards/rollout_reward_func/std": 0.2857119143009186, "sampling/importance_sampling_ratio/max": 1.0009974241256714, "sampling/importance_sampling_ratio/mean": 0.9265941381454468, "sampling/importance_sampling_ratio/min": 0.006283692549914122, "sampling/sampling_logp_difference/max": 2.333338737487793, "sampling/sampling_logp_difference/mean": 0.05076524615287781, "step": 427, "step_time": 4.111005031008972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41718293260782957, "epoch": 0.00428, "grad_norm": 0.029660621657967567, "kl": 0.6399852484464645, "learning_rate": 9.999928884575976e-06, "loss": -0.0026, "step": 428, "step_time": 2.0242942239929107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.43105652183294296, "epoch": 0.00429, "frac_reward_zero_std": 0.5, "grad_norm": 0.029400130733847618, "kl": 0.5864811688661575, "learning_rate": 9.999928521280536e-06, "loss": 0.0021, "num_tokens": 4210895.0, "reward": 0.6298750042915344, "reward_std": 0.03603525459766388, "rewards/rollout_reward_func/mean": 0.6298750042915344, "rewards/rollout_reward_func/std": 0.3109738826751709, "sampling/importance_sampling_ratio/max": 0.9962356090545654, "sampling/importance_sampling_ratio/mean": 0.9280284643173218, "sampling/importance_sampling_ratio/min": 0.00019456013978924602, "sampling/sampling_logp_difference/max": 2.6097702980041504, "sampling/sampling_logp_difference/mean": 0.07607247680425644, "step": 429, "step_time": 5.108919317986874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43120027892291546, "epoch": 0.0043, "grad_norm": 0.02850642055273056, "kl": 0.5811182968318462, "learning_rate": 9.999928157059513e-06, "loss": 0.0021, "step": 430, "step_time": 2.0650951210118365 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.19740402046591043, "epoch": 0.00431, "frac_reward_zero_std": 0.5, "grad_norm": 0.6901423931121826, "kl": 0.6699011363089085, "learning_rate": 9.99992779191291e-06, "loss": -0.0275, "num_tokens": 4229239.0, "reward": 0.6268268823623657, "reward_std": 0.08566870540380478, "rewards/rollout_reward_func/mean": 0.6268268823623657, "rewards/rollout_reward_func/std": 0.1671849936246872, "sampling/importance_sampling_ratio/max": 1.4375226497650146, "sampling/importance_sampling_ratio/mean": 0.9596114158630371, "sampling/importance_sampling_ratio/min": 0.08059738576412201, "sampling/sampling_logp_difference/max": 2.0184543132781982, "sampling/sampling_logp_difference/mean": 0.03337537497282028, "step": 431, "step_time": 4.740680238995992 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.22557042073458433, "epoch": 0.00432, "grad_norm": 0.08262671530246735, "kl": 0.6525072604417801, "learning_rate": 9.999927425840725e-06, "loss": -0.029, "step": 432, "step_time": 2.044525886005431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 177.71875, "completions/mean_terminated_length": 177.71875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27940538711845875, "epoch": 0.00433, "frac_reward_zero_std": 0.75, "grad_norm": 0.051436156034469604, "kl": 0.48995715752243996, "learning_rate": 9.999927058842958e-06, "loss": 0.0208, "num_tokens": 4248222.0, "reward": 0.9986057877540588, "reward_std": 0.007479015737771988, "rewards/rollout_reward_func/mean": 0.9986057877540588, "rewards/rollout_reward_func/std": 0.1420421153306961, "sampling/importance_sampling_ratio/max": 1.0017344951629639, "sampling/importance_sampling_ratio/mean": 0.9556876420974731, "sampling/importance_sampling_ratio/min": 0.0037521785125136375, "sampling/sampling_logp_difference/max": 1.9082136154174805, "sampling/sampling_logp_difference/mean": 0.03785581886768341, "step": 433, "step_time": 4.403579235993675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2848894577473402, "epoch": 0.00434, "grad_norm": 0.05807250365614891, "kl": 0.47551362961530685, "learning_rate": 9.999926690919612e-06, "loss": 0.0207, "step": 434, "step_time": 2.031517369992798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.16939453221857548, "epoch": 0.00435, "frac_reward_zero_std": 0.75, "grad_norm": 0.031185468658804893, "kl": 0.5023097135126591, "learning_rate": 9.999926322070682e-06, "loss": 0.0304, "num_tokens": 4269618.0, "reward": 0.8246250152587891, "reward_std": 0.0017677652649581432, "rewards/rollout_reward_func/mean": 0.8246250152587891, "rewards/rollout_reward_func/std": 0.40076586604118347, "sampling/importance_sampling_ratio/max": 0.9941108226776123, "sampling/importance_sampling_ratio/mean": 0.9561952352523804, "sampling/importance_sampling_ratio/min": 0.00920185912400484, "sampling/sampling_logp_difference/max": 2.1107029914855957, "sampling/sampling_logp_difference/mean": 0.023425718769431114, "step": 435, "step_time": 5.332581399990886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16975790774449706, "epoch": 0.00436, "grad_norm": 0.029925694689154625, "kl": 0.4955662749707699, "learning_rate": 9.999925952296172e-06, "loss": 0.0304, "step": 436, "step_time": 2.515800218003278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07461619190871716, "epoch": 0.00437, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045879860408604145, "kl": 0.47817494347691536, "learning_rate": 9.99992558159608e-06, "loss": 0.0015, "num_tokens": 4288018.0, "reward": 0.8080769777297974, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8080769777297974, "rewards/rollout_reward_func/std": 0.23190359771251678, "sampling/importance_sampling_ratio/max": 0.9946244955062866, "sampling/importance_sampling_ratio/mean": 0.9912822246551514, "sampling/importance_sampling_ratio/min": 0.9874077439308167, "sampling/sampling_logp_difference/max": 0.0043300967663526535, "sampling/sampling_logp_difference/mean": 0.002445298945531249, "step": 437, "step_time": 4.056246479005495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07445511966943741, "epoch": 0.00438, "grad_norm": 0.0004734323301818222, "kl": 0.478156715631485, "learning_rate": 9.999925209970408e-06, "loss": 0.0015, "step": 438, "step_time": 2.0415206349935033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11429170612245798, "epoch": 0.00439, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042625574860721827, "kl": 0.6151896864175797, "learning_rate": 9.999924837419155e-06, "loss": 0.0015, "num_tokens": 4305074.0, "reward": 0.6451153755187988, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6451153755187988, "rewards/rollout_reward_func/std": 0.18644005060195923, "sampling/importance_sampling_ratio/max": 0.9973807334899902, "sampling/importance_sampling_ratio/mean": 0.9853714108467102, "sampling/importance_sampling_ratio/min": 0.9686482548713684, "sampling/sampling_logp_difference/max": 0.024608904495835304, "sampling/sampling_logp_difference/mean": 0.0049629854038357735, "step": 439, "step_time": 4.579865478008287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11383224464952946, "epoch": 0.0044, "grad_norm": 0.0004517913912422955, "kl": 0.6151705458760262, "learning_rate": 9.999924463942322e-06, "loss": 0.0015, "step": 440, "step_time": 2.0478040580128436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08655172027647495, "epoch": 0.00441, "frac_reward_zero_std": 0.75, "grad_norm": 0.016566196456551552, "kl": 0.48801594600081444, "learning_rate": 9.999924089539907e-06, "loss": 0.0015, "num_tokens": 4323794.0, "reward": 0.42557692527770996, "reward_std": 0.0409497395157814, "rewards/rollout_reward_func/mean": 0.42557692527770996, "rewards/rollout_reward_func/std": 0.10461375117301941, "sampling/importance_sampling_ratio/max": 0.9940600395202637, "sampling/importance_sampling_ratio/mean": 0.9870791435241699, "sampling/importance_sampling_ratio/min": 0.9766711592674255, "sampling/sampling_logp_difference/max": 0.01706593856215477, "sampling/sampling_logp_difference/mean": 0.0031848903745412827, "step": 441, "step_time": 4.974197891999211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08722496870905161, "epoch": 0.00442, "grad_norm": 0.01738954707980156, "kl": 0.4878831058740616, "learning_rate": 9.999923714211912e-06, "loss": 0.0015, "step": 442, "step_time": 2.464655625015439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 201.0625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.24234883021563292, "epoch": 0.00443, "frac_reward_zero_std": 0.75, "grad_norm": 0.024567656219005585, "kl": 0.5031914561986923, "learning_rate": 9.999923337958336e-06, "loss": -0.0267, "num_tokens": 4342820.0, "reward": 1.0163654088974, "reward_std": 0.051945146173238754, "rewards/rollout_reward_func/mean": 1.0163654088974, "rewards/rollout_reward_func/std": 0.3254527151584625, "sampling/importance_sampling_ratio/max": 1.0044496059417725, "sampling/importance_sampling_ratio/mean": 0.9557356238365173, "sampling/importance_sampling_ratio/min": 0.0021493772510439157, "sampling/sampling_logp_difference/max": 2.1036930084228516, "sampling/sampling_logp_difference/mean": 0.03475464880466461, "step": 443, "step_time": 4.627207168014138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24238535948097706, "epoch": 0.00444, "grad_norm": 0.021953782066702843, "kl": 0.4883089177310467, "learning_rate": 9.99992296077918e-06, "loss": -0.0267, "step": 444, "step_time": 2.039225605993124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.49312951415777206, "epoch": 0.00445, "frac_reward_zero_std": 0.75, "grad_norm": 0.05323029309511185, "kl": 0.73226597905159, "learning_rate": 9.999922582674445e-06, "loss": -0.0319, "num_tokens": 4360518.0, "reward": 0.6811953783035278, "reward_std": 0.21511097252368927, "rewards/rollout_reward_func/mean": 0.6811953783035278, "rewards/rollout_reward_func/std": 0.4650577902793884, "sampling/importance_sampling_ratio/max": 1.0240132808685303, "sampling/importance_sampling_ratio/mean": 0.9305558204650879, "sampling/importance_sampling_ratio/min": 2.5567919309521514e-11, "sampling/sampling_logp_difference/max": 3.0164594650268555, "sampling/sampling_logp_difference/mean": 0.13194330036640167, "step": 445, "step_time": 4.932338445003552 }, { "clip_ratio/high_max": 0.004999999888241291, "clip_ratio/high_mean": 0.0024999999441206455, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 0.4867311678826809, "epoch": 0.00446, "grad_norm": 0.06154410541057587, "kl": 0.7172872200608253, "learning_rate": 9.999922203644126e-06, "loss": -0.0319, "step": 446, "step_time": 2.643936964988825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.1875, "completions/mean_terminated_length": 141.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.38995820563286543, "epoch": 0.00447, "frac_reward_zero_std": 0.75, "grad_norm": 0.011672652326524258, "kl": 0.6045137122273445, "learning_rate": 9.99992182368823e-06, "loss": -0.0352, "num_tokens": 4378444.0, "reward": 0.6687067747116089, "reward_std": 0.0007207065937109292, "rewards/rollout_reward_func/mean": 0.6687067747116089, "rewards/rollout_reward_func/std": 0.2816472351551056, "sampling/importance_sampling_ratio/max": 0.9980484843254089, "sampling/importance_sampling_ratio/mean": 0.9531594514846802, "sampling/importance_sampling_ratio/min": 2.906402896130089e-10, "sampling/sampling_logp_difference/max": 3.9766340255737305, "sampling/sampling_logp_difference/mean": 0.09943848848342896, "step": 447, "step_time": 4.6115199429987115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38739162497222424, "epoch": 0.00448, "grad_norm": 0.009277190081775188, "kl": 0.5972922369837761, "learning_rate": 9.999921442806754e-06, "loss": -0.0353, "step": 448, "step_time": 2.4832661309992545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6365942787379026, "epoch": 0.00449, "frac_reward_zero_std": 0.25, "grad_norm": 0.270971417427063, "kl": 0.5661894083023071, "learning_rate": 9.999921060999696e-06, "loss": -0.0344, "num_tokens": 4396112.0, "reward": 0.6633942127227783, "reward_std": 0.2212877869606018, "rewards/rollout_reward_func/mean": 0.6633942127227783, "rewards/rollout_reward_func/std": 0.45057952404022217, "sampling/importance_sampling_ratio/max": 1.0380959510803223, "sampling/importance_sampling_ratio/mean": 0.86411052942276, "sampling/importance_sampling_ratio/min": 0.0035154910292476416, "sampling/sampling_logp_difference/max": 2.182088851928711, "sampling/sampling_logp_difference/mean": 0.10076983273029327, "step": 449, "step_time": 4.159559340994747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 0.6507970998063684, "epoch": 0.0045, "grad_norm": 0.2891238331794739, "kl": 0.5608807541429996, "learning_rate": 9.99992067826706e-06, "loss": -0.0353, "step": 450, "step_time": 2.0437157819833374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 1.1630965545773506, "epoch": 0.00451, "frac_reward_zero_std": 0.25, "grad_norm": 0.08143855631351471, "kl": 0.590413361787796, "learning_rate": 9.999920294608844e-06, "loss": -0.0704, "num_tokens": 4413832.0, "reward": 0.9694408774375916, "reward_std": 0.08320016413927078, "rewards/rollout_reward_func/mean": 0.9694408774375916, "rewards/rollout_reward_func/std": 0.22540482878684998, "sampling/importance_sampling_ratio/max": 1.015694499015808, "sampling/importance_sampling_ratio/mean": 0.8360400199890137, "sampling/importance_sampling_ratio/min": 4.4163495777072015e-11, "sampling/sampling_logp_difference/max": 3.0896196365356445, "sampling/sampling_logp_difference/mean": 0.2864947021007538, "step": 451, "step_time": 4.099895349012513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.172570469789207, "epoch": 0.00452, "grad_norm": 0.0666019394993782, "kl": 0.5949632227420807, "learning_rate": 9.999919910025047e-06, "loss": -0.0705, "step": 452, "step_time": 2.4850854389951564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 76.34375, "completions/mean_terminated_length": 76.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7474445002153516, "epoch": 0.00453, "frac_reward_zero_std": 0.5, "grad_norm": 0.035897836089134216, "kl": 0.8452129811048508, "learning_rate": 9.999919524515672e-06, "loss": 0.0015, "num_tokens": 4429411.0, "reward": 0.6002403497695923, "reward_std": 0.019717399030923843, "rewards/rollout_reward_func/mean": 0.6002403497695923, "rewards/rollout_reward_func/std": 0.12474802136421204, "sampling/importance_sampling_ratio/max": 0.9883947968482971, "sampling/importance_sampling_ratio/mean": 0.9193676114082336, "sampling/importance_sampling_ratio/min": 7.568958514689392e-15, "sampling/sampling_logp_difference/max": 13.775935173034668, "sampling/sampling_logp_difference/mean": 0.32367879152297974, "step": 453, "step_time": 3.791824425003142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7460203720256686, "epoch": 0.00454, "grad_norm": 0.036021918058395386, "kl": 0.8587566614151001, "learning_rate": 9.999919138080717e-06, "loss": 0.0015, "step": 454, "step_time": 2.4888465329931933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20633180998265743, "epoch": 0.00455, "frac_reward_zero_std": 0.75, "grad_norm": 0.09443801641464233, "kl": 1.0814041905105114, "learning_rate": 9.999918750720182e-06, "loss": -0.0211, "num_tokens": 4448235.0, "reward": 0.7397115230560303, "reward_std": 0.031275875866413116, "rewards/rollout_reward_func/mean": 0.7397115230560303, "rewards/rollout_reward_func/std": 0.2258348912000656, "sampling/importance_sampling_ratio/max": 0.9944967031478882, "sampling/importance_sampling_ratio/mean": 0.9264539480209351, "sampling/importance_sampling_ratio/min": 0.07728182524442673, "sampling/sampling_logp_difference/max": 2.0555849075317383, "sampling/sampling_logp_difference/mean": 0.028600014746189117, "step": 455, "step_time": 4.250030996998248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20933319069445133, "epoch": 0.00456, "grad_norm": 0.0952974483370781, "kl": 1.0838125608861446, "learning_rate": 9.99991836243407e-06, "loss": -0.0211, "step": 456, "step_time": 2.0456123069889145 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3196258023381233, "epoch": 0.00457, "frac_reward_zero_std": 0.5, "grad_norm": 0.13083720207214355, "kl": 0.5407583378255367, "learning_rate": 9.999917973222375e-06, "loss": 0.0065, "num_tokens": 4465703.0, "reward": 0.8224952220916748, "reward_std": 0.043955203145742416, "rewards/rollout_reward_func/mean": 0.8224952220916748, "rewards/rollout_reward_func/std": 0.19304901361465454, "sampling/importance_sampling_ratio/max": 1.250219464302063, "sampling/importance_sampling_ratio/mean": 0.9084590673446655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.643455147743225, "sampling/sampling_logp_difference/mean": 0.0645546168088913, "step": 457, "step_time": 4.205687816000136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3424372375011444, "epoch": 0.00458, "grad_norm": 0.14996856451034546, "kl": 0.540143258869648, "learning_rate": 9.999917583085104e-06, "loss": 0.0066, "step": 458, "step_time": 2.4886567889989237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 100.6875, "completions/mean_terminated_length": 100.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25587387196719646, "epoch": 0.00459, "frac_reward_zero_std": 0.75, "grad_norm": 0.13475684821605682, "kl": 1.6459665223956108, "learning_rate": 9.999917192022251e-06, "loss": -0.0042, "num_tokens": 4482141.0, "reward": 0.6490865349769592, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.6490865349769592, "rewards/rollout_reward_func/std": 0.23493845760822296, "sampling/importance_sampling_ratio/max": 1.0020272731781006, "sampling/importance_sampling_ratio/mean": 0.9573119878768921, "sampling/importance_sampling_ratio/min": 0.06168561056256294, "sampling/sampling_logp_difference/max": 1.8332679271697998, "sampling/sampling_logp_difference/mean": 0.033634912222623825, "step": 459, "step_time": 4.523063954016834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25972389336675406, "epoch": 0.0046, "grad_norm": 0.14279940724372864, "kl": 1.713817998766899, "learning_rate": 9.999916800033823e-06, "loss": -0.004, "step": 460, "step_time": 2.027156273004948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 304.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 119.67741394042969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5423665205016732, "epoch": 0.00461, "frac_reward_zero_std": 0.5, "grad_norm": 0.04722604900598526, "kl": 0.7175898477435112, "learning_rate": 9.999916407119812e-06, "loss": -0.0537, "num_tokens": 4499291.0, "reward": 0.8875672817230225, "reward_std": 0.10130664706230164, "rewards/rollout_reward_func/mean": 0.8875672817230225, "rewards/rollout_reward_func/std": 0.3026984930038452, "sampling/importance_sampling_ratio/max": 0.9909653663635254, "sampling/importance_sampling_ratio/mean": 0.9214380979537964, "sampling/importance_sampling_ratio/min": 1.0829614842847361e-17, "sampling/sampling_logp_difference/max": 3.205220937728882, "sampling/sampling_logp_difference/mean": 0.21174496412277222, "step": 461, "step_time": 4.473915180009499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5462047029286623, "epoch": 0.00462, "grad_norm": 0.04303519055247307, "kl": 0.7151229158043861, "learning_rate": 9.999916013280226e-06, "loss": -0.0537, "step": 462, "step_time": 2.0611683880051714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5633796248584986, "epoch": 0.00463, "frac_reward_zero_std": 0.75, "grad_norm": 0.010414984077215195, "kl": 0.4820668622851372, "learning_rate": 9.999915618515059e-06, "loss": -0.0339, "num_tokens": 4517839.0, "reward": 0.5691730976104736, "reward_std": 0.00948344450443983, "rewards/rollout_reward_func/mean": 0.5691730976104736, "rewards/rollout_reward_func/std": 0.20653508603572845, "sampling/importance_sampling_ratio/max": 0.9977564215660095, "sampling/importance_sampling_ratio/mean": 0.9232521057128906, "sampling/importance_sampling_ratio/min": 5.300386732614094e-19, "sampling/sampling_logp_difference/max": 4.249969959259033, "sampling/sampling_logp_difference/mean": 0.18887168169021606, "step": 463, "step_time": 4.692500793011277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.57205262593925, "epoch": 0.00464, "grad_norm": 0.00986580178141594, "kl": 0.4763948358595371, "learning_rate": 9.999915222824314e-06, "loss": -0.0339, "step": 464, "step_time": 2.513763826995273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 95.75, "completions/mean_terminated_length": 95.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.16335149854421616, "epoch": 0.00465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014188832137733698, "kl": 0.5795404762029648, "learning_rate": 9.999914826207992e-06, "loss": 0.0014, "num_tokens": 4533759.0, "reward": 0.49038463830947876, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.49038463830947876, "rewards/rollout_reward_func/std": 0.17652884125709534, "sampling/importance_sampling_ratio/max": 1.0099948644638062, "sampling/importance_sampling_ratio/mean": 0.9793744683265686, "sampling/importance_sampling_ratio/min": 0.9428086876869202, "sampling/sampling_logp_difference/max": 0.03194813057780266, "sampling/sampling_logp_difference/mean": 0.007743035443127155, "step": 465, "step_time": 4.268358294990321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16743635013699532, "epoch": 0.00466, "grad_norm": 0.0014327630633488297, "kl": 0.5790991336107254, "learning_rate": 9.99991442866609e-06, "loss": 0.0014, "step": 466, "step_time": 2.0281171099995845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.826668968424201, "epoch": 0.00467, "frac_reward_zero_std": 0.25, "grad_norm": 0.024829760193824768, "kl": 0.6450798958539963, "learning_rate": 9.999914030198609e-06, "loss": -0.0268, "num_tokens": 4552423.0, "reward": 0.6171923279762268, "reward_std": 0.4048920273780823, "rewards/rollout_reward_func/mean": 0.6171923279762268, "rewards/rollout_reward_func/std": 0.6546569466590881, "sampling/importance_sampling_ratio/max": 0.995599627494812, "sampling/importance_sampling_ratio/mean": 0.8656901717185974, "sampling/importance_sampling_ratio/min": 2.3837130356696434e-05, "sampling/sampling_logp_difference/max": 2.546670436859131, "sampling/sampling_logp_difference/mean": 0.1209394633769989, "step": 467, "step_time": 4.652685584005667 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.8179941549897194, "epoch": 0.00468, "grad_norm": 0.02433602139353752, "kl": 0.6380603723227978, "learning_rate": 9.999913630805554e-06, "loss": -0.0269, "step": 468, "step_time": 2.045503004002967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 189.65625, "completions/mean_terminated_length": 189.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.297340034507215, "epoch": 0.00469, "frac_reward_zero_std": 0.75, "grad_norm": 0.026356013491749763, "kl": 0.5422541536390781, "learning_rate": 9.999913230486916e-06, "loss": -0.0159, "num_tokens": 4571028.0, "reward": 0.8264135122299194, "reward_std": 0.055296000093221664, "rewards/rollout_reward_func/mean": 0.8264135122299194, "rewards/rollout_reward_func/std": 0.310303658246994, "sampling/importance_sampling_ratio/max": 1.0011235475540161, "sampling/importance_sampling_ratio/mean": 0.9546070098876953, "sampling/importance_sampling_ratio/min": 0.0046121892519295216, "sampling/sampling_logp_difference/max": 2.4195685386657715, "sampling/sampling_logp_difference/mean": 0.032675255089998245, "step": 469, "step_time": 4.330048976997205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2978465938940644, "epoch": 0.0047, "grad_norm": 0.02623055689036846, "kl": 0.5424918755888939, "learning_rate": 9.999912829242704e-06, "loss": -0.0159, "step": 470, "step_time": 2.5583944330064696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.15625, "completions/mean_terminated_length": 168.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7240169458091259, "epoch": 0.00471, "frac_reward_zero_std": 0.75, "grad_norm": 0.05252303555607796, "kl": 0.7531380578875542, "learning_rate": 9.999912427072911e-06, "loss": 0.0167, "num_tokens": 4589073.0, "reward": 0.5266153812408447, "reward_std": 0.008390634320676327, "rewards/rollout_reward_func/mean": 0.5266153812408447, "rewards/rollout_reward_func/std": 0.0817488431930542, "sampling/importance_sampling_ratio/max": 0.9866698980331421, "sampling/importance_sampling_ratio/mean": 0.8767682909965515, "sampling/importance_sampling_ratio/min": 9.782077728945956e-15, "sampling/sampling_logp_difference/max": 2.9262266159057617, "sampling/sampling_logp_difference/mean": 0.20510363578796387, "step": 471, "step_time": 5.044266899007198 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.7251759078353643, "epoch": 0.00472, "grad_norm": 0.048832617700099945, "kl": 0.7395668551325798, "learning_rate": 9.999912023977543e-06, "loss": 0.0166, "step": 472, "step_time": 2.0494499420092325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 264.6875, "completions/mean_terminated_length": 264.6875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.34821379091590643, "epoch": 0.00473, "frac_reward_zero_std": 0.75, "grad_norm": 0.029331333935260773, "kl": 0.44722984731197357, "learning_rate": 9.999911619956595e-06, "loss": -0.0455, "num_tokens": 4610399.0, "reward": 1.115173101425171, "reward_std": 0.042862433940172195, "rewards/rollout_reward_func/mean": 1.115173101425171, "rewards/rollout_reward_func/std": 0.34275901317596436, "sampling/importance_sampling_ratio/max": 0.9997990131378174, "sampling/importance_sampling_ratio/mean": 0.9211794137954712, "sampling/importance_sampling_ratio/min": 0.014249475672841072, "sampling/sampling_logp_difference/max": 2.5335700511932373, "sampling/sampling_logp_difference/mean": 0.03970300033688545, "step": 473, "step_time": 4.849308944991208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3441797746345401, "epoch": 0.00474, "grad_norm": 0.029501089826226234, "kl": 0.4453612267971039, "learning_rate": 9.999911215010072e-06, "loss": -0.0454, "step": 474, "step_time": 2.077017302006425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 146.28125, "completions/mean_terminated_length": 146.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6049955645576119, "epoch": 0.00475, "frac_reward_zero_std": 0.5, "grad_norm": 0.04661089926958084, "kl": 0.5699125863611698, "learning_rate": 9.99991080913797e-06, "loss": 0.0008, "num_tokens": 4628376.0, "reward": 0.7182211875915527, "reward_std": 0.03358757495880127, "rewards/rollout_reward_func/mean": 0.7182211875915527, "rewards/rollout_reward_func/std": 0.15242373943328857, "sampling/importance_sampling_ratio/max": 1.0082248449325562, "sampling/importance_sampling_ratio/mean": 0.8948543071746826, "sampling/importance_sampling_ratio/min": 0.0029255663976073265, "sampling/sampling_logp_difference/max": 2.492363691329956, "sampling/sampling_logp_difference/mean": 0.08582600206136703, "step": 475, "step_time": 4.654042769994703 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5984050342813134, "epoch": 0.00476, "grad_norm": 0.03257519006729126, "kl": 0.5760231576859951, "learning_rate": 9.999910402340289e-06, "loss": 0.0008, "step": 476, "step_time": 2.0376069110134267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 55.3125, "completions/mean_terminated_length": 55.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8914230968803167, "epoch": 0.00477, "frac_reward_zero_std": 0.5, "grad_norm": 0.01546529121696949, "kl": 0.7915550693869591, "learning_rate": 9.999909994617032e-06, "loss": -0.036, "num_tokens": 4643506.0, "reward": 0.6865384578704834, "reward_std": 0.038074977695941925, "rewards/rollout_reward_func/mean": 0.6865384578704834, "rewards/rollout_reward_func/std": 0.09759464859962463, "sampling/importance_sampling_ratio/max": 0.9883425235748291, "sampling/importance_sampling_ratio/mean": 0.9106662273406982, "sampling/importance_sampling_ratio/min": 3.3994854220509296e-06, "sampling/sampling_logp_difference/max": 2.349097967147827, "sampling/sampling_logp_difference/mean": 0.2066306471824646, "step": 477, "step_time": 4.2182492629945045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8949822634458542, "epoch": 0.00478, "grad_norm": 0.016084544360637665, "kl": 0.7904470413923264, "learning_rate": 9.9999095859682e-06, "loss": -0.036, "step": 478, "step_time": 2.035225628002081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 117.78125, "completions/mean_terminated_length": 117.78125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8714675325900316, "epoch": 0.00479, "frac_reward_zero_std": 0.5, "grad_norm": 0.03041725978255272, "kl": 0.6603813096880913, "learning_rate": 9.99990917639379e-06, "loss": -0.0523, "num_tokens": 4660675.0, "reward": 0.6141346096992493, "reward_std": 0.07671020925045013, "rewards/rollout_reward_func/mean": 0.6141346096992493, "rewards/rollout_reward_func/std": 0.335403174161911, "sampling/importance_sampling_ratio/max": 0.9980865716934204, "sampling/importance_sampling_ratio/mean": 0.8858953714370728, "sampling/importance_sampling_ratio/min": 3.4260580150657916e-07, "sampling/sampling_logp_difference/max": 3.020073652267456, "sampling/sampling_logp_difference/mean": 0.1735190600156784, "step": 479, "step_time": 4.1384879120087135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8652054127305746, "epoch": 0.0048, "grad_norm": 0.030290652066469193, "kl": 0.654934711754322, "learning_rate": 9.999908765893802e-06, "loss": -0.0523, "step": 480, "step_time": 2.0190950020041782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4456351734697819, "epoch": 0.00481, "frac_reward_zero_std": 0.5, "grad_norm": 0.02973560243844986, "kl": 0.6082111485302448, "learning_rate": 9.999908354468237e-06, "loss": -0.027, "num_tokens": 4677823.0, "reward": 0.7526923418045044, "reward_std": 0.12510351836681366, "rewards/rollout_reward_func/mean": 0.7526923418045044, "rewards/rollout_reward_func/std": 0.2771950662136078, "sampling/importance_sampling_ratio/max": 0.9905909299850464, "sampling/importance_sampling_ratio/mean": 0.9474376440048218, "sampling/importance_sampling_ratio/min": 1.94481744983932e-05, "sampling/sampling_logp_difference/max": 2.4684207439422607, "sampling/sampling_logp_difference/mean": 0.08943213522434235, "step": 481, "step_time": 4.635140619990125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44433479383587837, "epoch": 0.00482, "grad_norm": 0.032435208559036255, "kl": 0.60931646078825, "learning_rate": 9.999907942117095e-06, "loss": -0.0271, "step": 482, "step_time": 2.4748701990029076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 155.8125, "completions/mean_terminated_length": 155.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5710101313889027, "epoch": 0.00483, "frac_reward_zero_std": 0.75, "grad_norm": 0.027102680876851082, "kl": 0.7342284843325615, "learning_rate": 9.999907528840379e-06, "loss": -0.046, "num_tokens": 4696217.0, "reward": 0.4300192594528198, "reward_std": 0.06045510247349739, "rewards/rollout_reward_func/mean": 0.4300192594528198, "rewards/rollout_reward_func/std": 1.1785123348236084, "sampling/importance_sampling_ratio/max": 1.0123002529144287, "sampling/importance_sampling_ratio/mean": 0.9146372079849243, "sampling/importance_sampling_ratio/min": 3.8196643004084763e-07, "sampling/sampling_logp_difference/max": 2.507732629776001, "sampling/sampling_logp_difference/mean": 0.09393294155597687, "step": 483, "step_time": 4.754221685005177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.566102484241128, "epoch": 0.00484, "grad_norm": 0.025912266224622726, "kl": 0.719513550400734, "learning_rate": 9.999907114638084e-06, "loss": -0.0461, "step": 484, "step_time": 2.077485539004556 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 148.59375, "completions/mean_terminated_length": 148.59375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2913481146097183, "epoch": 0.00485, "frac_reward_zero_std": 0.5, "grad_norm": 0.031801171600818634, "kl": 0.4774562083184719, "learning_rate": 9.999906699510213e-06, "loss": 0.02, "num_tokens": 4714324.0, "reward": 0.6834135055541992, "reward_std": 0.061693087220191956, "rewards/rollout_reward_func/mean": 0.6834135055541992, "rewards/rollout_reward_func/std": 0.28052735328674316, "sampling/importance_sampling_ratio/max": 1.0006895065307617, "sampling/importance_sampling_ratio/mean": 0.9548560380935669, "sampling/importance_sampling_ratio/min": 0.019109264016151428, "sampling/sampling_logp_difference/max": 1.7195672988891602, "sampling/sampling_logp_difference/mean": 0.029782282188534737, "step": 485, "step_time": 4.155678166003781 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.28063962049782276, "epoch": 0.00486, "grad_norm": 0.03175170719623566, "kl": 0.4755147621035576, "learning_rate": 9.999906283456766e-06, "loss": 0.02, "step": 486, "step_time": 2.0186631030082935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.8489683624356985, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.28242477774620056, "kl": 0.5771954692900181, "learning_rate": 9.999905866477743e-06, "loss": -0.0659, "num_tokens": 4732374.0, "reward": 0.8450288772583008, "reward_std": 0.23424533009529114, "rewards/rollout_reward_func/mean": 0.8450288772583008, "rewards/rollout_reward_func/std": 0.468122273683548, "sampling/importance_sampling_ratio/max": 1.0282185077667236, "sampling/importance_sampling_ratio/mean": 0.846875786781311, "sampling/importance_sampling_ratio/min": 5.855599644862062e-16, "sampling/sampling_logp_difference/max": 4.250287055969238, "sampling/sampling_logp_difference/mean": 0.2776850461959839, "step": 487, "step_time": 4.855947438998555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8202155586332083, "epoch": 0.00488, "grad_norm": 0.261038601398468, "kl": 0.572563786059618, "learning_rate": 9.999905448573144e-06, "loss": -0.0666, "step": 488, "step_time": 2.5280878719931934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.334214941598475, "epoch": 0.00489, "frac_reward_zero_std": 0.75, "grad_norm": 0.01350339874625206, "kl": 0.49733585864305496, "learning_rate": 9.999905029742968e-06, "loss": -0.0258, "num_tokens": 4750898.0, "reward": 0.8786057829856873, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.8786057829856873, "rewards/rollout_reward_func/std": 0.31409478187561035, "sampling/importance_sampling_ratio/max": 0.9920508861541748, "sampling/importance_sampling_ratio/mean": 0.9421569108963013, "sampling/importance_sampling_ratio/min": 0.0005034581990912557, "sampling/sampling_logp_difference/max": 2.6710736751556396, "sampling/sampling_logp_difference/mean": 0.050853848457336426, "step": 489, "step_time": 4.338743697000609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.3286138288676739, "epoch": 0.0049, "grad_norm": 0.013253342360258102, "kl": 0.4987194649875164, "learning_rate": 9.99990460998722e-06, "loss": -0.0258, "step": 490, "step_time": 2.036150242005533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 191.03125, "completions/mean_terminated_length": 191.03125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6815328076481819, "epoch": 0.00491, "frac_reward_zero_std": 0.5, "grad_norm": 0.02768758311867714, "kl": 0.440030001103878, "learning_rate": 9.999904189305892e-06, "loss": 0.0008, "num_tokens": 4770003.0, "reward": 0.6528846025466919, "reward_std": 0.045146044343709946, "rewards/rollout_reward_func/mean": 0.6528846025466919, "rewards/rollout_reward_func/std": 0.32634079456329346, "sampling/importance_sampling_ratio/max": 0.9861879944801331, "sampling/importance_sampling_ratio/mean": 0.9170041680335999, "sampling/importance_sampling_ratio/min": 3.986772996573426e-19, "sampling/sampling_logp_difference/max": 3.7219085693359375, "sampling/sampling_logp_difference/mean": 0.21305713057518005, "step": 491, "step_time": 4.688890982979501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6632618568837643, "epoch": 0.00492, "grad_norm": 0.027174631133675575, "kl": 0.4373573996126652, "learning_rate": 9.999903767698988e-06, "loss": 0.0007, "step": 492, "step_time": 2.034840059997805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3809137651696801, "epoch": 0.00493, "frac_reward_zero_std": 0.75, "grad_norm": 0.03674039617180824, "kl": 0.6694633290171623, "learning_rate": 9.999903345166511e-06, "loss": -0.0261, "num_tokens": 4788663.0, "reward": 0.751028835773468, "reward_std": 0.0431063212454319, "rewards/rollout_reward_func/mean": 0.751028835773468, "rewards/rollout_reward_func/std": 0.09905389696359634, "sampling/importance_sampling_ratio/max": 1.0088202953338623, "sampling/importance_sampling_ratio/mean": 0.9484118223190308, "sampling/importance_sampling_ratio/min": 0.00023771512496750802, "sampling/sampling_logp_difference/max": 4.003372669219971, "sampling/sampling_logp_difference/mean": 0.07161549478769302, "step": 493, "step_time": 5.1513056679978035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37834729719907045, "epoch": 0.00494, "grad_norm": 0.038031574338674545, "kl": 0.6917313486337662, "learning_rate": 9.999902921708457e-06, "loss": -0.026, "step": 494, "step_time": 2.487636765006755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4047119477763772, "epoch": 0.00495, "frac_reward_zero_std": 0.5, "grad_norm": 0.01940837875008583, "kl": 0.5204099528491497, "learning_rate": 9.999902497324827e-06, "loss": -0.0272, "num_tokens": 4807031.0, "reward": 0.8054408431053162, "reward_std": 0.057644911110401154, "rewards/rollout_reward_func/mean": 0.8054408431053162, "rewards/rollout_reward_func/std": 0.27544349431991577, "sampling/importance_sampling_ratio/max": 1.0028060674667358, "sampling/importance_sampling_ratio/mean": 0.9595072269439697, "sampling/importance_sampling_ratio/min": 5.248012311179218e-13, "sampling/sampling_logp_difference/max": 3.121196746826172, "sampling/sampling_logp_difference/mean": 0.13147157430648804, "step": 495, "step_time": 4.603484095998283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4012466473504901, "epoch": 0.00496, "grad_norm": 0.020308317616581917, "kl": 0.5205800198018551, "learning_rate": 9.999902072015623e-06, "loss": -0.0272, "step": 496, "step_time": 2.0617393470092793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42896400950849056, "epoch": 0.00497, "frac_reward_zero_std": 0.75, "grad_norm": 0.09235631674528122, "kl": 0.5593158230185509, "learning_rate": 9.999901645780843e-06, "loss": -0.032, "num_tokens": 4823919.0, "reward": 0.6313942670822144, "reward_std": 0.02744312584400177, "rewards/rollout_reward_func/mean": 0.6313942670822144, "rewards/rollout_reward_func/std": 0.12315620481967926, "sampling/importance_sampling_ratio/max": 0.9977195858955383, "sampling/importance_sampling_ratio/mean": 0.9176896810531616, "sampling/importance_sampling_ratio/min": 0.04985363408923149, "sampling/sampling_logp_difference/max": 1.9996602535247803, "sampling/sampling_logp_difference/mean": 0.04439635947346687, "step": 497, "step_time": 4.5257534720003605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4200271889567375, "epoch": 0.00498, "grad_norm": 0.08736501634120941, "kl": 0.5612993240356445, "learning_rate": 9.99990121862049e-06, "loss": -0.032, "step": 498, "step_time": 2.5056525010149926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.15370612405240536, "epoch": 0.00499, "frac_reward_zero_std": 1.0, "grad_norm": 0.001184661639854312, "kl": 0.5178538225591183, "learning_rate": 9.99990079053456e-06, "loss": 0.0014, "num_tokens": 4841087.0, "reward": 0.6573077440261841, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6573077440261841, "rewards/rollout_reward_func/std": 0.18200428783893585, "sampling/importance_sampling_ratio/max": 1.2824770212173462, "sampling/importance_sampling_ratio/mean": 0.9834368228912354, "sampling/importance_sampling_ratio/min": 0.4878779649734497, "sampling/sampling_logp_difference/max": 0.604356050491333, "sampling/sampling_logp_difference/mean": 0.013900745660066605, "step": 499, "step_time": 4.558660957998654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1506170155480504, "epoch": 0.005, "grad_norm": 0.001148501061834395, "kl": 0.5181662440299988, "learning_rate": 9.999900361523054e-06, "loss": 0.0014, "step": 500, "step_time": 2.0148197219969006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.315537815913558, "epoch": 0.00501, "frac_reward_zero_std": 0.5, "grad_norm": 0.04536471515893936, "kl": 0.5309115573763847, "learning_rate": 9.999899931585976e-06, "loss": -0.0323, "num_tokens": 4859358.0, "reward": 0.4440913200378418, "reward_std": 0.3216869533061981, "rewards/rollout_reward_func/mean": 0.4440913200378418, "rewards/rollout_reward_func/std": 0.9213410019874573, "sampling/importance_sampling_ratio/max": 1.224729061126709, "sampling/importance_sampling_ratio/mean": 0.9682892560958862, "sampling/importance_sampling_ratio/min": 0.01854010671377182, "sampling/sampling_logp_difference/max": 1.465319275856018, "sampling/sampling_logp_difference/mean": 0.04041832685470581, "step": 501, "step_time": 4.545863059989642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3142246715724468, "epoch": 0.00502, "grad_norm": 0.04272058233618736, "kl": 0.5292160995304585, "learning_rate": 9.999899500723323e-06, "loss": -0.0325, "step": 502, "step_time": 2.033518209012982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2130447467789054, "epoch": 0.00503, "frac_reward_zero_std": 0.75, "grad_norm": 0.016644157469272614, "kl": 0.5854885168373585, "learning_rate": 9.999899068935093e-06, "loss": -0.0264, "num_tokens": 4875876.0, "reward": 0.5978990793228149, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.5978990793228149, "rewards/rollout_reward_func/std": 0.20709340274333954, "sampling/importance_sampling_ratio/max": 1.0289722681045532, "sampling/importance_sampling_ratio/mean": 0.9657047986984253, "sampling/importance_sampling_ratio/min": 0.010823848657310009, "sampling/sampling_logp_difference/max": 2.265242099761963, "sampling/sampling_logp_difference/mean": 0.031045347452163696, "step": 503, "step_time": 4.222365351997723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21010884921997786, "epoch": 0.00504, "grad_norm": 0.02004918083548546, "kl": 0.6003467477858067, "learning_rate": 9.99989863622129e-06, "loss": -0.0264, "step": 504, "step_time": 2.5213111770062824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4803332071751356, "epoch": 0.00505, "frac_reward_zero_std": 0.5, "grad_norm": 0.017544668167829514, "kl": 0.5506593510508537, "learning_rate": 9.999898202581914e-06, "loss": -0.0168, "num_tokens": 4894440.0, "reward": 1.0983893871307373, "reward_std": 0.07605478167533875, "rewards/rollout_reward_func/mean": 1.0983893871307373, "rewards/rollout_reward_func/std": 0.28413254022598267, "sampling/importance_sampling_ratio/max": 1.0351777076721191, "sampling/importance_sampling_ratio/mean": 0.9354905486106873, "sampling/importance_sampling_ratio/min": 0.00022654722852166742, "sampling/sampling_logp_difference/max": 2.675424098968506, "sampling/sampling_logp_difference/mean": 0.08838900923728943, "step": 505, "step_time": 5.300863530996139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.480527488514781, "epoch": 0.00506, "grad_norm": 0.018634837120771408, "kl": 0.552667111158371, "learning_rate": 9.999897768016961e-06, "loss": -0.0168, "step": 506, "step_time": 2.048461682992638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.6875, "completions/mean_terminated_length": 146.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.32599077839404345, "epoch": 0.00507, "frac_reward_zero_std": 0.75, "grad_norm": 0.017311876639723778, "kl": 0.6021274849772453, "learning_rate": 9.999897332526437e-06, "loss": -0.0175, "num_tokens": 4912030.0, "reward": 0.8450673222541809, "reward_std": 0.013190259225666523, "rewards/rollout_reward_func/mean": 0.8450673222541809, "rewards/rollout_reward_func/std": 0.2821946144104004, "sampling/importance_sampling_ratio/max": 1.1522067785263062, "sampling/importance_sampling_ratio/mean": 0.954047441482544, "sampling/importance_sampling_ratio/min": 0.005942960269749165, "sampling/sampling_logp_difference/max": 2.431516647338867, "sampling/sampling_logp_difference/mean": 0.04595262557268143, "step": 507, "step_time": 4.388535847014282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33011906407773495, "epoch": 0.00508, "grad_norm": 0.018945960327982903, "kl": 0.6177117861807346, "learning_rate": 9.999896896110337e-06, "loss": -0.0174, "step": 508, "step_time": 2.0566305890170042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22873554937541485, "epoch": 0.00509, "frac_reward_zero_std": 0.75, "grad_norm": 0.02960900217294693, "kl": 0.6390590071678162, "learning_rate": 9.999896458768663e-06, "loss": -0.0252, "num_tokens": 4929238.0, "reward": 0.6426442861557007, "reward_std": 0.008838837966322899, "rewards/rollout_reward_func/mean": 0.6426442861557007, "rewards/rollout_reward_func/std": 0.11364023387432098, "sampling/importance_sampling_ratio/max": 0.9925007820129395, "sampling/importance_sampling_ratio/mean": 0.9545032978057861, "sampling/importance_sampling_ratio/min": 0.04597193002700806, "sampling/sampling_logp_difference/max": 1.9531068801879883, "sampling/sampling_logp_difference/mean": 0.02299826592206955, "step": 509, "step_time": 4.365732682002999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22709515690803528, "epoch": 0.0051, "grad_norm": 0.02966729737818241, "kl": 0.6393866240978241, "learning_rate": 9.999896020501416e-06, "loss": -0.0252, "step": 510, "step_time": 2.4674164009993547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.71875, "completions/mean_terminated_length": 191.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22018816228955984, "epoch": 0.00511, "frac_reward_zero_std": 0.75, "grad_norm": 0.032085780054330826, "kl": 0.48805422335863113, "learning_rate": 9.999895581308597e-06, "loss": -0.0357, "num_tokens": 4947997.0, "reward": 1.2226442098617554, "reward_std": 0.0524618923664093, "rewards/rollout_reward_func/mean": 1.2226442098617554, "rewards/rollout_reward_func/std": 0.1412661373615265, "sampling/importance_sampling_ratio/max": 1.0127975940704346, "sampling/importance_sampling_ratio/mean": 0.9684109687805176, "sampling/importance_sampling_ratio/min": 0.0531337633728981, "sampling/sampling_logp_difference/max": 1.6632403135299683, "sampling/sampling_logp_difference/mean": 0.018794210627675056, "step": 511, "step_time": 4.859654937994492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22285317396745086, "epoch": 0.00512, "grad_norm": 0.028346702456474304, "kl": 0.48205098882317543, "learning_rate": 9.999895141190201e-06, "loss": -0.0357, "step": 512, "step_time": 2.0419278299814323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2436061892658472, "epoch": 0.00513, "frac_reward_zero_std": 0.5, "grad_norm": 0.5122861862182617, "kl": 0.5274320170283318, "learning_rate": 9.999894700146234e-06, "loss": -0.0387, "num_tokens": 4966945.0, "reward": 1.0026923418045044, "reward_std": 0.09681924432516098, "rewards/rollout_reward_func/mean": 1.0026923418045044, "rewards/rollout_reward_func/std": 0.2555904984474182, "sampling/importance_sampling_ratio/max": 1.0383156538009644, "sampling/importance_sampling_ratio/mean": 0.9484351873397827, "sampling/importance_sampling_ratio/min": 0.029977748170495033, "sampling/sampling_logp_difference/max": 1.7489140033721924, "sampling/sampling_logp_difference/mean": 0.02804543636739254, "step": 513, "step_time": 4.221329029001936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.2565177455544472, "epoch": 0.00514, "grad_norm": 0.02074248716235161, "kl": 0.5480043515563011, "learning_rate": 9.999894258176692e-06, "loss": -0.0393, "step": 514, "step_time": 2.037379844005045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 166.5625, "completions/mean_terminated_length": 166.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2475863080471754, "epoch": 0.00515, "frac_reward_zero_std": 0.5, "grad_norm": 0.06518706679344177, "kl": 0.478052731603384, "learning_rate": 9.999893815281578e-06, "loss": -0.0524, "num_tokens": 4985435.0, "reward": 0.8050048351287842, "reward_std": 0.050354160368442535, "rewards/rollout_reward_func/mean": 0.8050048351287842, "rewards/rollout_reward_func/std": 0.3089631497859955, "sampling/importance_sampling_ratio/max": 1.0147408246994019, "sampling/importance_sampling_ratio/mean": 0.9408366680145264, "sampling/importance_sampling_ratio/min": 0.019195713102817535, "sampling/sampling_logp_difference/max": 2.3856358528137207, "sampling/sampling_logp_difference/mean": 0.03649010509252548, "step": 515, "step_time": 4.7317986550115165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2548461128026247, "epoch": 0.00516, "grad_norm": 0.0620817095041275, "kl": 0.4817935675382614, "learning_rate": 9.999893371460891e-06, "loss": -0.0525, "step": 516, "step_time": 2.4752125149898347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2785690021701157, "epoch": 0.00517, "frac_reward_zero_std": 0.75, "grad_norm": 0.005148150958120823, "kl": 0.4529029503464699, "learning_rate": 9.99989292671463e-06, "loss": -0.017, "num_tokens": 5005579.0, "reward": 0.8513365983963013, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.8513365983963013, "rewards/rollout_reward_func/std": 0.15326203405857086, "sampling/importance_sampling_ratio/max": 1.002087950706482, "sampling/importance_sampling_ratio/mean": 0.9624387621879578, "sampling/importance_sampling_ratio/min": 0.005286486819386482, "sampling/sampling_logp_difference/max": 2.2720868587493896, "sampling/sampling_logp_difference/mean": 0.029170960187911987, "step": 517, "step_time": 4.670304547020351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.27614229172468185, "epoch": 0.00518, "grad_norm": 0.005931710358709097, "kl": 0.45352647081017494, "learning_rate": 9.999892481042796e-06, "loss": -0.017, "step": 518, "step_time": 2.054132078992552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2815898396074772, "epoch": 0.00519, "frac_reward_zero_std": 0.75, "grad_norm": 0.015972018241882324, "kl": 0.5949035063385963, "learning_rate": 9.99989203444539e-06, "loss": -0.0271, "num_tokens": 5023215.0, "reward": 0.5832740068435669, "reward_std": 0.030228814110159874, "rewards/rollout_reward_func/mean": 0.5832740068435669, "rewards/rollout_reward_func/std": 0.1685929149389267, "sampling/importance_sampling_ratio/max": 1.0063107013702393, "sampling/importance_sampling_ratio/mean": 0.9634127616882324, "sampling/importance_sampling_ratio/min": 0.016909927129745483, "sampling/sampling_logp_difference/max": 1.625520944595337, "sampling/sampling_logp_difference/mean": 0.02926328405737877, "step": 519, "step_time": 4.022057700000005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27997997030615807, "epoch": 0.0052, "grad_norm": 0.018658457323908806, "kl": 0.5843173786997795, "learning_rate": 9.99989158692241e-06, "loss": -0.0271, "step": 520, "step_time": 2.023407220010995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 147.28125, "completions/mean_terminated_length": 147.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2946733729913831, "epoch": 0.00521, "frac_reward_zero_std": 0.5, "grad_norm": 0.022611714899539948, "kl": 0.613461147993803, "learning_rate": 9.999891138473859e-06, "loss": -0.0171, "num_tokens": 5041552.0, "reward": 0.7816538214683533, "reward_std": 0.06542650610208511, "rewards/rollout_reward_func/mean": 0.7816538214683533, "rewards/rollout_reward_func/std": 0.2070728838443756, "sampling/importance_sampling_ratio/max": 1.5073485374450684, "sampling/importance_sampling_ratio/mean": 0.9601793885231018, "sampling/importance_sampling_ratio/min": 0.0001212570205098018, "sampling/sampling_logp_difference/max": 2.9084339141845703, "sampling/sampling_logp_difference/mean": 0.05489547178149223, "step": 521, "step_time": 4.888582064006187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2953263493254781, "epoch": 0.00522, "grad_norm": 0.027523187920451164, "kl": 0.6297499164938927, "learning_rate": 9.999890689099736e-06, "loss": -0.017, "step": 522, "step_time": 2.4873677780051366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 100.21875, "completions/mean_terminated_length": 100.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2699120510369539, "epoch": 0.00523, "frac_reward_zero_std": 0.75, "grad_norm": 0.14058628678321838, "kl": 0.6234389990568161, "learning_rate": 9.999890238800038e-06, "loss": -0.0299, "num_tokens": 5057783.0, "reward": 0.7288942337036133, "reward_std": 0.04876905307173729, "rewards/rollout_reward_func/mean": 0.7288942337036133, "rewards/rollout_reward_func/std": 0.24657213687896729, "sampling/importance_sampling_ratio/max": 1.8894696235656738, "sampling/importance_sampling_ratio/mean": 0.9262174367904663, "sampling/importance_sampling_ratio/min": 0.027109242975711823, "sampling/sampling_logp_difference/max": 2.5933077335357666, "sampling/sampling_logp_difference/mean": 0.06375856697559357, "step": 523, "step_time": 4.097877123000217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.04062500083819032, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04062500083819032, "entropy": 0.32901995070278645, "epoch": 0.00524, "grad_norm": 0.0184831153601408, "kl": 0.6359697505831718, "learning_rate": 9.99988978757477e-06, "loss": -0.03, "step": 524, "step_time": 2.0235411560206558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 70.65625, "completions/mean_terminated_length": 70.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7284766044467688, "epoch": 0.00525, "frac_reward_zero_std": 0.5, "grad_norm": 0.22475765645503998, "kl": 2.9484310671687126, "learning_rate": 9.99988933542393e-06, "loss": -0.0273, "num_tokens": 5072972.0, "reward": 0.7233653664588928, "reward_std": 0.08242291957139969, "rewards/rollout_reward_func/mean": 0.7233653664588928, "rewards/rollout_reward_func/std": 0.14161580801010132, "sampling/importance_sampling_ratio/max": 1.4763712882995605, "sampling/importance_sampling_ratio/mean": 0.8219119310379028, "sampling/importance_sampling_ratio/min": 0.008677753619849682, "sampling/sampling_logp_difference/max": 2.0272624492645264, "sampling/sampling_logp_difference/mean": 0.1576952338218689, "step": 525, "step_time": 4.092434461002995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.7996163927018642, "epoch": 0.00526, "grad_norm": 0.15891125798225403, "kl": 2.3938146233558655, "learning_rate": 9.999888882347517e-06, "loss": -0.0284, "step": 526, "step_time": 2.0348169669887284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 149.71875, "completions/mean_terminated_length": 149.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5755701344460249, "epoch": 0.00527, "frac_reward_zero_std": 0.25, "grad_norm": 0.03260107710957527, "kl": 0.7515982538461685, "learning_rate": 9.999888428345532e-06, "loss": -0.0255, "num_tokens": 5091355.0, "reward": 0.30165863037109375, "reward_std": 0.34485915303230286, "rewards/rollout_reward_func/mean": 0.30165863037109375, "rewards/rollout_reward_func/std": 0.7744202613830566, "sampling/importance_sampling_ratio/max": 0.9918128252029419, "sampling/importance_sampling_ratio/mean": 0.8972995281219482, "sampling/importance_sampling_ratio/min": 4.459520278032869e-05, "sampling/sampling_logp_difference/max": 2.585176944732666, "sampling/sampling_logp_difference/mean": 0.10436920076608658, "step": 527, "step_time": 4.721840806007094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5820124549791217, "epoch": 0.00528, "grad_norm": 0.0280331838876009, "kl": 0.739382803440094, "learning_rate": 9.999887973417974e-06, "loss": -0.0254, "step": 528, "step_time": 2.473984480006038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 108.75, "completions/mean_terminated_length": 108.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.373443640768528, "epoch": 0.00529, "frac_reward_zero_std": 0.5, "grad_norm": 0.4597068428993225, "kl": 0.6778057441115379, "learning_rate": 9.999887517564846e-06, "loss": -0.0202, "num_tokens": 5108411.0, "reward": 0.6626730561256409, "reward_std": 0.03811834752559662, "rewards/rollout_reward_func/mean": 0.6626730561256409, "rewards/rollout_reward_func/std": 0.2177087962627411, "sampling/importance_sampling_ratio/max": 1.6337560415267944, "sampling/importance_sampling_ratio/mean": 0.9576630592346191, "sampling/importance_sampling_ratio/min": 3.0618106393376365e-05, "sampling/sampling_logp_difference/max": 2.60197377204895, "sampling/sampling_logp_difference/mean": 0.08077450096607208, "step": 529, "step_time": 4.309633621996909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 0.3843681113794446, "epoch": 0.0053, "grad_norm": 0.1690109223127365, "kl": 0.6656837686896324, "learning_rate": 9.999887060786147e-06, "loss": -0.0217, "step": 530, "step_time": 2.0381272500162595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 123.78125, "completions/mean_terminated_length": 123.78125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.38467780500650406, "epoch": 0.00531, "frac_reward_zero_std": 0.5, "grad_norm": 0.08265794068574905, "kl": 0.6038872003555298, "learning_rate": 9.999886603081875e-06, "loss": 0.0047, "num_tokens": 5125852.0, "reward": 0.5196634531021118, "reward_std": 0.3456892967224121, "rewards/rollout_reward_func/mean": 0.5196634531021118, "rewards/rollout_reward_func/std": 0.6776248216629028, "sampling/importance_sampling_ratio/max": 1.0359854698181152, "sampling/importance_sampling_ratio/mean": 0.9351030588150024, "sampling/importance_sampling_ratio/min": 0.025395436212420464, "sampling/sampling_logp_difference/max": 1.952560544013977, "sampling/sampling_logp_difference/mean": 0.051618024706840515, "step": 531, "step_time": 4.052290355008154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39082348719239235, "epoch": 0.00532, "grad_norm": 0.09301833063364029, "kl": 0.6015421226620674, "learning_rate": 9.999886144452034e-06, "loss": 0.0047, "step": 532, "step_time": 2.052130093012238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.09195313323289156, "epoch": 0.00533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007931085419841111, "kl": 0.45695148780941963, "learning_rate": 9.999885684896619e-06, "loss": 0.0016, "num_tokens": 5144332.0, "reward": 0.9587692022323608, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9587692022323608, "rewards/rollout_reward_func/std": 0.3458428978919983, "sampling/importance_sampling_ratio/max": 0.9974175691604614, "sampling/importance_sampling_ratio/mean": 0.9887936115264893, "sampling/importance_sampling_ratio/min": 0.9781241416931152, "sampling/sampling_logp_difference/max": 0.013128435239195824, "sampling/sampling_logp_difference/mean": 0.00265504140406847, "step": 533, "step_time": 5.248836995990132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09378133714199066, "epoch": 0.00534, "grad_norm": 0.0008182890596799552, "kl": 0.4566446505486965, "learning_rate": 9.999885224415634e-06, "loss": 0.0015, "step": 534, "step_time": 2.0369577089950326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.5625, "completions/mean_terminated_length": 175.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.44044867902994156, "epoch": 0.00535, "frac_reward_zero_std": 0.5, "grad_norm": 0.029842451214790344, "kl": 0.5498414412140846, "learning_rate": 9.999884763009078e-06, "loss": -0.0448, "num_tokens": 5162814.0, "reward": 0.6884375214576721, "reward_std": 0.014318913221359253, "rewards/rollout_reward_func/mean": 0.6884375214576721, "rewards/rollout_reward_func/std": 0.3967992067337036, "sampling/importance_sampling_ratio/max": 1.0162140130996704, "sampling/importance_sampling_ratio/mean": 0.935027003288269, "sampling/importance_sampling_ratio/min": 0.00034402907476760447, "sampling/sampling_logp_difference/max": 3.0125386714935303, "sampling/sampling_logp_difference/mean": 0.05892196297645569, "step": 535, "step_time": 4.670384323988401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44020059425383806, "epoch": 0.00536, "grad_norm": 0.030861053615808487, "kl": 0.5474926196038723, "learning_rate": 9.99988430067695e-06, "loss": -0.0448, "step": 536, "step_time": 2.0372734229822527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08144369907677174, "epoch": 0.00537, "frac_reward_zero_std": 0.75, "grad_norm": 0.008907783776521683, "kl": 0.47535310685634613, "learning_rate": 9.999883837419253e-06, "loss": 0.0016, "num_tokens": 5181278.0, "reward": 0.7770384550094604, "reward_std": 0.040949735790491104, "rewards/rollout_reward_func/mean": 0.7770384550094604, "rewards/rollout_reward_func/std": 0.40727466344833374, "sampling/importance_sampling_ratio/max": 1.0012673139572144, "sampling/importance_sampling_ratio/mean": 0.9933499097824097, "sampling/importance_sampling_ratio/min": 0.9875993132591248, "sampling/sampling_logp_difference/max": 0.012510351836681366, "sampling/sampling_logp_difference/mean": 0.0022532367147505283, "step": 537, "step_time": 4.464596861987957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08276391681283712, "epoch": 0.00538, "grad_norm": 0.00969698280096054, "kl": 0.47516679763793945, "learning_rate": 9.999883373235985e-06, "loss": 0.0015, "step": 538, "step_time": 2.0479442920041038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 223.28125, "completions/mean_terminated_length": 223.28125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.34479291178286076, "epoch": 0.00539, "frac_reward_zero_std": 0.5, "grad_norm": 0.05642319098114967, "kl": 0.8109174929559231, "learning_rate": 9.999882908127145e-06, "loss": -0.0006, "num_tokens": 5201087.0, "reward": 0.917920708656311, "reward_std": 0.07179173827171326, "rewards/rollout_reward_func/mean": 0.917920708656311, "rewards/rollout_reward_func/std": 0.3310492932796478, "sampling/importance_sampling_ratio/max": 1.1234636306762695, "sampling/importance_sampling_ratio/mean": 0.943769633769989, "sampling/importance_sampling_ratio/min": 1.6025690001697512e-06, "sampling/sampling_logp_difference/max": 3.595282554626465, "sampling/sampling_logp_difference/mean": 0.09773115068674088, "step": 539, "step_time": 5.907020233004005 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.34455110505223274, "epoch": 0.0054, "grad_norm": 0.06375354528427124, "kl": 0.8079422898590565, "learning_rate": 9.999882442092736e-06, "loss": -0.0006, "step": 540, "step_time": 2.0763470249803504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 150.8125, "completions/mean_terminated_length": 150.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2855285806581378, "epoch": 0.00541, "frac_reward_zero_std": 0.75, "grad_norm": 0.009608951397240162, "kl": 0.5679389089345932, "learning_rate": 9.999881975132757e-06, "loss": -0.0168, "num_tokens": 5218913.0, "reward": 0.6030961275100708, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.6030961275100708, "rewards/rollout_reward_func/std": 0.21752722561359406, "sampling/importance_sampling_ratio/max": 1.001436710357666, "sampling/importance_sampling_ratio/mean": 0.9577078819274902, "sampling/importance_sampling_ratio/min": 0.016682345420122147, "sampling/sampling_logp_difference/max": 2.0803067684173584, "sampling/sampling_logp_difference/mean": 0.0288749560713768, "step": 541, "step_time": 4.28335845998663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28904097341001034, "epoch": 0.00542, "grad_norm": 0.009871838614344597, "kl": 0.5664203986525536, "learning_rate": 9.999881507247207e-06, "loss": -0.0168, "step": 542, "step_time": 2.0224204210026073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 210.78125, "completions/mean_terminated_length": 210.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.33909058570861816, "epoch": 0.00543, "frac_reward_zero_std": 0.5, "grad_norm": 0.014534508809447289, "kl": 0.5110445506870747, "learning_rate": 9.999881038436085e-06, "loss": -0.0544, "num_tokens": 5238922.0, "reward": 0.9705480933189392, "reward_std": 0.00911080464720726, "rewards/rollout_reward_func/mean": 0.9705480933189392, "rewards/rollout_reward_func/std": 0.3500179946422577, "sampling/importance_sampling_ratio/max": 1.0009652376174927, "sampling/importance_sampling_ratio/mean": 0.9307692646980286, "sampling/importance_sampling_ratio/min": 0.008707833476364613, "sampling/sampling_logp_difference/max": 2.1098005771636963, "sampling/sampling_logp_difference/mean": 0.04465439170598984, "step": 543, "step_time": 4.51799412600667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3425491922535002, "epoch": 0.00544, "grad_norm": 0.014656747691333294, "kl": 0.5100430585443974, "learning_rate": 9.999880568699396e-06, "loss": -0.0544, "step": 544, "step_time": 2.4950888569946983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.43788245413452387, "epoch": 0.00545, "frac_reward_zero_std": 0.5, "grad_norm": 0.012576183304190636, "kl": 0.5638104304671288, "learning_rate": 9.999880098037136e-06, "loss": -0.0345, "num_tokens": 5256596.0, "reward": 0.6759134531021118, "reward_std": 0.07322631776332855, "rewards/rollout_reward_func/mean": 0.6759134531021118, "rewards/rollout_reward_func/std": 0.17696066200733185, "sampling/importance_sampling_ratio/max": 1.0061415433883667, "sampling/importance_sampling_ratio/mean": 0.9299633502960205, "sampling/importance_sampling_ratio/min": 0.0008336707251146436, "sampling/sampling_logp_difference/max": 2.4787185192108154, "sampling/sampling_logp_difference/mean": 0.06893341988325119, "step": 545, "step_time": 4.631515301007312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44372356589883566, "epoch": 0.00546, "grad_norm": 0.01220746524631977, "kl": 0.5683990381658077, "learning_rate": 9.999879626449306e-06, "loss": -0.0344, "step": 546, "step_time": 2.0332269949867623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.19354248046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5344618670642376, "epoch": 0.00547, "frac_reward_zero_std": 0.75, "grad_norm": 2.7518718242645264, "kl": 0.462713323533535, "learning_rate": 9.999879153935907e-06, "loss": -0.0391, "num_tokens": 5273312.0, "reward": 0.48201921582221985, "reward_std": 0.04362036660313606, "rewards/rollout_reward_func/mean": 0.48201921582221985, "rewards/rollout_reward_func/std": 0.10568524897098541, "sampling/importance_sampling_ratio/max": 2.212153673171997, "sampling/importance_sampling_ratio/mean": 0.9218744039535522, "sampling/importance_sampling_ratio/min": 1.1258917131939938e-09, "sampling/sampling_logp_difference/max": 2.3640174865722656, "sampling/sampling_logp_difference/mean": 0.188588485121727, "step": 547, "step_time": 4.081422916991869 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.11875000223517418, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1312500024214387, "entropy": 0.9430006239563227, "epoch": 0.00548, "grad_norm": 0.040014609694480896, "kl": 0.46646304056048393, "learning_rate": 9.99987868049694e-06, "loss": -0.0445, "step": 548, "step_time": 2.026960319002683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06619648449122906, "epoch": 0.00549, "frac_reward_zero_std": 0.75, "grad_norm": 0.010345548391342163, "kl": 0.46130289882421494, "learning_rate": 9.9998782061324e-06, "loss": 0.0019, "num_tokens": 5293400.0, "reward": 0.8850384950637817, "reward_std": 0.040949735790491104, "rewards/rollout_reward_func/mean": 0.8850384950637817, "rewards/rollout_reward_func/std": 0.28728586435317993, "sampling/importance_sampling_ratio/max": 1.0343540906906128, "sampling/importance_sampling_ratio/mean": 0.9988170862197876, "sampling/importance_sampling_ratio/min": 0.9889905452728271, "sampling/sampling_logp_difference/max": 0.04169128090143204, "sampling/sampling_logp_difference/mean": 0.0025750650092959404, "step": 549, "step_time": 4.5490293680049945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06536354869604111, "epoch": 0.0055, "grad_norm": 0.010559245944023132, "kl": 0.46143197268247604, "learning_rate": 9.999877730842293e-06, "loss": 0.0019, "step": 550, "step_time": 2.9303536269944743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.35581435821950436, "epoch": 0.00551, "frac_reward_zero_std": 0.75, "grad_norm": 0.010389760136604309, "kl": 0.7081686556339264, "learning_rate": 9.999877254626616e-06, "loss": -0.0169, "num_tokens": 5310042.0, "reward": 0.6276923418045044, "reward_std": 0.038074981421232224, "rewards/rollout_reward_func/mean": 0.6276923418045044, "rewards/rollout_reward_func/std": 0.133396178483963, "sampling/importance_sampling_ratio/max": 0.9924627542495728, "sampling/importance_sampling_ratio/mean": 0.9482254981994629, "sampling/importance_sampling_ratio/min": 0.0035798533353954554, "sampling/sampling_logp_difference/max": 2.2552127838134766, "sampling/sampling_logp_difference/mean": 0.050594478845596313, "step": 551, "step_time": 4.030289383008494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35529105458408594, "epoch": 0.00552, "grad_norm": 0.009920688346028328, "kl": 0.7078035995364189, "learning_rate": 9.99987677748537e-06, "loss": -0.0169, "step": 552, "step_time": 2.039799989986932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2026800410822034, "epoch": 0.00553, "frac_reward_zero_std": 0.75, "grad_norm": 0.018710903823375702, "kl": 0.4631485752761364, "learning_rate": 9.999876299418556e-06, "loss": -0.0167, "num_tokens": 5329042.0, "reward": 0.672374963760376, "reward_std": 0.0088388342410326, "rewards/rollout_reward_func/mean": 0.672374963760376, "rewards/rollout_reward_func/std": 0.3484468162059784, "sampling/importance_sampling_ratio/max": 1.0036646127700806, "sampling/importance_sampling_ratio/mean": 0.9582653641700745, "sampling/importance_sampling_ratio/min": 0.019437124952673912, "sampling/sampling_logp_difference/max": 2.021327018737793, "sampling/sampling_logp_difference/mean": 0.02670004777610302, "step": 553, "step_time": 4.333562690007966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20236433297395706, "epoch": 0.00554, "grad_norm": 0.01861949823796749, "kl": 0.46309028565883636, "learning_rate": 9.999875820426172e-06, "loss": -0.0167, "step": 554, "step_time": 2.0354740309994668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 121.96875, "completions/mean_terminated_length": 121.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6867004996165633, "epoch": 0.00555, "frac_reward_zero_std": 0.5, "grad_norm": 0.016505293548107147, "kl": 0.5972956195473671, "learning_rate": 9.999875340508221e-06, "loss": -0.0557, "num_tokens": 5346057.0, "reward": 0.743418276309967, "reward_std": 0.031996577978134155, "rewards/rollout_reward_func/mean": 0.743418276309967, "rewards/rollout_reward_func/std": 0.2335359901189804, "sampling/importance_sampling_ratio/max": 0.9994720220565796, "sampling/importance_sampling_ratio/mean": 0.9043092131614685, "sampling/importance_sampling_ratio/min": 7.007899694144726e-05, "sampling/sampling_logp_difference/max": 2.8204548358917236, "sampling/sampling_logp_difference/mean": 0.13257735967636108, "step": 555, "step_time": 4.382848415989429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6857032533735037, "epoch": 0.00556, "grad_norm": 0.01591799594461918, "kl": 0.6003667712211609, "learning_rate": 9.999874859664698e-06, "loss": -0.0557, "step": 556, "step_time": 2.964035745979345 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.7222844157367945, "epoch": 0.00557, "frac_reward_zero_std": 0.5, "grad_norm": 0.27279233932495117, "kl": 0.9305752590298653, "learning_rate": 9.99987437789561e-06, "loss": 0.0573, "num_tokens": 5362363.0, "reward": 0.6633893847465515, "reward_std": 0.2936166226863861, "rewards/rollout_reward_func/mean": 0.6633893847465515, "rewards/rollout_reward_func/std": 0.5676219463348389, "sampling/importance_sampling_ratio/max": 0.9821059703826904, "sampling/importance_sampling_ratio/mean": 0.765741229057312, "sampling/importance_sampling_ratio/min": 0.00010689114424167201, "sampling/sampling_logp_difference/max": 3.459178924560547, "sampling/sampling_logp_difference/mean": 0.2168469876050949, "step": 557, "step_time": 4.112037614992005 }, { "clip_ratio/high_max": 0.11250000167638063, "clip_ratio/high_mean": 0.05625000083819032, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05625000083819032, "entropy": 0.6483716666698456, "epoch": 0.00558, "grad_norm": 0.1727023720741272, "kl": 0.8020364865660667, "learning_rate": 9.999873895200953e-06, "loss": 0.0568, "step": 558, "step_time": 2.032808528005262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.2746572308242321, "epoch": 0.00559, "frac_reward_zero_std": 0.5, "grad_norm": 3.068614959716797, "kl": 0.6331011205911636, "learning_rate": 9.999873411580727e-06, "loss": -0.0455, "num_tokens": 5382895.0, "reward": 0.4883846044540405, "reward_std": 0.07614996284246445, "rewards/rollout_reward_func/mean": 0.4883846044540405, "rewards/rollout_reward_func/std": 0.18020637333393097, "sampling/importance_sampling_ratio/max": 1.5649570226669312, "sampling/importance_sampling_ratio/mean": 0.9313215017318726, "sampling/importance_sampling_ratio/min": 0.00557528343051672, "sampling/sampling_logp_difference/max": 2.2637643814086914, "sampling/sampling_logp_difference/mean": 0.038875192403793335, "step": 559, "step_time": 4.599886531002994 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.026785715483129025, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03199404897168279, "entropy": 0.368420859798789, "epoch": 0.0056, "grad_norm": 0.06925599277019501, "kl": 0.6589637398719788, "learning_rate": 9.999872927034932e-06, "loss": -0.0475, "step": 560, "step_time": 2.0561092469943105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 172.34375, "completions/mean_terminated_length": 172.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.1098433621227741, "epoch": 0.00561, "frac_reward_zero_std": 0.75, "grad_norm": 0.05534413456916809, "kl": 0.6625386737287045, "learning_rate": 9.99987244156357e-06, "loss": -0.0302, "num_tokens": 5401754.0, "reward": 0.8144903779029846, "reward_std": 0.05781066417694092, "rewards/rollout_reward_func/mean": 0.8144903779029846, "rewards/rollout_reward_func/std": 0.4052894413471222, "sampling/importance_sampling_ratio/max": 0.9985430240631104, "sampling/importance_sampling_ratio/mean": 0.887540340423584, "sampling/importance_sampling_ratio/min": 3.3029609718095564e-17, "sampling/sampling_logp_difference/max": 3.971953868865967, "sampling/sampling_logp_difference/mean": 0.3702417314052582, "step": 561, "step_time": 4.446160264007631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.092094810679555, "epoch": 0.00562, "grad_norm": 0.047613613307476044, "kl": 0.6189615875482559, "learning_rate": 9.999871955166642e-06, "loss": -0.0303, "step": 562, "step_time": 3.00144910900417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22382140066474676, "epoch": 0.00563, "frac_reward_zero_std": 0.5, "grad_norm": 0.03972702473402023, "kl": 0.5183525867760181, "learning_rate": 9.999871467844145e-06, "loss": -0.0261, "num_tokens": 5419750.0, "reward": 0.6733654141426086, "reward_std": 0.09215877205133438, "rewards/rollout_reward_func/mean": 0.6733654141426086, "rewards/rollout_reward_func/std": 0.2774355411529541, "sampling/importance_sampling_ratio/max": 0.9959811568260193, "sampling/importance_sampling_ratio/mean": 0.9531292915344238, "sampling/importance_sampling_ratio/min": 0.01174512691795826, "sampling/sampling_logp_difference/max": 2.0674147605895996, "sampling/sampling_logp_difference/mean": 0.026546554639935493, "step": 563, "step_time": 4.2842176249978365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22329418547451496, "epoch": 0.00564, "grad_norm": 0.05303080007433891, "kl": 0.5161363631486893, "learning_rate": 9.999870979596079e-06, "loss": -0.0261, "step": 564, "step_time": 2.0244269149916363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10115552507340908, "epoch": 0.00565, "frac_reward_zero_std": 0.75, "grad_norm": 0.026786062866449356, "kl": 0.549852579832077, "learning_rate": 9.999870490422448e-06, "loss": 0.0018, "num_tokens": 5437686.0, "reward": 0.5749038457870483, "reward_std": 0.04578319191932678, "rewards/rollout_reward_func/mean": 0.5749038457870483, "rewards/rollout_reward_func/std": 0.22259823977947235, "sampling/importance_sampling_ratio/max": 0.9992284178733826, "sampling/importance_sampling_ratio/mean": 0.9856471419334412, "sampling/importance_sampling_ratio/min": 0.963807225227356, "sampling/sampling_logp_difference/max": 0.03209029138088226, "sampling/sampling_logp_difference/mean": 0.0034846358466893435, "step": 565, "step_time": 4.259486251998169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10197913832962513, "epoch": 0.00566, "grad_norm": 0.04229649528861046, "kl": 0.5496331825852394, "learning_rate": 9.999870000323247e-06, "loss": 0.0017, "step": 566, "step_time": 2.0220089750073384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 164.15625, "completions/mean_terminated_length": 164.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20660804584622383, "epoch": 0.00567, "frac_reward_zero_std": 0.75, "grad_norm": 0.013888769783079624, "kl": 0.4977695159614086, "learning_rate": 9.99986950929848e-06, "loss": -0.0266, "num_tokens": 5456203.0, "reward": 0.9213653802871704, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.9213653802871704, "rewards/rollout_reward_func/std": 0.329580157995224, "sampling/importance_sampling_ratio/max": 1.0530731678009033, "sampling/importance_sampling_ratio/mean": 0.9685459733009338, "sampling/importance_sampling_ratio/min": 0.023879049345850945, "sampling/sampling_logp_difference/max": 1.8830746412277222, "sampling/sampling_logp_difference/mean": 0.02668062038719654, "step": 567, "step_time": 4.806291009015695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2093363841995597, "epoch": 0.00568, "grad_norm": 0.016488978639245033, "kl": 0.49863768368959427, "learning_rate": 9.999869017348145e-06, "loss": -0.0266, "step": 568, "step_time": 2.5329311530003906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.16271663829684258, "epoch": 0.00569, "frac_reward_zero_std": 0.75, "grad_norm": 0.05727388337254524, "kl": 0.5322196818888187, "learning_rate": 9.999868524472245e-06, "loss": -0.0249, "num_tokens": 5475597.0, "reward": 0.7042836546897888, "reward_std": 0.014318910427391529, "rewards/rollout_reward_func/mean": 0.7042836546897888, "rewards/rollout_reward_func/std": 0.16512222588062286, "sampling/importance_sampling_ratio/max": 1.0716825723648071, "sampling/importance_sampling_ratio/mean": 0.9788642525672913, "sampling/importance_sampling_ratio/min": 0.5020595192909241, "sampling/sampling_logp_difference/max": 0.684132993221283, "sampling/sampling_logp_difference/mean": 0.008913267403841019, "step": 569, "step_time": 4.372299043992825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.1544804759323597, "epoch": 0.0057, "grad_norm": 0.07175111770629883, "kl": 0.5131861194968224, "learning_rate": 9.999868030670776e-06, "loss": -0.025, "step": 570, "step_time": 2.0545507700007875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6173192234709859, "epoch": 0.00571, "frac_reward_zero_std": 0.5, "grad_norm": 0.01682165265083313, "kl": 0.5431640669703484, "learning_rate": 9.99986753594374e-06, "loss": -0.0553, "num_tokens": 5493869.0, "reward": 0.7740947008132935, "reward_std": 0.1845589578151703, "rewards/rollout_reward_func/mean": 0.7740947008132935, "rewards/rollout_reward_func/std": 0.3358272910118103, "sampling/importance_sampling_ratio/max": 0.996526300907135, "sampling/importance_sampling_ratio/mean": 0.9251461625099182, "sampling/importance_sampling_ratio/min": 1.5044433210796743e-18, "sampling/sampling_logp_difference/max": 3.3011386394500732, "sampling/sampling_logp_difference/mean": 0.21536917984485626, "step": 571, "step_time": 4.733107366992044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6147173941135406, "epoch": 0.00572, "grad_norm": 0.01938587799668312, "kl": 0.5366679430007935, "learning_rate": 9.99986704029114e-06, "loss": -0.0553, "step": 572, "step_time": 2.09590702400601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6748920669779181, "epoch": 0.00573, "frac_reward_zero_std": 0.25, "grad_norm": 0.1763761341571808, "kl": 0.5913609489798546, "learning_rate": 9.99986654371297e-06, "loss": 0.0, "num_tokens": 5512489.0, "reward": 0.6061586141586304, "reward_std": 0.0692128911614418, "rewards/rollout_reward_func/mean": 0.6061586141586304, "rewards/rollout_reward_func/std": 0.30389270186424255, "sampling/importance_sampling_ratio/max": 1.007163405418396, "sampling/importance_sampling_ratio/mean": 0.927140474319458, "sampling/importance_sampling_ratio/min": 3.2719925944016404e-13, "sampling/sampling_logp_difference/max": 3.5894863605499268, "sampling/sampling_logp_difference/mean": 0.18477454781532288, "step": 573, "step_time": 5.485547922995465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6762363156303763, "epoch": 0.00574, "grad_norm": 0.2917422950267792, "kl": 0.5850836299359798, "learning_rate": 9.999866046209236e-06, "loss": -0.0004, "step": 574, "step_time": 2.0493869019992417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9186871200799942, "epoch": 0.00575, "frac_reward_zero_std": 0.0, "grad_norm": 0.05549677461385727, "kl": 0.7959640622138977, "learning_rate": 9.999865547779934e-06, "loss": -0.1056, "num_tokens": 5531137.0, "reward": 0.8572403788566589, "reward_std": 0.14843758940696716, "rewards/rollout_reward_func/mean": 0.8572403788566589, "rewards/rollout_reward_func/std": 0.280100554227829, "sampling/importance_sampling_ratio/max": 1.0090402364730835, "sampling/importance_sampling_ratio/mean": 0.7568415403366089, "sampling/importance_sampling_ratio/min": 5.78055123151597e-27, "sampling/sampling_logp_difference/max": 3.7631258964538574, "sampling/sampling_logp_difference/mean": 0.5647425055503845, "step": 575, "step_time": 4.677513465001539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9090290535241365, "epoch": 0.00576, "grad_norm": 0.07216497510671616, "kl": 0.7841255068778992, "learning_rate": 9.999865048425068e-06, "loss": -0.1054, "step": 576, "step_time": 2.061460460005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 138.0625, "completions/mean_terminated_length": 136.22579956054688, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.31442582607269287, "epoch": 0.00577, "frac_reward_zero_std": 0.5, "grad_norm": 0.42015475034713745, "kl": 0.4903907850384712, "learning_rate": 9.999864548144636e-06, "loss": -0.0333, "num_tokens": 5548067.0, "reward": 0.5561827421188354, "reward_std": 0.32140785455703735, "rewards/rollout_reward_func/mean": 0.5561827421188354, "rewards/rollout_reward_func/std": 0.5705376863479614, "sampling/importance_sampling_ratio/max": 2.0089876651763916, "sampling/importance_sampling_ratio/mean": 0.9944146275520325, "sampling/importance_sampling_ratio/min": 0.04385393112897873, "sampling/sampling_logp_difference/max": 1.4557836055755615, "sampling/sampling_logp_difference/mean": 0.07194922864437103, "step": 577, "step_time": 4.1663041499996325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33245788142085075, "epoch": 0.00578, "grad_norm": 0.19548331201076508, "kl": 0.4897814057767391, "learning_rate": 9.999864046938636e-06, "loss": -0.034, "step": 578, "step_time": 2.037502242004848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8576631601899862, "epoch": 0.00579, "frac_reward_zero_std": 0.5, "grad_norm": 0.4509117603302002, "kl": 0.5246343612670898, "learning_rate": 9.999863544807073e-06, "loss": 0.0431, "num_tokens": 5564265.0, "reward": 0.8816932439804077, "reward_std": 0.029672017320990562, "rewards/rollout_reward_func/mean": 0.8816932439804077, "rewards/rollout_reward_func/std": 0.18942789733409882, "sampling/importance_sampling_ratio/max": 1.1407220363616943, "sampling/importance_sampling_ratio/mean": 0.8898916840553284, "sampling/importance_sampling_ratio/min": 0.0005820344667881727, "sampling/sampling_logp_difference/max": 2.6946897506713867, "sampling/sampling_logp_difference/mean": 0.15078598260879517, "step": 579, "step_time": 5.218784433003748 }, { "clip_ratio/high_max": 0.0546875, "clip_ratio/high_mean": 0.02734375, "clip_ratio/low_mean": 0.05208333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07942708488553762, "entropy": 0.8364594131708145, "epoch": 0.0058, "grad_norm": 0.13285376131534576, "kl": 0.4950512908399105, "learning_rate": 9.999863041749942e-06, "loss": 0.0418, "step": 580, "step_time": 2.065672505988914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 178.53125, "completions/mean_terminated_length": 178.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.16316974023357034, "epoch": 0.00581, "frac_reward_zero_std": 0.5, "grad_norm": 0.8627434372901917, "kl": 0.5527270883321762, "learning_rate": 9.999862537767247e-06, "loss": -0.0339, "num_tokens": 5582890.0, "reward": 0.9388749599456787, "reward_std": 0.05906832963228226, "rewards/rollout_reward_func/mean": 0.9388749599456787, "rewards/rollout_reward_func/std": 0.2518100142478943, "sampling/importance_sampling_ratio/max": 1.2754160165786743, "sampling/importance_sampling_ratio/mean": 1.0151711702346802, "sampling/importance_sampling_ratio/min": 0.06738310307264328, "sampling/sampling_logp_difference/max": 2.509119749069214, "sampling/sampling_logp_difference/mean": 0.02618386223912239, "step": 581, "step_time": 4.2679351170081645 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.16662046127021313, "epoch": 0.00582, "grad_norm": 0.2603260278701782, "kl": 0.5385167934000492, "learning_rate": 9.999862032858985e-06, "loss": -0.0367, "step": 582, "step_time": 2.014352856007463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 98.5, "completions/mean_terminated_length": 98.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2026505209505558, "epoch": 0.00583, "frac_reward_zero_std": 0.75, "grad_norm": 0.033519309014081955, "kl": 0.54886344820261, "learning_rate": 9.999861527025157e-06, "loss": -0.0158, "num_tokens": 5598954.0, "reward": 0.6502403616905212, "reward_std": 0.026516500860452652, "rewards/rollout_reward_func/mean": 0.6502403616905212, "rewards/rollout_reward_func/std": 0.18044275045394897, "sampling/importance_sampling_ratio/max": 0.9938141703605652, "sampling/importance_sampling_ratio/mean": 0.9577100276947021, "sampling/importance_sampling_ratio/min": 0.10023245960474014, "sampling/sampling_logp_difference/max": 2.164916515350342, "sampling/sampling_logp_difference/mean": 0.021859534084796906, "step": 583, "step_time": 3.726978538019466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20704209059476852, "epoch": 0.00584, "grad_norm": 0.029176654294133186, "kl": 0.5560449101030827, "learning_rate": 9.999861020265767e-06, "loss": -0.0159, "step": 584, "step_time": 2.418702705996111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.1896136119030416, "epoch": 0.00585, "frac_reward_zero_std": 1.0, "grad_norm": 0.010084322653710842, "kl": 0.46885228902101517, "learning_rate": 9.999860512580808e-06, "loss": 0.0019, "num_tokens": 5618882.0, "reward": 0.8916538953781128, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8916538953781128, "rewards/rollout_reward_func/std": 0.18061095476150513, "sampling/importance_sampling_ratio/max": 1.0037418603897095, "sampling/importance_sampling_ratio/mean": 0.963196873664856, "sampling/importance_sampling_ratio/min": 0.008610271848738194, "sampling/sampling_logp_difference/max": 2.070098638534546, "sampling/sampling_logp_difference/mean": 0.02311640791594982, "step": 585, "step_time": 5.053659022996726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1853465368039906, "epoch": 0.00586, "grad_norm": 0.010677773505449295, "kl": 0.47354133799672127, "learning_rate": 9.999860003970287e-06, "loss": 0.0019, "step": 586, "step_time": 2.0238219989914796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.35654676146805286, "epoch": 0.00587, "frac_reward_zero_std": 0.75, "grad_norm": 0.013650248758494854, "kl": 0.45695506036281586, "learning_rate": 9.9998594944342e-06, "loss": -0.0268, "num_tokens": 5637876.0, "reward": 0.5789312720298767, "reward_std": 0.06285499036312103, "rewards/rollout_reward_func/mean": 0.5789312720298767, "rewards/rollout_reward_func/std": 0.37738198041915894, "sampling/importance_sampling_ratio/max": 1.0125149488449097, "sampling/importance_sampling_ratio/mean": 0.9621522426605225, "sampling/importance_sampling_ratio/min": 8.666926078149118e-06, "sampling/sampling_logp_difference/max": 2.3736348152160645, "sampling/sampling_logp_difference/mean": 0.06037468463182449, "step": 587, "step_time": 4.557439230011369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3517012121155858, "epoch": 0.00588, "grad_norm": 0.011376464739441872, "kl": 0.4583461545407772, "learning_rate": 9.99985898397255e-06, "loss": -0.0268, "step": 588, "step_time": 2.0566261720086914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.08540689386427402, "epoch": 0.00589, "frac_reward_zero_std": 0.75, "grad_norm": 0.3075904846191406, "kl": 0.5036493092775345, "learning_rate": 9.999858472585334e-06, "loss": 0.0019, "num_tokens": 5657020.0, "reward": 0.2937692403793335, "reward_std": 0.18605493009090424, "rewards/rollout_reward_func/mean": 0.2937692403793335, "rewards/rollout_reward_func/std": 0.5160201787948608, "sampling/importance_sampling_ratio/max": 1.0003478527069092, "sampling/importance_sampling_ratio/mean": 0.9866231679916382, "sampling/importance_sampling_ratio/min": 0.9615564346313477, "sampling/sampling_logp_difference/max": 0.015984199941158295, "sampling/sampling_logp_difference/mean": 0.0025057175662368536, "step": 589, "step_time": 4.558817552002438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08648429019376636, "epoch": 0.0059, "grad_norm": 0.3594357669353485, "kl": 0.5035416074097157, "learning_rate": 9.999857960272553e-06, "loss": 0.0018, "step": 590, "step_time": 2.947005011010333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3878110246732831, "epoch": 0.00591, "frac_reward_zero_std": 0.25, "grad_norm": 0.5224819779396057, "kl": 0.6424956731498241, "learning_rate": 9.99985744703421e-06, "loss": -0.0023, "num_tokens": 5675298.0, "reward": 0.2792932689189911, "reward_std": 0.4259030520915985, "rewards/rollout_reward_func/mean": 0.2792932689189911, "rewards/rollout_reward_func/std": 0.7955617904663086, "sampling/importance_sampling_ratio/max": 1.0063856840133667, "sampling/importance_sampling_ratio/mean": 0.9177218675613403, "sampling/importance_sampling_ratio/min": 0.003274014685302973, "sampling/sampling_logp_difference/max": 2.2458362579345703, "sampling/sampling_logp_difference/mean": 0.05326750874519348, "step": 591, "step_time": 4.627018920014962 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017708333674818277, "entropy": 0.3916383869946003, "epoch": 0.00592, "grad_norm": 0.6240778565406799, "kl": 0.6316216699779034, "learning_rate": 9.9998569328703e-06, "loss": -0.0045, "step": 592, "step_time": 2.0464710829983233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3097229404374957, "epoch": 0.00593, "frac_reward_zero_std": 0.75, "grad_norm": 0.01591629534959793, "kl": 0.4708476960659027, "learning_rate": 9.99985641778083e-06, "loss": -0.0174, "num_tokens": 5692666.0, "reward": 0.5175961256027222, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.5175961256027222, "rewards/rollout_reward_func/std": 0.2490103840827942, "sampling/importance_sampling_ratio/max": 1.0004478693008423, "sampling/importance_sampling_ratio/mean": 0.955543577671051, "sampling/importance_sampling_ratio/min": 0.004055165220052004, "sampling/sampling_logp_difference/max": 1.884890079498291, "sampling/sampling_logp_difference/mean": 0.03623073920607567, "step": 593, "step_time": 4.08578766501887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3143031829968095, "epoch": 0.00594, "grad_norm": 0.015563342720270157, "kl": 0.46986745670437813, "learning_rate": 9.999855901765791e-06, "loss": -0.0173, "step": 594, "step_time": 2.007769087002089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 187.46875, "completions/mean_terminated_length": 187.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5948136942461133, "epoch": 0.00595, "frac_reward_zero_std": 0.5, "grad_norm": 0.09121229499578476, "kl": 0.8592243567109108, "learning_rate": 9.99985538482519e-06, "loss": -0.0589, "num_tokens": 5711609.0, "reward": 0.6717836260795593, "reward_std": 0.2623599171638489, "rewards/rollout_reward_func/mean": 0.6717836260795593, "rewards/rollout_reward_func/std": 0.4471849501132965, "sampling/importance_sampling_ratio/max": 1.3950891494750977, "sampling/importance_sampling_ratio/mean": 0.9237210154533386, "sampling/importance_sampling_ratio/min": 1.624558032043734e-14, "sampling/sampling_logp_difference/max": 3.8006467819213867, "sampling/sampling_logp_difference/mean": 0.17565229535102844, "step": 595, "step_time": 4.3657486390002305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.597375700250268, "epoch": 0.00596, "grad_norm": 0.09652739018201828, "kl": 0.8990072421729565, "learning_rate": 9.999854866959026e-06, "loss": -0.0588, "step": 596, "step_time": 2.907260732004943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1895548179745674, "epoch": 0.00597, "frac_reward_zero_std": 0.5, "grad_norm": 0.917389452457428, "kl": 0.49392109364271164, "learning_rate": 9.999854348167299e-06, "loss": -0.06, "num_tokens": 5731241.0, "reward": 0.6505336761474609, "reward_std": 0.10389473289251328, "rewards/rollout_reward_func/mean": 0.6505336761474609, "rewards/rollout_reward_func/std": 0.3792053759098053, "sampling/importance_sampling_ratio/max": 0.9977957010269165, "sampling/importance_sampling_ratio/mean": 0.8142368793487549, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7231963872909546, "sampling/sampling_logp_difference/mean": 0.04102646932005882, "step": 597, "step_time": 4.696117517996754 }, { "clip_ratio/high_max": 0.046130954287946224, "clip_ratio/high_mean": 0.023065477143973112, "clip_ratio/low_mean": 0.028273810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0513392873108387, "entropy": 0.2162509048357606, "epoch": 0.00598, "grad_norm": 0.03160576522350311, "kl": 0.48236118629574776, "learning_rate": 9.999853828450009e-06, "loss": -0.0613, "step": 598, "step_time": 2.0428782750095706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 53.1875, "completions/mean_terminated_length": 53.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8382059279829264, "epoch": 0.00599, "frac_reward_zero_std": 0.5, "grad_norm": 0.01581570692360401, "kl": 0.7422221973538399, "learning_rate": 9.999853307807155e-06, "loss": -0.0269, "num_tokens": 5745791.0, "reward": 0.7420192360877991, "reward_std": 0.035627301782369614, "rewards/rollout_reward_func/mean": 0.7420192360877991, "rewards/rollout_reward_func/std": 0.17344777286052704, "sampling/importance_sampling_ratio/max": 1.0046192407608032, "sampling/importance_sampling_ratio/mean": 0.9019453525543213, "sampling/importance_sampling_ratio/min": 1.3872697309125215e-05, "sampling/sampling_logp_difference/max": 2.959707498550415, "sampling/sampling_logp_difference/mean": 0.20251873135566711, "step": 599, "step_time": 3.7851065109935007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8346075415611267, "epoch": 0.006, "grad_norm": 0.015274576842784882, "kl": 0.742184117436409, "learning_rate": 9.999852786238737e-06, "loss": -0.0269, "step": 600, "step_time": 1.9880009850021452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.056643462274223566, "epoch": 0.00601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005858411896042526, "kl": 0.41675687581300735, "learning_rate": 9.999852263744758e-06, "loss": 0.0015, "num_tokens": 5764823.0, "reward": 1.0340769290924072, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0340769290924072, "rewards/rollout_reward_func/std": 0.3529582917690277, "sampling/importance_sampling_ratio/max": 1.0056337118148804, "sampling/importance_sampling_ratio/mean": 0.997256875038147, "sampling/importance_sampling_ratio/min": 0.9912053942680359, "sampling/sampling_logp_difference/max": 0.007689394056797028, "sampling/sampling_logp_difference/mean": 0.0017371228896081448, "step": 601, "step_time": 4.224608546006493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05650326982140541, "epoch": 0.00602, "grad_norm": 0.0005881523247808218, "kl": 0.41675012931227684, "learning_rate": 9.999851740325214e-06, "loss": 0.0015, "step": 602, "step_time": 2.5181287759987754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.26283385418355465, "epoch": 0.00603, "frac_reward_zero_std": 0.75, "grad_norm": 0.3552459478378296, "kl": 0.4526195526123047, "learning_rate": 9.999851215980108e-06, "loss": -0.025, "num_tokens": 5784491.0, "reward": 0.702423095703125, "reward_std": 0.017804233357310295, "rewards/rollout_reward_func/mean": 0.702423095703125, "rewards/rollout_reward_func/std": 0.447466105222702, "sampling/importance_sampling_ratio/max": 1.0144517421722412, "sampling/importance_sampling_ratio/mean": 0.8999935388565063, "sampling/importance_sampling_ratio/min": 0.015637550503015518, "sampling/sampling_logp_difference/max": 1.94087553024292, "sampling/sampling_logp_difference/mean": 0.04116165265440941, "step": 603, "step_time": 4.600066073006019 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.015625000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000465661287, "entropy": 0.2464360548183322, "epoch": 0.00604, "grad_norm": 0.05708470568060875, "kl": 0.4608061797916889, "learning_rate": 9.99985069070944e-06, "loss": -0.0254, "step": 604, "step_time": 2.032470333004312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20198502950370312, "epoch": 0.00605, "frac_reward_zero_std": 0.5, "grad_norm": 0.09664832055568695, "kl": 0.5780481547117233, "learning_rate": 9.999850164513208e-06, "loss": -0.0284, "num_tokens": 5802623.0, "reward": 0.7745672464370728, "reward_std": 0.05761364474892616, "rewards/rollout_reward_func/mean": 0.7745672464370728, "rewards/rollout_reward_func/std": 0.3227130174636841, "sampling/importance_sampling_ratio/max": 1.0571664571762085, "sampling/importance_sampling_ratio/mean": 0.9722798466682434, "sampling/importance_sampling_ratio/min": 0.009371067397296429, "sampling/sampling_logp_difference/max": 2.133582592010498, "sampling/sampling_logp_difference/mean": 0.027811698615550995, "step": 605, "step_time": 4.262806942992029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.20571562740951777, "epoch": 0.00606, "grad_norm": 0.11231128126382828, "kl": 0.5795321427285671, "learning_rate": 9.999849637391415e-06, "loss": -0.0282, "step": 606, "step_time": 2.0170472360114218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 149.96875, "completions/mean_terminated_length": 149.96875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5333646340295672, "epoch": 0.00607, "frac_reward_zero_std": 0.5, "grad_norm": 0.024302510544657707, "kl": 0.6229367628693581, "learning_rate": 9.99984910934406e-06, "loss": -0.0359, "num_tokens": 5819870.0, "reward": 0.9006441831588745, "reward_std": 0.07139059156179428, "rewards/rollout_reward_func/mean": 0.9006441831588745, "rewards/rollout_reward_func/std": 0.2868698537349701, "sampling/importance_sampling_ratio/max": 0.9947630763053894, "sampling/importance_sampling_ratio/mean": 0.9235025644302368, "sampling/importance_sampling_ratio/min": 0.00024276197655126452, "sampling/sampling_logp_difference/max": 2.1357693672180176, "sampling/sampling_logp_difference/mean": 0.08211413025856018, "step": 607, "step_time": 4.508626047005237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5316923893988132, "epoch": 0.00608, "grad_norm": 0.02394806034862995, "kl": 0.6204392127692699, "learning_rate": 9.999848580371143e-06, "loss": -0.0359, "step": 608, "step_time": 2.5063179819917423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.09713264089077711, "epoch": 0.00609, "frac_reward_zero_std": 0.75, "grad_norm": 0.01760881207883358, "kl": 0.509792897850275, "learning_rate": 9.999848050472662e-06, "loss": 0.0011, "num_tokens": 5837110.0, "reward": 0.6764423251152039, "reward_std": 0.04578319564461708, "rewards/rollout_reward_func/mean": 0.6764423251152039, "rewards/rollout_reward_func/std": 0.3314490020275116, "sampling/importance_sampling_ratio/max": 1.0115318298339844, "sampling/importance_sampling_ratio/mean": 0.9946833848953247, "sampling/importance_sampling_ratio/min": 0.9667032361030579, "sampling/sampling_logp_difference/max": 0.023550543934106827, "sampling/sampling_logp_difference/mean": 0.0030600226018577814, "step": 609, "step_time": 4.08831884299434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0976322004571557, "epoch": 0.0061, "grad_norm": 0.025274179875850677, "kl": 0.5095813684165478, "learning_rate": 9.99984751964862e-06, "loss": 0.0012, "step": 610, "step_time": 2.015351030000602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 79.5, "completions/mean_terminated_length": 79.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6195004023611546, "epoch": 0.00611, "frac_reward_zero_std": 0.75, "grad_norm": 0.01837080717086792, "kl": 0.9322483167052269, "learning_rate": 9.999846987899019e-06, "loss": -0.0096, "num_tokens": 5852190.0, "reward": 0.5860576629638672, "reward_std": 0.029376976191997528, "rewards/rollout_reward_func/mean": 0.5860576629638672, "rewards/rollout_reward_func/std": 0.16335895657539368, "sampling/importance_sampling_ratio/max": 1.0053095817565918, "sampling/importance_sampling_ratio/mean": 0.9246397018432617, "sampling/importance_sampling_ratio/min": 0.015208233147859573, "sampling/sampling_logp_difference/max": 1.934950828552246, "sampling/sampling_logp_difference/mean": 0.08944666385650635, "step": 611, "step_time": 3.735767090009176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6229163352400064, "epoch": 0.00612, "grad_norm": 0.017311938107013702, "kl": 0.91647569835186, "learning_rate": 9.999846455223852e-06, "loss": -0.0096, "step": 612, "step_time": 1.9905616420073784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 170.21875, "completions/mean_terminated_length": 170.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4543438167311251, "epoch": 0.00613, "frac_reward_zero_std": 0.75, "grad_norm": 0.009422477334737778, "kl": 0.4433828517794609, "learning_rate": 9.999845921623126e-06, "loss": -0.027, "num_tokens": 5870581.0, "reward": 0.9706202149391174, "reward_std": 0.013802183791995049, "rewards/rollout_reward_func/mean": 0.9706202149391174, "rewards/rollout_reward_func/std": 0.149695485830307, "sampling/importance_sampling_ratio/max": 1.0013611316680908, "sampling/importance_sampling_ratio/mean": 0.9656591415405273, "sampling/importance_sampling_ratio/min": 1.3669082074256355e-19, "sampling/sampling_logp_difference/max": 3.0681650638580322, "sampling/sampling_logp_difference/mean": 0.22761841118335724, "step": 613, "step_time": 4.847307729985914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45524171367287636, "epoch": 0.00614, "grad_norm": 0.009490645490586758, "kl": 0.44412341341376305, "learning_rate": 9.999845387096839e-06, "loss": -0.027, "step": 614, "step_time": 2.0354056840005796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.09740833565592766, "epoch": 0.00615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023073749616742134, "kl": 0.40419643744826317, "learning_rate": 9.99984485164499e-06, "loss": 0.0018, "num_tokens": 5890213.0, "reward": 1.024384617805481, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.024384617805481, "rewards/rollout_reward_func/std": 0.25060388445854187, "sampling/importance_sampling_ratio/max": 2.3286831378936768, "sampling/importance_sampling_ratio/mean": 1.0095577239990234, "sampling/importance_sampling_ratio/min": 0.380736380815506, "sampling/sampling_logp_difference/max": 0.9773988723754883, "sampling/sampling_logp_difference/mean": 0.016501877456903458, "step": 615, "step_time": 4.811261255999852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09369855653494596, "epoch": 0.00616, "grad_norm": 0.0023758155293762684, "kl": 0.4053357504308224, "learning_rate": 9.99984431526758e-06, "loss": 0.0018, "step": 616, "step_time": 2.042099705984583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7437649797648191, "epoch": 0.00617, "frac_reward_zero_std": 0.5, "grad_norm": 0.03997695446014404, "kl": 0.6033024415373802, "learning_rate": 9.99984377796461e-06, "loss": -0.0533, "num_tokens": 5908647.0, "reward": 0.54942786693573, "reward_std": 0.0585130900144577, "rewards/rollout_reward_func/mean": 0.54942786693573, "rewards/rollout_reward_func/std": 0.16699153184890747, "sampling/importance_sampling_ratio/max": 1.0096994638442993, "sampling/importance_sampling_ratio/mean": 0.9061462879180908, "sampling/importance_sampling_ratio/min": 5.155259899525255e-13, "sampling/sampling_logp_difference/max": 3.7633771896362305, "sampling/sampling_logp_difference/mean": 0.21079857647418976, "step": 617, "step_time": 4.542179428994132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7441392336040735, "epoch": 0.00618, "grad_norm": 0.03563015162944794, "kl": 0.6143784075975418, "learning_rate": 9.999843239736079e-06, "loss": -0.0533, "step": 618, "step_time": 2.0836674889942515 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 141.3125, "completions/mean_terminated_length": 141.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.27209753450006247, "epoch": 0.00619, "frac_reward_zero_std": 0.75, "grad_norm": 0.15146292746067047, "kl": 0.5507066249847412, "learning_rate": 9.999842700581986e-06, "loss": -0.0255, "num_tokens": 5926193.0, "reward": 0.7788509130477905, "reward_std": 0.018806321546435356, "rewards/rollout_reward_func/mean": 0.7788509130477905, "rewards/rollout_reward_func/std": 0.20860223472118378, "sampling/importance_sampling_ratio/max": 1.194359302520752, "sampling/importance_sampling_ratio/mean": 0.9002439975738525, "sampling/importance_sampling_ratio/min": 0.00809680949896574, "sampling/sampling_logp_difference/max": 1.736802577972412, "sampling/sampling_logp_difference/mean": 0.048184528946876526, "step": 619, "step_time": 5.357732593991386 }, { "clip_ratio/high_max": 0.0193452388048172, "clip_ratio/high_mean": 0.0096726194024086, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0096726194024086, "entropy": 0.26904471684247255, "epoch": 0.0062, "grad_norm": 0.05435088276863098, "kl": 0.5513041093945503, "learning_rate": 9.999842160502334e-06, "loss": -0.0255, "step": 620, "step_time": 2.0254883840098046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.054287965409457684, "epoch": 0.00621, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046490112436003983, "kl": 0.4154471084475517, "learning_rate": 9.99984161949712e-06, "loss": 0.0016, "num_tokens": 5945137.0, "reward": 0.8280768990516663, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8280768990516663, "rewards/rollout_reward_func/std": 0.2575747072696686, "sampling/importance_sampling_ratio/max": 0.9993473291397095, "sampling/importance_sampling_ratio/mean": 0.9948965311050415, "sampling/importance_sampling_ratio/min": 0.9922547936439514, "sampling/sampling_logp_difference/max": 0.004727287217974663, "sampling/sampling_logp_difference/mean": 0.0014122787397354841, "step": 621, "step_time": 4.242352864006534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05429221224039793, "epoch": 0.00622, "grad_norm": 0.000458077120129019, "kl": 0.4154219701886177, "learning_rate": 9.999841077566347e-06, "loss": 0.0016, "step": 622, "step_time": 2.0114293429942336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3742101127281785, "epoch": 0.00623, "frac_reward_zero_std": 0.75, "grad_norm": 0.01570269465446472, "kl": 0.5212467685341835, "learning_rate": 9.999840534710012e-06, "loss": -0.0174, "num_tokens": 5963037.0, "reward": 0.7672788500785828, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.7672788500785828, "rewards/rollout_reward_func/std": 0.35345688462257385, "sampling/importance_sampling_ratio/max": 1.0155529975891113, "sampling/importance_sampling_ratio/mean": 0.9629439115524292, "sampling/importance_sampling_ratio/min": 0.0003103054768871516, "sampling/sampling_logp_difference/max": 2.0854744911193848, "sampling/sampling_logp_difference/mean": 0.05294202268123627, "step": 623, "step_time": 4.364555432017369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37678910326212645, "epoch": 0.00624, "grad_norm": 0.014905346557497978, "kl": 0.521346140652895, "learning_rate": 9.99983999092812e-06, "loss": -0.0174, "step": 624, "step_time": 2.0330318120031734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 76.46875, "completions/mean_terminated_length": 76.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4328268300741911, "epoch": 0.00625, "frac_reward_zero_std": 0.5, "grad_norm": 0.1436389535665512, "kl": 0.75273547321558, "learning_rate": 9.999839446220667e-06, "loss": -0.0309, "num_tokens": 5978428.0, "reward": 0.8149038553237915, "reward_std": 0.06663121283054352, "rewards/rollout_reward_func/mean": 0.8149038553237915, "rewards/rollout_reward_func/std": 0.1750950813293457, "sampling/importance_sampling_ratio/max": 1.00200355052948, "sampling/importance_sampling_ratio/mean": 0.9284379482269287, "sampling/importance_sampling_ratio/min": 0.10261686891317368, "sampling/sampling_logp_difference/max": 1.01966392993927, "sampling/sampling_logp_difference/mean": 0.04699702560901642, "step": 625, "step_time": 4.784095068003808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43991614878177643, "epoch": 0.00626, "grad_norm": 0.13027192652225494, "kl": 0.7666438743472099, "learning_rate": 9.999838900587653e-06, "loss": -0.0312, "step": 626, "step_time": 2.0042108289999305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 149.09375, "completions/mean_terminated_length": 149.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3503986271098256, "epoch": 0.00627, "frac_reward_zero_std": 0.75, "grad_norm": 0.031044159084558487, "kl": 0.48867008462548256, "learning_rate": 9.999838354029082e-06, "loss": 0.0196, "num_tokens": 5996503.0, "reward": 0.8731634616851807, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.8731634616851807, "rewards/rollout_reward_func/std": 0.3984382748603821, "sampling/importance_sampling_ratio/max": 1.0046266317367554, "sampling/importance_sampling_ratio/mean": 0.9329265356063843, "sampling/importance_sampling_ratio/min": 1.744735744768633e-10, "sampling/sampling_logp_difference/max": 19.62625503540039, "sampling/sampling_logp_difference/mean": 0.1712799072265625, "step": 627, "step_time": 4.123919456003932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3563398625701666, "epoch": 0.00628, "grad_norm": 0.02886122465133667, "kl": 0.4917067512869835, "learning_rate": 9.99983780654495e-06, "loss": 0.0197, "step": 628, "step_time": 2.018704587004322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8199109309352934, "epoch": 0.00629, "frac_reward_zero_std": 0.25, "grad_norm": 0.05150682106614113, "kl": 0.853702038526535, "learning_rate": 9.999837258135259e-06, "loss": 0.0412, "num_tokens": 6014019.0, "reward": 0.7381393909454346, "reward_std": 0.013774977996945381, "rewards/rollout_reward_func/mean": 0.7381393909454346, "rewards/rollout_reward_func/std": 0.2583564519882202, "sampling/importance_sampling_ratio/max": 1.0036872625350952, "sampling/importance_sampling_ratio/mean": 0.9036210775375366, "sampling/importance_sampling_ratio/min": 6.438462492042163e-07, "sampling/sampling_logp_difference/max": 2.8836965560913086, "sampling/sampling_logp_difference/mean": 0.17912371456623077, "step": 629, "step_time": 4.265804352005944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8249099175445735, "epoch": 0.0063, "grad_norm": 0.05098960921168327, "kl": 0.8467474691569805, "learning_rate": 9.999836708800008e-06, "loss": 0.0412, "step": 630, "step_time": 2.033998224993411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.45005506090819836, "epoch": 0.00631, "frac_reward_zero_std": 0.5, "grad_norm": 0.019395310431718826, "kl": 0.6253640912473202, "learning_rate": 9.999836158539198e-06, "loss": 0.0117, "num_tokens": 6032321.0, "reward": 0.37078845500946045, "reward_std": 0.04460212215781212, "rewards/rollout_reward_func/mean": 0.37078845500946045, "rewards/rollout_reward_func/std": 0.10186334699392319, "sampling/importance_sampling_ratio/max": 1.0003451108932495, "sampling/importance_sampling_ratio/mean": 0.9306512475013733, "sampling/importance_sampling_ratio/min": 7.25821009837091e-05, "sampling/sampling_logp_difference/max": 2.6840453147888184, "sampling/sampling_logp_difference/mean": 0.09567420184612274, "step": 631, "step_time": 5.76228655999148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4528496041893959, "epoch": 0.00632, "grad_norm": 0.01922324299812317, "kl": 0.6243899464607239, "learning_rate": 9.99983560735283e-06, "loss": 0.0117, "step": 632, "step_time": 2.035032043015235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5503037525340915, "epoch": 0.00633, "frac_reward_zero_std": 0.5, "grad_norm": 0.052552178502082825, "kl": 0.5110547281801701, "learning_rate": 9.999835055240903e-06, "loss": -0.0434, "num_tokens": 6050233.0, "reward": 0.8167163729667664, "reward_std": 0.017228921875357628, "rewards/rollout_reward_func/mean": 0.8167163729667664, "rewards/rollout_reward_func/std": 0.3480328917503357, "sampling/importance_sampling_ratio/max": 1.139699101448059, "sampling/importance_sampling_ratio/mean": 0.9144136905670166, "sampling/importance_sampling_ratio/min": 0.0007604396087117493, "sampling/sampling_logp_difference/max": 2.5780556201934814, "sampling/sampling_logp_difference/mean": 0.09279778599739075, "step": 633, "step_time": 4.688368832998094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5482689253985882, "epoch": 0.00634, "grad_norm": 0.04971202462911606, "kl": 0.5101210288703442, "learning_rate": 9.999834502203417e-06, "loss": -0.0434, "step": 634, "step_time": 2.036307663001935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6542719947174191, "epoch": 0.00635, "frac_reward_zero_std": 0.25, "grad_norm": 0.11026772856712341, "kl": 0.5742432735860348, "learning_rate": 9.999833948240373e-06, "loss": -0.0595, "num_tokens": 6067123.0, "reward": 0.5449855327606201, "reward_std": 0.08841682970523834, "rewards/rollout_reward_func/mean": 0.5449855327606201, "rewards/rollout_reward_func/std": 0.20172666013240814, "sampling/importance_sampling_ratio/max": 1.1136748790740967, "sampling/importance_sampling_ratio/mean": 0.8842746019363403, "sampling/importance_sampling_ratio/min": 0.010917040519416332, "sampling/sampling_logp_difference/max": 2.252890110015869, "sampling/sampling_logp_difference/mean": 0.10276907682418823, "step": 635, "step_time": 4.397147246003442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.6563875824213028, "epoch": 0.00636, "grad_norm": 0.04649810492992401, "kl": 0.585708525031805, "learning_rate": 9.99983339335177e-06, "loss": -0.0596, "step": 636, "step_time": 2.4884569790156092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014136905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014136905316263437, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 135.59375, "completions/mean_terminated_length": 135.59375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.44074865337461233, "epoch": 0.00637, "frac_reward_zero_std": 0.5, "grad_norm": 0.2061687409877777, "kl": 0.5663844048976898, "learning_rate": 9.99983283753761e-06, "loss": -0.0559, "num_tokens": 6084654.0, "reward": 0.7432019710540771, "reward_std": 0.033358972519636154, "rewards/rollout_reward_func/mean": 0.7432019710540771, "rewards/rollout_reward_func/std": 0.12880952656269073, "sampling/importance_sampling_ratio/max": 1.1803276538848877, "sampling/importance_sampling_ratio/mean": 0.9291670322418213, "sampling/importance_sampling_ratio/min": 0.006724283564835787, "sampling/sampling_logp_difference/max": 2.0350465774536133, "sampling/sampling_logp_difference/mean": 0.06739106774330139, "step": 637, "step_time": 4.988836311982595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.025595238897949457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025595238897949457, "entropy": 0.4764476977288723, "epoch": 0.00638, "grad_norm": 0.060999490320682526, "kl": 0.5753549858927727, "learning_rate": 9.99983228079789e-06, "loss": -0.0563, "step": 638, "step_time": 2.0359813869799837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 88.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2585596293210983, "epoch": 0.00639, "frac_reward_zero_std": 0.0, "grad_norm": 0.019094089046120644, "kl": 0.811336487531662, "learning_rate": 9.999831723132612e-06, "loss": -0.0645, "num_tokens": 6100254.0, "reward": 0.7218269109725952, "reward_std": 0.13625404238700867, "rewards/rollout_reward_func/mean": 0.7218269109725952, "rewards/rollout_reward_func/std": 0.2847832441329956, "sampling/importance_sampling_ratio/max": 1.0012073516845703, "sampling/importance_sampling_ratio/mean": 0.8648294806480408, "sampling/importance_sampling_ratio/min": 4.311816326207918e-07, "sampling/sampling_logp_difference/max": 2.8248417377471924, "sampling/sampling_logp_difference/mean": 0.2753353714942932, "step": 639, "step_time": 4.424368790998415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2629176285117865, "epoch": 0.0064, "grad_norm": 0.01816694065928459, "kl": 0.8214796744287014, "learning_rate": 9.999831164541778e-06, "loss": -0.0645, "step": 640, "step_time": 2.052680546999909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 215.28125, "completions/mean_terminated_length": 215.28125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3058623317629099, "epoch": 0.00641, "frac_reward_zero_std": 0.75, "grad_norm": 0.012948614545166492, "kl": 0.4767492935061455, "learning_rate": 9.999830605025384e-06, "loss": -0.0363, "num_tokens": 6119351.0, "reward": 1.0949039459228516, "reward_std": 0.04770251736044884, "rewards/rollout_reward_func/mean": 1.0949039459228516, "rewards/rollout_reward_func/std": 0.3891761898994446, "sampling/importance_sampling_ratio/max": 1.0021718740463257, "sampling/importance_sampling_ratio/mean": 0.9632158875465393, "sampling/importance_sampling_ratio/min": 3.1096280963538447e-06, "sampling/sampling_logp_difference/max": 2.6698122024536133, "sampling/sampling_logp_difference/mean": 0.05851081758737564, "step": 641, "step_time": 4.941996443987591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30538133857771754, "epoch": 0.00642, "grad_norm": 0.013810113072395325, "kl": 0.4800092540681362, "learning_rate": 9.999830044583436e-06, "loss": -0.0363, "step": 642, "step_time": 2.976816983995377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3294738791882992, "epoch": 0.00643, "frac_reward_zero_std": 0.75, "grad_norm": 0.010593459941446781, "kl": 0.5629266984760761, "learning_rate": 9.999829483215928e-06, "loss": -0.0266, "num_tokens": 6137409.0, "reward": 0.943913459777832, "reward_std": 0.00231169443577528, "rewards/rollout_reward_func/mean": 0.943913459777832, "rewards/rollout_reward_func/std": 0.14785441756248474, "sampling/importance_sampling_ratio/max": 1.0028903484344482, "sampling/importance_sampling_ratio/mean": 0.9649864435195923, "sampling/importance_sampling_ratio/min": 2.9035567422397435e-05, "sampling/sampling_logp_difference/max": 2.328205108642578, "sampling/sampling_logp_difference/mean": 0.06524751335382462, "step": 643, "step_time": 4.687623117992189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.327488477807492, "epoch": 0.00644, "grad_norm": 0.010778658092021942, "kl": 0.5636148974299431, "learning_rate": 9.999828920922866e-06, "loss": -0.0266, "step": 644, "step_time": 2.065646012000798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 129.90625, "completions/mean_terminated_length": 129.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6611105892807245, "epoch": 0.00645, "frac_reward_zero_std": 0.5, "grad_norm": 0.07530811429023743, "kl": 0.997717946767807, "learning_rate": 9.999828357704242e-06, "loss": 0.0314, "num_tokens": 6154646.0, "reward": 0.9273509979248047, "reward_std": 0.058540284633636475, "rewards/rollout_reward_func/mean": 0.9273509979248047, "rewards/rollout_reward_func/std": 0.25747933983802795, "sampling/importance_sampling_ratio/max": 0.9966669678688049, "sampling/importance_sampling_ratio/mean": 0.9257306456565857, "sampling/importance_sampling_ratio/min": 0.00016928487457334995, "sampling/sampling_logp_difference/max": 2.7945291996002197, "sampling/sampling_logp_difference/mean": 0.14474593102931976, "step": 645, "step_time": 4.569930679994286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6575237298384309, "epoch": 0.00646, "grad_norm": 0.07070205360651016, "kl": 0.9718384295701981, "learning_rate": 9.999827793560063e-06, "loss": 0.0313, "step": 646, "step_time": 2.03791329300293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2288416251540184, "epoch": 0.00647, "frac_reward_zero_std": 0.5, "grad_norm": 2.1273019313812256, "kl": 0.5266018770635128, "learning_rate": 9.999827228490327e-06, "loss": -0.0513, "num_tokens": 6173130.0, "reward": 0.716245174407959, "reward_std": 0.05189630389213562, "rewards/rollout_reward_func/mean": 0.716245174407959, "rewards/rollout_reward_func/std": 0.2882266938686371, "sampling/importance_sampling_ratio/max": 1.5993297100067139, "sampling/importance_sampling_ratio/mean": 0.9825690984725952, "sampling/importance_sampling_ratio/min": 0.012023930437862873, "sampling/sampling_logp_difference/max": 2.2640814781188965, "sampling/sampling_logp_difference/mean": 0.036692485213279724, "step": 647, "step_time": 4.619338390009943 }, { "clip_ratio/high_max": 0.0193452388048172, "clip_ratio/high_mean": 0.0096726194024086, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020089286379516125, "entropy": 0.26448762929067016, "epoch": 0.00648, "grad_norm": 0.037672486156225204, "kl": 0.5615395084023476, "learning_rate": 9.999826662495036e-06, "loss": -0.0537, "step": 648, "step_time": 2.9232576310023433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 166.53125, "completions/mean_terminated_length": 166.53125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4511957811191678, "epoch": 0.00649, "frac_reward_zero_std": 0.5, "grad_norm": 0.00921099167317152, "kl": 0.6402861922979355, "learning_rate": 9.999826095574187e-06, "loss": -0.0458, "num_tokens": 6191235.0, "reward": 0.7044423222541809, "reward_std": 0.02474873512983322, "rewards/rollout_reward_func/mean": 0.7044423222541809, "rewards/rollout_reward_func/std": 0.14901740849018097, "sampling/importance_sampling_ratio/max": 0.9984588623046875, "sampling/importance_sampling_ratio/mean": 0.93173748254776, "sampling/importance_sampling_ratio/min": 0.0005867824074812233, "sampling/sampling_logp_difference/max": 2.4805731773376465, "sampling/sampling_logp_difference/mean": 0.08102750778198242, "step": 649, "step_time": 4.420843784995668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4513337966054678, "epoch": 0.0065, "grad_norm": 0.009591582231223583, "kl": 0.6374106705188751, "learning_rate": 9.999825527727781e-06, "loss": -0.0458, "step": 650, "step_time": 2.043887737992918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22984180878847837, "epoch": 0.00651, "frac_reward_zero_std": 0.75, "grad_norm": 0.017235681414604187, "kl": 0.5963293574750423, "learning_rate": 9.99982495895582e-06, "loss": 0.0204, "num_tokens": 6208583.0, "reward": 0.7306922674179077, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.7306922674179077, "rewards/rollout_reward_func/std": 0.16422048211097717, "sampling/importance_sampling_ratio/max": 1.0049974918365479, "sampling/importance_sampling_ratio/mean": 0.956226646900177, "sampling/importance_sampling_ratio/min": 0.013013980351388454, "sampling/sampling_logp_difference/max": 2.3840456008911133, "sampling/sampling_logp_difference/mean": 0.026782343164086342, "step": 651, "step_time": 4.374113953002961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2290146704763174, "epoch": 0.00652, "grad_norm": 0.01715739443898201, "kl": 0.5962748751044273, "learning_rate": 9.999824389258302e-06, "loss": 0.0204, "step": 652, "step_time": 2.0352598159806803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.21707469830289483, "epoch": 0.00653, "frac_reward_zero_std": 0.75, "grad_norm": 0.015279958955943584, "kl": 0.5352743752300739, "learning_rate": 9.999823818635227e-06, "loss": -0.0174, "num_tokens": 6227253.0, "reward": 0.8217452168464661, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.8217452168464661, "rewards/rollout_reward_func/std": 0.32683998346328735, "sampling/importance_sampling_ratio/max": 1.0124964714050293, "sampling/importance_sampling_ratio/mean": 0.9665747880935669, "sampling/importance_sampling_ratio/min": 0.0077198464423418045, "sampling/sampling_logp_difference/max": 2.3522536754608154, "sampling/sampling_logp_difference/mean": 0.031470272690057755, "step": 653, "step_time": 4.715973188023781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21780101815238595, "epoch": 0.00654, "grad_norm": 0.015595619566738605, "kl": 0.5346203744411469, "learning_rate": 9.9998232470866e-06, "loss": -0.0174, "step": 654, "step_time": 2.525760120006453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.057592667173594236, "epoch": 0.00655, "frac_reward_zero_std": 1.0, "grad_norm": 0.000609563197940588, "kl": 0.4141409732401371, "learning_rate": 9.999822674612414e-06, "loss": 0.0017, "num_tokens": 6246885.0, "reward": 0.6102308034896851, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6102308034896851, "rewards/rollout_reward_func/std": 0.25157636404037476, "sampling/importance_sampling_ratio/max": 0.997225284576416, "sampling/importance_sampling_ratio/mean": 0.9932668209075928, "sampling/importance_sampling_ratio/min": 0.9872238636016846, "sampling/sampling_logp_difference/max": 0.010714944452047348, "sampling/sampling_logp_difference/mean": 0.0015902642626315355, "step": 655, "step_time": 4.635984404972987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05845359340310097, "epoch": 0.00656, "grad_norm": 0.0006279945373535156, "kl": 0.4139965809881687, "learning_rate": 9.999822101212674e-06, "loss": 0.0017, "step": 656, "step_time": 2.0383419699937804 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.009114583488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016927083488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3914207611232996, "epoch": 0.00657, "frac_reward_zero_std": 0.25, "grad_norm": 5.080357074737549, "kl": 0.549419716000557, "learning_rate": 9.999821526887376e-06, "loss": -0.1611, "num_tokens": 6265223.0, "reward": 0.6284807920455933, "reward_std": 0.08127638697624207, "rewards/rollout_reward_func/mean": 0.6284807920455933, "rewards/rollout_reward_func/std": 0.2880755364894867, "sampling/importance_sampling_ratio/max": 2.359113931655884, "sampling/importance_sampling_ratio/mean": 0.9483354091644287, "sampling/importance_sampling_ratio/min": 0.01204471942037344, "sampling/sampling_logp_difference/max": 2.2057089805603027, "sampling/sampling_logp_difference/mean": 0.11496429890394211, "step": 657, "step_time": 4.358834234000824 }, { "clip_ratio/high_max": 0.033854166977107525, "clip_ratio/high_mean": 0.016927083488553762, "clip_ratio/low_mean": 0.14140625344589353, "clip_ratio/low_min": 0.06406250037252903, "clip_ratio/region_mean": 0.15833333833143115, "entropy": 0.6589196920394897, "epoch": 0.00658, "grad_norm": 0.20104949176311493, "kl": 0.5354598574340343, "learning_rate": 9.999820951636526e-06, "loss": -0.1728, "step": 658, "step_time": 2.0339550629942096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.05634666979312897, "epoch": 0.00659, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006624193047173321, "kl": 0.40621306002140045, "learning_rate": 9.99982037546012e-06, "loss": 0.0018, "num_tokens": 6284655.0, "reward": 0.8913077116012573, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8913077116012573, "rewards/rollout_reward_func/std": 0.3657207190990448, "sampling/importance_sampling_ratio/max": 1.0033459663391113, "sampling/importance_sampling_ratio/mean": 0.9950686693191528, "sampling/importance_sampling_ratio/min": 0.9881273508071899, "sampling/sampling_logp_difference/max": 0.007736552506685257, "sampling/sampling_logp_difference/mean": 0.0015435019740834832, "step": 659, "step_time": 5.159671398992941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0555781670846045, "epoch": 0.0066, "grad_norm": 0.0006619445630349219, "kl": 0.4063342921435833, "learning_rate": 9.999819798358157e-06, "loss": 0.0018, "step": 660, "step_time": 2.497632638005598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 127.34375, "completions/mean_terminated_length": 127.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6881395075470209, "epoch": 0.00661, "frac_reward_zero_std": 0.5, "grad_norm": 0.02115464024245739, "kl": 0.592023529112339, "learning_rate": 9.999819220330643e-06, "loss": -0.0451, "num_tokens": 6300810.0, "reward": 0.5748077034950256, "reward_std": 0.05534440279006958, "rewards/rollout_reward_func/mean": 0.5748077034950256, "rewards/rollout_reward_func/std": 0.18590563535690308, "sampling/importance_sampling_ratio/max": 0.999329149723053, "sampling/importance_sampling_ratio/mean": 0.8881173133850098, "sampling/importance_sampling_ratio/min": 1.0005745934904553e-05, "sampling/sampling_logp_difference/max": 2.7995457649230957, "sampling/sampling_logp_difference/mean": 0.14223146438598633, "step": 661, "step_time": 4.488318025003537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.6985418447293341, "epoch": 0.00662, "grad_norm": 0.020771484822034836, "kl": 0.5974417105317116, "learning_rate": 9.99981864137757e-06, "loss": -0.0451, "step": 662, "step_time": 2.037578770010441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 221.3125, "completions/mean_terminated_length": 221.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17251488333567977, "epoch": 0.00663, "frac_reward_zero_std": 0.75, "grad_norm": 0.022889291867613792, "kl": 0.5332348048686981, "learning_rate": 9.999818061498945e-06, "loss": -0.0263, "num_tokens": 6320428.0, "reward": 1.087841272354126, "reward_std": 0.005670460872352123, "rewards/rollout_reward_func/mean": 1.087841272354126, "rewards/rollout_reward_func/std": 0.1805165410041809, "sampling/importance_sampling_ratio/max": 1.0014444589614868, "sampling/importance_sampling_ratio/mean": 0.9649415612220764, "sampling/importance_sampling_ratio/min": 0.006781473755836487, "sampling/sampling_logp_difference/max": 1.8279399871826172, "sampling/sampling_logp_difference/mean": 0.026709342375397682, "step": 663, "step_time": 4.634559279998939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1722865621559322, "epoch": 0.00664, "grad_norm": 0.022726815193891525, "kl": 0.5329555012285709, "learning_rate": 9.999817480694764e-06, "loss": -0.0263, "step": 664, "step_time": 2.47507957801281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4078441420570016, "epoch": 0.00665, "frac_reward_zero_std": 0.5, "grad_norm": 0.43523550033569336, "kl": 0.45236481726169586, "learning_rate": 9.99981689896503e-06, "loss": -0.019, "num_tokens": 6337020.0, "reward": 0.5068269371986389, "reward_std": 0.0999118983745575, "rewards/rollout_reward_func/mean": 0.5068269371986389, "rewards/rollout_reward_func/std": 0.19555938243865967, "sampling/importance_sampling_ratio/max": 1.0032355785369873, "sampling/importance_sampling_ratio/mean": 0.9596936702728271, "sampling/importance_sampling_ratio/min": 1.0825406207004562e-05, "sampling/sampling_logp_difference/max": 2.202688455581665, "sampling/sampling_logp_difference/mean": 0.08948305249214172, "step": 665, "step_time": 4.532322034006938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.026562500279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026562500279396772, "entropy": 0.4221604191698134, "epoch": 0.00666, "grad_norm": 0.31089988350868225, "kl": 0.4386899098753929, "learning_rate": 9.99981631630974e-06, "loss": -0.0207, "step": 666, "step_time": 2.0127366299930145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 128.09375, "completions/mean_terminated_length": 128.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6597962388768792, "epoch": 0.00667, "frac_reward_zero_std": 0.5, "grad_norm": 0.02535727433860302, "kl": 0.6283199861645699, "learning_rate": 9.999815732728897e-06, "loss": -0.0362, "num_tokens": 6354191.0, "reward": 0.6820192337036133, "reward_std": 0.036715153604745865, "rewards/rollout_reward_func/mean": 0.6820192337036133, "rewards/rollout_reward_func/std": 0.32189562916755676, "sampling/importance_sampling_ratio/max": 0.9985686540603638, "sampling/importance_sampling_ratio/mean": 0.9312734007835388, "sampling/importance_sampling_ratio/min": 1.2808665417196607e-07, "sampling/sampling_logp_difference/max": 2.3997533321380615, "sampling/sampling_logp_difference/mean": 0.15069925785064697, "step": 667, "step_time": 4.251913762003824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013392857275903225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013392857275903225, "entropy": 0.6606062287464738, "epoch": 0.00668, "grad_norm": 0.02496900036931038, "kl": 0.628997839987278, "learning_rate": 9.9998151482225e-06, "loss": -0.0362, "step": 668, "step_time": 2.046859489986673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 143.6875, "completions/mean_terminated_length": 143.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5690674697980285, "epoch": 0.00669, "frac_reward_zero_std": 0.75, "grad_norm": 0.05956979840993881, "kl": 0.7282170057296753, "learning_rate": 9.99981456279055e-06, "loss": -0.0218, "num_tokens": 6371101.0, "reward": 0.5953077077865601, "reward_std": 0.04222019389271736, "rewards/rollout_reward_func/mean": 0.5953077077865601, "rewards/rollout_reward_func/std": 0.16217999160289764, "sampling/importance_sampling_ratio/max": 1.000533103942871, "sampling/importance_sampling_ratio/mean": 0.9064910411834717, "sampling/importance_sampling_ratio/min": 3.697378997458145e-05, "sampling/sampling_logp_difference/max": 3.853449583053589, "sampling/sampling_logp_difference/mean": 0.08578860014677048, "step": 669, "step_time": 4.405522630018822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5694878492504358, "epoch": 0.0067, "grad_norm": 0.05108555406332016, "kl": 0.7118921652436256, "learning_rate": 9.999813976433047e-06, "loss": -0.0219, "step": 670, "step_time": 2.4651726860101917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 191.1875, "completions/mean_terminated_length": 191.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4819955066777766, "epoch": 0.00671, "frac_reward_zero_std": 0.75, "grad_norm": 0.009245780296623707, "kl": 0.5063516274094582, "learning_rate": 9.99981338914999e-06, "loss": -0.0229, "num_tokens": 6389619.0, "reward": 0.6727308034896851, "reward_std": 0.008902115747332573, "rewards/rollout_reward_func/mean": 0.6727308034896851, "rewards/rollout_reward_func/std": 0.39375221729278564, "sampling/importance_sampling_ratio/max": 0.9999231696128845, "sampling/importance_sampling_ratio/mean": 0.9328432083129883, "sampling/importance_sampling_ratio/min": 0.0001694987586233765, "sampling/sampling_logp_difference/max": 3.455350160598755, "sampling/sampling_logp_difference/mean": 0.07655403017997742, "step": 671, "step_time": 5.05953500099713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4816198884509504, "epoch": 0.00672, "grad_norm": 0.009890681132674217, "kl": 0.5088240429759026, "learning_rate": 9.99981280094138e-06, "loss": -0.0229, "step": 672, "step_time": 2.0534886120076408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.05463898228481412, "epoch": 0.00673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009482172317802906, "kl": 0.39379846304655075, "learning_rate": 9.999812211807216e-06, "loss": 0.0016, "num_tokens": 6409267.0, "reward": 0.9449231624603271, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9449231624603271, "rewards/rollout_reward_func/std": 0.35669711232185364, "sampling/importance_sampling_ratio/max": 1.004712700843811, "sampling/importance_sampling_ratio/mean": 0.9936162233352661, "sampling/importance_sampling_ratio/min": 0.9838131666183472, "sampling/sampling_logp_difference/max": 0.01346462219953537, "sampling/sampling_logp_difference/mean": 0.0014689784729853272, "step": 673, "step_time": 4.60217428598844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05632179183885455, "epoch": 0.00674, "grad_norm": 0.0016132342861965299, "kl": 0.3934677094221115, "learning_rate": 9.9998116217475e-06, "loss": 0.0016, "step": 674, "step_time": 2.0365683999916655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22215479146689177, "epoch": 0.00675, "frac_reward_zero_std": 0.75, "grad_norm": 0.007761626038700342, "kl": 0.6688997000455856, "learning_rate": 9.99981103076223e-06, "loss": -0.0259, "num_tokens": 6427415.0, "reward": 0.7843269109725952, "reward_std": 0.04487408697605133, "rewards/rollout_reward_func/mean": 0.7843269109725952, "rewards/rollout_reward_func/std": 0.22060680389404297, "sampling/importance_sampling_ratio/max": 1.002586007118225, "sampling/importance_sampling_ratio/mean": 0.9569649696350098, "sampling/importance_sampling_ratio/min": 0.012071940116584301, "sampling/sampling_logp_difference/max": 2.2848033905029297, "sampling/sampling_logp_difference/mean": 0.03011872060596943, "step": 675, "step_time": 4.276854020012252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2225980395451188, "epoch": 0.00676, "grad_norm": 0.007797398138791323, "kl": 0.6688604056835175, "learning_rate": 9.999810438851407e-06, "loss": -0.0259, "step": 676, "step_time": 2.454209132003598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 90.78125, "completions/mean_terminated_length": 90.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6131224781274796, "epoch": 0.00677, "frac_reward_zero_std": 0.5, "grad_norm": 1.136515498161316, "kl": 0.8372727707028389, "learning_rate": 9.999809846015032e-06, "loss": -0.0149, "num_tokens": 6443024.0, "reward": 0.6563509702682495, "reward_std": 0.20680588483810425, "rewards/rollout_reward_func/mean": 0.6563509702682495, "rewards/rollout_reward_func/std": 0.4032760262489319, "sampling/importance_sampling_ratio/max": 1.1505969762802124, "sampling/importance_sampling_ratio/mean": 0.8715060949325562, "sampling/importance_sampling_ratio/min": 0.00020576590031851083, "sampling/sampling_logp_difference/max": 2.591106653213501, "sampling/sampling_logp_difference/mean": 0.08914445340633392, "step": 677, "step_time": 4.809841629998118 }, { "clip_ratio/high_max": 0.039062500931322575, "clip_ratio/high_mean": 0.019531250465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019531250465661287, "entropy": 0.5994730778038502, "epoch": 0.00678, "grad_norm": 0.13142916560173035, "kl": 0.8463224694132805, "learning_rate": 9.999809252253105e-06, "loss": -0.0159, "step": 678, "step_time": 2.0468869710093713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4054745025932789, "epoch": 0.00679, "frac_reward_zero_std": 0.5, "grad_norm": 0.04771072417497635, "kl": 0.4170893467962742, "learning_rate": 9.999808657565626e-06, "loss": -0.0501, "num_tokens": 6460886.0, "reward": 0.8193076848983765, "reward_std": 0.25390368700027466, "rewards/rollout_reward_func/mean": 0.8193076848983765, "rewards/rollout_reward_func/std": 0.4988728165626526, "sampling/importance_sampling_ratio/max": 1.0038657188415527, "sampling/importance_sampling_ratio/mean": 0.9581089019775391, "sampling/importance_sampling_ratio/min": 6.439492172052039e-15, "sampling/sampling_logp_difference/max": 3.1136178970336914, "sampling/sampling_logp_difference/mean": 0.14750789105892181, "step": 679, "step_time": 4.566388203005772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4066464230418205, "epoch": 0.0068, "grad_norm": 0.056621354073286057, "kl": 0.417278278619051, "learning_rate": 9.999808061952593e-06, "loss": -0.0501, "step": 680, "step_time": 2.034980235999683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 103.1875, "completions/mean_terminated_length": 103.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.586292115971446, "epoch": 0.00681, "frac_reward_zero_std": 0.5, "grad_norm": 0.20079739391803741, "kl": 0.7828980833292007, "learning_rate": 9.999807465414011e-06, "loss": -0.0189, "num_tokens": 6477084.0, "reward": 0.3693028688430786, "reward_std": 0.39332816004753113, "rewards/rollout_reward_func/mean": 0.3693028688430786, "rewards/rollout_reward_func/std": 0.8871973156929016, "sampling/importance_sampling_ratio/max": 1.033754825592041, "sampling/importance_sampling_ratio/mean": 0.9227455854415894, "sampling/importance_sampling_ratio/min": 0.0004227114550303668, "sampling/sampling_logp_difference/max": 2.105665445327759, "sampling/sampling_logp_difference/mean": 0.08843684941530228, "step": 681, "step_time": 4.277028540003812 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5593828279525042, "epoch": 0.00682, "grad_norm": 0.18060478568077087, "kl": 0.7791288122534752, "learning_rate": 9.999806867949875e-06, "loss": -0.0192, "step": 682, "step_time": 2.8718289750031545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3729820316657424, "epoch": 0.00683, "frac_reward_zero_std": 1.0, "grad_norm": 0.02186650037765503, "kl": 0.5721336528658867, "learning_rate": 9.999806269560189e-06, "loss": 0.002, "num_tokens": 6496088.0, "reward": 0.758884608745575, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.758884608745575, "rewards/rollout_reward_func/std": 0.42316943407058716, "sampling/importance_sampling_ratio/max": 0.9972240924835205, "sampling/importance_sampling_ratio/mean": 0.9333208203315735, "sampling/importance_sampling_ratio/min": 0.01722966879606247, "sampling/sampling_logp_difference/max": 1.959996223449707, "sampling/sampling_logp_difference/mean": 0.041139110922813416, "step": 683, "step_time": 4.408643116003077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3701413697563112, "epoch": 0.00684, "grad_norm": 0.021077444776892662, "kl": 0.5635989047586918, "learning_rate": 9.99980567024495e-06, "loss": 0.002, "step": 684, "step_time": 2.0475616440016893 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 198.34375, "completions/mean_terminated_length": 198.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5139507008716464, "epoch": 0.00685, "frac_reward_zero_std": 0.25, "grad_norm": 0.8234489560127258, "kl": 0.5293397307395935, "learning_rate": 9.99980507000416e-06, "loss": -0.0674, "num_tokens": 6515571.0, "reward": 0.3615625202655792, "reward_std": 0.4661003649234772, "rewards/rollout_reward_func/mean": 0.3615625202655792, "rewards/rollout_reward_func/std": 0.722388744354248, "sampling/importance_sampling_ratio/max": 1.031185507774353, "sampling/importance_sampling_ratio/mean": 0.8788821697235107, "sampling/importance_sampling_ratio/min": 0.004759612493216991, "sampling/sampling_logp_difference/max": 2.2457962036132812, "sampling/sampling_logp_difference/mean": 0.06394410133361816, "step": 685, "step_time": 4.713207096989208 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.48783792834728956, "epoch": 0.00686, "grad_norm": 0.06257620453834534, "kl": 0.5548375211656094, "learning_rate": 9.999804468837818e-06, "loss": -0.0683, "step": 686, "step_time": 2.0427356389845954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4175726789981127, "epoch": 0.00687, "frac_reward_zero_std": 0.5, "grad_norm": 0.4080876410007477, "kl": 0.5136157162487507, "learning_rate": 9.999803866745927e-06, "loss": -0.0466, "num_tokens": 6532887.0, "reward": 0.6711009740829468, "reward_std": 0.05885990336537361, "rewards/rollout_reward_func/mean": 0.6711009740829468, "rewards/rollout_reward_func/std": 0.15845470130443573, "sampling/importance_sampling_ratio/max": 1.0489181280136108, "sampling/importance_sampling_ratio/mean": 0.9352025389671326, "sampling/importance_sampling_ratio/min": 0.0006329687894321978, "sampling/sampling_logp_difference/max": 2.5763654708862305, "sampling/sampling_logp_difference/mean": 0.058725755661726, "step": 687, "step_time": 4.982435585006897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.45442432072013617, "epoch": 0.00688, "grad_norm": 0.9059562683105469, "kl": 0.49956557154655457, "learning_rate": 9.999803263728482e-06, "loss": -0.0483, "step": 688, "step_time": 2.510474503993464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 155.8125, "completions/mean_terminated_length": 155.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15848072012886405, "epoch": 0.00689, "frac_reward_zero_std": 0.5, "grad_norm": 1.6874135732650757, "kl": 0.5693972036242485, "learning_rate": 9.999802659785488e-06, "loss": 0.0009, "num_tokens": 6551089.0, "reward": 0.5923268795013428, "reward_std": 0.20111025869846344, "rewards/rollout_reward_func/mean": 0.5923268795013428, "rewards/rollout_reward_func/std": 0.7715802192687988, "sampling/importance_sampling_ratio/max": 1.1948013305664062, "sampling/importance_sampling_ratio/mean": 0.9910339117050171, "sampling/importance_sampling_ratio/min": 0.26319068670272827, "sampling/sampling_logp_difference/max": 0.6113002300262451, "sampling/sampling_logp_difference/mean": 0.019167212769389153, "step": 689, "step_time": 4.5792851129954215 }, { "clip_ratio/high_max": 0.03750000149011612, "clip_ratio/high_mean": 0.01875000074505806, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029166667722165585, "entropy": 0.1351192742586136, "epoch": 0.0069, "grad_norm": 0.36288827657699585, "kl": 0.5673299729824066, "learning_rate": 9.999802054916945e-06, "loss": -0.0006, "step": 690, "step_time": 2.034351935995801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19436066038906574, "epoch": 0.00691, "frac_reward_zero_std": 0.75, "grad_norm": 0.11036239564418793, "kl": 0.6534232348203659, "learning_rate": 9.99980144912285e-06, "loss": -0.0234, "num_tokens": 6567769.0, "reward": 0.7459135055541992, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.7459135055541992, "rewards/rollout_reward_func/std": 0.23829486966133118, "sampling/importance_sampling_ratio/max": 1.0183384418487549, "sampling/importance_sampling_ratio/mean": 0.9611093997955322, "sampling/importance_sampling_ratio/min": 0.18522784113883972, "sampling/sampling_logp_difference/max": 1.5613017082214355, "sampling/sampling_logp_difference/mean": 0.01400312501937151, "step": 691, "step_time": 4.213023102012812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20451552234590054, "epoch": 0.00692, "grad_norm": 0.0811862125992775, "kl": 0.6645270511507988, "learning_rate": 9.999800842403203e-06, "loss": -0.0236, "step": 692, "step_time": 1.9998296789999586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 141.28125, "completions/mean_terminated_length": 141.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5321009056642652, "epoch": 0.00693, "frac_reward_zero_std": 0.25, "grad_norm": 0.09287930279970169, "kl": 0.5855035781860352, "learning_rate": 9.999800234758007e-06, "loss": -0.0719, "num_tokens": 6585146.0, "reward": 0.6412067413330078, "reward_std": 0.025197478011250496, "rewards/rollout_reward_func/mean": 0.6412067413330078, "rewards/rollout_reward_func/std": 0.12902404367923737, "sampling/importance_sampling_ratio/max": 1.015775203704834, "sampling/importance_sampling_ratio/mean": 0.9098789691925049, "sampling/importance_sampling_ratio/min": 0.03060535341501236, "sampling/sampling_logp_difference/max": 1.3473289012908936, "sampling/sampling_logp_difference/mean": 0.06125316023826599, "step": 693, "step_time": 5.291186559006746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5380773367360234, "epoch": 0.00694, "grad_norm": 0.08168762177228928, "kl": 0.6049415618181229, "learning_rate": 9.999799626187263e-06, "loss": -0.072, "step": 694, "step_time": 2.0402325699906214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7173235947266221, "epoch": 0.00695, "frac_reward_zero_std": 0.5, "grad_norm": 0.04172411933541298, "kl": 0.5128116421401501, "learning_rate": 9.999799016690968e-06, "loss": 0.0205, "num_tokens": 6602184.0, "reward": 0.8952212333679199, "reward_std": 0.02910780720412731, "rewards/rollout_reward_func/mean": 0.8952212333679199, "rewards/rollout_reward_func/std": 0.2710008919239044, "sampling/importance_sampling_ratio/max": 1.024623155593872, "sampling/importance_sampling_ratio/mean": 0.9080319404602051, "sampling/importance_sampling_ratio/min": 0.0008177620475180447, "sampling/sampling_logp_difference/max": 2.04789137840271, "sampling/sampling_logp_difference/mean": 0.1091439425945282, "step": 695, "step_time": 4.251045823992172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.7345240581780672, "epoch": 0.00696, "grad_norm": 0.037162672728300095, "kl": 0.5182303488254547, "learning_rate": 9.999798406269121e-06, "loss": 0.0205, "step": 696, "step_time": 2.0510602479989757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 82.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7502229865640402, "epoch": 0.00697, "frac_reward_zero_std": 0.5, "grad_norm": 0.2901154160499573, "kl": 0.7846877127885818, "learning_rate": 9.999797794921726e-06, "loss": -0.0111, "num_tokens": 6617436.0, "reward": 0.6088942289352417, "reward_std": 0.27824828028678894, "rewards/rollout_reward_func/mean": 0.6088942289352417, "rewards/rollout_reward_func/std": 0.7421329021453857, "sampling/importance_sampling_ratio/max": 1.1182817220687866, "sampling/importance_sampling_ratio/mean": 0.9122017025947571, "sampling/importance_sampling_ratio/min": 0.006344410125166178, "sampling/sampling_logp_difference/max": 2.2959611415863037, "sampling/sampling_logp_difference/mean": 0.14162936806678772, "step": 697, "step_time": 4.069137892001891 }, { "clip_ratio/high_max": 0.078125, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.7198022948578, "epoch": 0.00698, "grad_norm": 0.21179361641407013, "kl": 0.7885401174426079, "learning_rate": 9.999797182648783e-06, "loss": -0.0119, "step": 698, "step_time": 2.0086392300072475 }, { "clip_ratio/high_max": 0.06107954680919647, "clip_ratio/high_mean": 0.030539773404598236, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030539773404598236, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 136.21875, "completions/mean_terminated_length": 136.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4658389948308468, "epoch": 0.00699, "frac_reward_zero_std": 0.5, "grad_norm": 0.4522523581981659, "kl": 0.494923435151577, "learning_rate": 9.999796569450289e-06, "loss": -0.0089, "num_tokens": 6634387.0, "reward": 0.6516009569168091, "reward_std": 0.2106914073228836, "rewards/rollout_reward_func/mean": 0.6516009569168091, "rewards/rollout_reward_func/std": 0.3770601451396942, "sampling/importance_sampling_ratio/max": 1.4075126647949219, "sampling/importance_sampling_ratio/mean": 0.9215960502624512, "sampling/importance_sampling_ratio/min": 0.0006234848406165838, "sampling/sampling_logp_difference/max": 2.7890121936798096, "sampling/sampling_logp_difference/mean": 0.0948573648929596, "step": 699, "step_time": 5.497802652003884 }, { "clip_ratio/high_max": 0.09659091010689735, "clip_ratio/high_mean": 0.04829545505344868, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0587121220305562, "entropy": 0.3266245052218437, "epoch": 0.007, "grad_norm": 0.24690206348896027, "kl": 0.49814777448773384, "learning_rate": 9.999795955326245e-06, "loss": -0.0109, "step": 700, "step_time": 2.0235508529949584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2992029348388314, "epoch": 0.00701, "frac_reward_zero_std": 0.5, "grad_norm": 3.2506821155548096, "kl": 0.44325781613588333, "learning_rate": 9.999795340276655e-06, "loss": -0.037, "num_tokens": 6652515.0, "reward": 0.9792788624763489, "reward_std": 0.05521008372306824, "rewards/rollout_reward_func/mean": 0.9792788624763489, "rewards/rollout_reward_func/std": 0.37265151739120483, "sampling/importance_sampling_ratio/max": 1.3135536909103394, "sampling/importance_sampling_ratio/mean": 0.9739469289779663, "sampling/importance_sampling_ratio/min": 0.020755643025040627, "sampling/sampling_logp_difference/max": 1.8154810667037964, "sampling/sampling_logp_difference/mean": 0.038570910692214966, "step": 701, "step_time": 4.4603756590004195 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033854166977107525, "entropy": 0.3723378456197679, "epoch": 0.00702, "grad_norm": 0.046242229640483856, "kl": 0.6944298893213272, "learning_rate": 9.999794724301514e-06, "loss": -0.0391, "step": 702, "step_time": 2.0376838780066464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 246.65625, "completions/mean_terminated_length": 246.65625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.38637975323945284, "epoch": 0.00703, "frac_reward_zero_std": 0.5, "grad_norm": 0.015482010319828987, "kl": 0.461770873516798, "learning_rate": 9.999794107400824e-06, "loss": -0.0648, "num_tokens": 6673264.0, "reward": 0.9137365221977234, "reward_std": 0.030334889888763428, "rewards/rollout_reward_func/mean": 0.9137365221977234, "rewards/rollout_reward_func/std": 0.3817671835422516, "sampling/importance_sampling_ratio/max": 1.0097825527191162, "sampling/importance_sampling_ratio/mean": 0.9351024627685547, "sampling/importance_sampling_ratio/min": 1.637430122047867e-11, "sampling/sampling_logp_difference/max": 10.491726875305176, "sampling/sampling_logp_difference/mean": 0.1269911676645279, "step": 703, "step_time": 4.904001340000832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3826519697904587, "epoch": 0.00704, "grad_norm": 0.015083663165569305, "kl": 0.46285149827599525, "learning_rate": 9.999793489574587e-06, "loss": -0.0648, "step": 704, "step_time": 2.5201443820114946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 154.53125, "completions/mean_terminated_length": 154.53125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.859066553413868, "epoch": 0.00705, "frac_reward_zero_std": 0.25, "grad_norm": 0.08036965131759644, "kl": 0.4798268973827362, "learning_rate": 9.999792870822801e-06, "loss": -0.0562, "num_tokens": 6691129.0, "reward": 0.7493942379951477, "reward_std": 0.05868803337216377, "rewards/rollout_reward_func/mean": 0.7493942379951477, "rewards/rollout_reward_func/std": 0.15802063047885895, "sampling/importance_sampling_ratio/max": 1.0339778661727905, "sampling/importance_sampling_ratio/mean": 0.8867658972740173, "sampling/importance_sampling_ratio/min": 0.00014929226017557085, "sampling/sampling_logp_difference/max": 2.3586673736572266, "sampling/sampling_logp_difference/mean": 0.13669681549072266, "step": 705, "step_time": 5.107399712003826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.8616774277761579, "epoch": 0.00706, "grad_norm": 0.08198332041501999, "kl": 0.4870312660932541, "learning_rate": 9.999792251145466e-06, "loss": -0.0562, "step": 706, "step_time": 2.0409603040025104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 151.1875, "completions/mean_terminated_length": 151.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7737821256741881, "epoch": 0.00707, "frac_reward_zero_std": 0.5, "grad_norm": 3.585118532180786, "kl": 0.6513385400176048, "learning_rate": 9.999791630542584e-06, "loss": -0.0424, "num_tokens": 6708423.0, "reward": 0.4795384705066681, "reward_std": 0.10124801844358444, "rewards/rollout_reward_func/mean": 0.4795384705066681, "rewards/rollout_reward_func/std": 0.18130189180374146, "sampling/importance_sampling_ratio/max": 2.6515133380889893, "sampling/importance_sampling_ratio/mean": 0.9288061857223511, "sampling/importance_sampling_ratio/min": 0.0038852475117892027, "sampling/sampling_logp_difference/max": 2.267672061920166, "sampling/sampling_logp_difference/mean": 0.11990012228488922, "step": 707, "step_time": 4.4914176689853775 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.07500000018626451, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09062500018626451, "entropy": 0.8762858472764492, "epoch": 0.00708, "grad_norm": 0.08432383835315704, "kl": 0.6511237360537052, "learning_rate": 9.999791009014154e-06, "loss": -0.0489, "step": 708, "step_time": 2.032402662996901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 167.96875, "completions/mean_terminated_length": 167.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23476041061803699, "epoch": 0.00709, "frac_reward_zero_std": 0.75, "grad_norm": 0.01912401244044304, "kl": 0.4922516904771328, "learning_rate": 9.999790386560175e-06, "loss": 0.0305, "num_tokens": 6726246.0, "reward": 0.7262211441993713, "reward_std": 0.01128651387989521, "rewards/rollout_reward_func/mean": 0.7262211441993713, "rewards/rollout_reward_func/std": 0.4628288447856903, "sampling/importance_sampling_ratio/max": 1.0004867315292358, "sampling/importance_sampling_ratio/mean": 0.9645602107048035, "sampling/importance_sampling_ratio/min": 0.000503441144246608, "sampling/sampling_logp_difference/max": 2.679049253463745, "sampling/sampling_logp_difference/mean": 0.04833053797483444, "step": 709, "step_time": 4.480905115000496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23448500968515873, "epoch": 0.0071, "grad_norm": 0.017329111695289612, "kl": 0.48346855118870735, "learning_rate": 9.99978976318065e-06, "loss": 0.0305, "step": 710, "step_time": 2.9176889849986765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 122.15625, "completions/mean_terminated_length": 122.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7286367854103446, "epoch": 0.00711, "frac_reward_zero_std": 0.5, "grad_norm": 0.050832346081733704, "kl": 0.8160603195428848, "learning_rate": 9.999789138875577e-06, "loss": -0.0425, "num_tokens": 6742691.0, "reward": 0.6658798456192017, "reward_std": 0.03558650612831116, "rewards/rollout_reward_func/mean": 0.6658798456192017, "rewards/rollout_reward_func/std": 0.16295848786830902, "sampling/importance_sampling_ratio/max": 1.0275776386260986, "sampling/importance_sampling_ratio/mean": 0.9086731672286987, "sampling/importance_sampling_ratio/min": 1.1129178308089127e-11, "sampling/sampling_logp_difference/max": 3.9548420906066895, "sampling/sampling_logp_difference/mean": 0.20204317569732666, "step": 711, "step_time": 4.440579455993429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7283653132617474, "epoch": 0.00712, "grad_norm": 0.04360794275999069, "kl": 0.7946902737021446, "learning_rate": 9.999788513644958e-06, "loss": -0.0426, "step": 712, "step_time": 2.0378727069983142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07768803741782904, "epoch": 0.00713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007479861960746348, "kl": 0.454049538820982, "learning_rate": 9.999787887488789e-06, "loss": 0.0013, "num_tokens": 6758995.0, "reward": 0.46703845262527466, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.46703845262527466, "rewards/rollout_reward_func/std": 0.07133088260889053, "sampling/importance_sampling_ratio/max": 1.0170280933380127, "sampling/importance_sampling_ratio/mean": 1.000454306602478, "sampling/importance_sampling_ratio/min": 0.9858816266059875, "sampling/sampling_logp_difference/max": 0.021034538745880127, "sampling/sampling_logp_difference/mean": 0.002684428123757243, "step": 713, "step_time": 4.290468675004377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07953096926212311, "epoch": 0.00714, "grad_norm": 0.0008295146399177611, "kl": 0.4536726363003254, "learning_rate": 9.999787260407074e-06, "loss": 0.0013, "step": 714, "step_time": 2.016087228003016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8973808404989541, "epoch": 0.00715, "frac_reward_zero_std": 0.25, "grad_norm": 1.185925006866455, "kl": 0.6552757024765015, "learning_rate": 9.999786632399813e-06, "loss": -0.054, "num_tokens": 6775667.0, "reward": 0.6884182691574097, "reward_std": 0.13756223022937775, "rewards/rollout_reward_func/mean": 0.6884182691574097, "rewards/rollout_reward_func/std": 0.3238620460033417, "sampling/importance_sampling_ratio/max": 1.2062115669250488, "sampling/importance_sampling_ratio/mean": 0.8771838545799255, "sampling/importance_sampling_ratio/min": 6.415679672500119e-05, "sampling/sampling_logp_difference/max": 2.904303550720215, "sampling/sampling_logp_difference/mean": 0.1595776379108429, "step": 715, "step_time": 4.464119349999237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 1.0171726252883673, "epoch": 0.00716, "grad_norm": 0.019324639812111855, "kl": 0.6311075948178768, "learning_rate": 9.999786003467005e-06, "loss": -0.0568, "step": 716, "step_time": 2.482430338000995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 165.15625, "completions/mean_terminated_length": 165.15625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3087025210261345, "epoch": 0.00717, "frac_reward_zero_std": 0.75, "grad_norm": 0.0047145383432507515, "kl": 0.49913374707102776, "learning_rate": 9.99978537360865e-06, "loss": -0.0269, "num_tokens": 6793592.0, "reward": 0.8470193147659302, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.8470193147659302, "rewards/rollout_reward_func/std": 0.2863578796386719, "sampling/importance_sampling_ratio/max": 1.002853274345398, "sampling/importance_sampling_ratio/mean": 0.9639461040496826, "sampling/importance_sampling_ratio/min": 0.003859676653519273, "sampling/sampling_logp_difference/max": 2.1417787075042725, "sampling/sampling_logp_difference/mean": 0.03663189336657524, "step": 717, "step_time": 4.289036197005771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29473454877734184, "epoch": 0.00718, "grad_norm": 0.004949420690536499, "kl": 0.4977327883243561, "learning_rate": 9.99978474282475e-06, "loss": -0.0269, "step": 718, "step_time": 2.0246247289906023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 215.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.46875, "completions/mean_terminated_length": 118.45160675048828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9693686552345753, "epoch": 0.00719, "frac_reward_zero_std": 0.25, "grad_norm": 0.0259455144405365, "kl": 0.6646481975913048, "learning_rate": 9.999784111115302e-06, "loss": -0.0076, "num_tokens": 6810319.0, "reward": 0.874302864074707, "reward_std": 0.0702347457408905, "rewards/rollout_reward_func/mean": 0.874302864074707, "rewards/rollout_reward_func/std": 0.2088913917541504, "sampling/importance_sampling_ratio/max": 1.0071916580200195, "sampling/importance_sampling_ratio/mean": 0.8892971277236938, "sampling/importance_sampling_ratio/min": 7.67287204433842e-19, "sampling/sampling_logp_difference/max": 4.19041109085083, "sampling/sampling_logp_difference/mean": 0.262186735868454, "step": 719, "step_time": 4.330535291002889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.972534816712141, "epoch": 0.0072, "grad_norm": 0.026830103248357773, "kl": 0.6635816656053066, "learning_rate": 9.99978347848031e-06, "loss": -0.0076, "step": 720, "step_time": 2.013648394982738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4279369842261076, "epoch": 0.00721, "frac_reward_zero_std": 0.75, "grad_norm": 0.01170934084802866, "kl": 0.4766099564731121, "learning_rate": 9.99978284491977e-06, "loss": -0.0179, "num_tokens": 6827073.0, "reward": 0.7697115540504456, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.7697115540504456, "rewards/rollout_reward_func/std": 0.27588874101638794, "sampling/importance_sampling_ratio/max": 1.0264509916305542, "sampling/importance_sampling_ratio/mean": 0.958979070186615, "sampling/importance_sampling_ratio/min": 4.322704626247287e-06, "sampling/sampling_logp_difference/max": 2.437976837158203, "sampling/sampling_logp_difference/mean": 0.08237168937921524, "step": 721, "step_time": 4.183217652978783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4291055426001549, "epoch": 0.00722, "grad_norm": 0.0110014071688056, "kl": 0.4715690053999424, "learning_rate": 9.999782210433683e-06, "loss": -0.0179, "step": 722, "step_time": 2.4434731140063377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.17696860246360302, "epoch": 0.00723, "frac_reward_zero_std": 0.75, "grad_norm": 0.009293708950281143, "kl": 0.49224264174699783, "learning_rate": 9.999781575022053e-06, "loss": -0.0361, "num_tokens": 6846297.0, "reward": 0.9059480428695679, "reward_std": 0.012657222338020802, "rewards/rollout_reward_func/mean": 0.9059480428695679, "rewards/rollout_reward_func/std": 0.47063547372817993, "sampling/importance_sampling_ratio/max": 1.0046517848968506, "sampling/importance_sampling_ratio/mean": 0.9634565114974976, "sampling/importance_sampling_ratio/min": 0.0026992636267095804, "sampling/sampling_logp_difference/max": 2.6712393760681152, "sampling/sampling_logp_difference/mean": 0.02830291911959648, "step": 723, "step_time": 4.7718672750052065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1774310115724802, "epoch": 0.00724, "grad_norm": 0.009662305936217308, "kl": 0.494182251393795, "learning_rate": 9.999780938684877e-06, "loss": -0.0361, "step": 724, "step_time": 2.079598968011851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4889486012980342, "epoch": 0.00725, "frac_reward_zero_std": 0.25, "grad_norm": 0.11272431164979935, "kl": 0.5636697821319103, "learning_rate": 9.999780301422157e-06, "loss": -0.0913, "num_tokens": 6866773.0, "reward": 1.1047461032867432, "reward_std": 0.07223650068044662, "rewards/rollout_reward_func/mean": 1.1047461032867432, "rewards/rollout_reward_func/std": 0.3491557836532593, "sampling/importance_sampling_ratio/max": 1.0012515783309937, "sampling/importance_sampling_ratio/mean": 0.8825978636741638, "sampling/importance_sampling_ratio/min": 0.00012328299635555595, "sampling/sampling_logp_difference/max": 2.474663019180298, "sampling/sampling_logp_difference/mean": 0.08294366300106049, "step": 725, "step_time": 4.746912971000711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4904574938118458, "epoch": 0.00726, "grad_norm": 0.11257512867450714, "kl": 0.5638933330774307, "learning_rate": 9.99977966323389e-06, "loss": -0.0913, "step": 726, "step_time": 2.0847660309955245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 172.90625, "completions/mean_terminated_length": 172.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6067733913660049, "epoch": 0.00727, "frac_reward_zero_std": 0.5, "grad_norm": 0.050919514149427414, "kl": 0.5907211527228355, "learning_rate": 9.99977902412008e-06, "loss": -0.0348, "num_tokens": 6884946.0, "reward": 0.6525673270225525, "reward_std": 0.06051202490925789, "rewards/rollout_reward_func/mean": 0.6525673270225525, "rewards/rollout_reward_func/std": 0.10673075914382935, "sampling/importance_sampling_ratio/max": 1.034452199935913, "sampling/importance_sampling_ratio/mean": 0.9358268976211548, "sampling/importance_sampling_ratio/min": 1.2196161937064107e-22, "sampling/sampling_logp_difference/max": 4.240099906921387, "sampling/sampling_logp_difference/mean": 0.32629159092903137, "step": 727, "step_time": 5.041013660003955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6055658739060163, "epoch": 0.00728, "grad_norm": 0.04514826834201813, "kl": 0.5991804525256157, "learning_rate": 9.999778384080722e-06, "loss": -0.0349, "step": 728, "step_time": 2.4942906940050307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5773423267528415, "epoch": 0.00729, "frac_reward_zero_std": 0.75, "grad_norm": 0.009190932847559452, "kl": 0.5948562324047089, "learning_rate": 9.999777743115822e-06, "loss": -0.0224, "num_tokens": 6901530.0, "reward": 0.6802884936332703, "reward_std": 0.011976571753621101, "rewards/rollout_reward_func/mean": 0.6802884936332703, "rewards/rollout_reward_func/std": 0.11902347207069397, "sampling/importance_sampling_ratio/max": 1.0209752321243286, "sampling/importance_sampling_ratio/mean": 0.9343789219856262, "sampling/importance_sampling_ratio/min": 0.00018503968021832407, "sampling/sampling_logp_difference/max": 1.9706449508666992, "sampling/sampling_logp_difference/mean": 0.0960070788860321, "step": 729, "step_time": 4.0187037409996265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5715496111661196, "epoch": 0.0073, "grad_norm": 0.008195084519684315, "kl": 0.5885727256536484, "learning_rate": 9.999777101225378e-06, "loss": -0.0224, "step": 730, "step_time": 2.0374653990002116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07475058129057288, "epoch": 0.00731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007466258248314261, "kl": 0.485619954764843, "learning_rate": 9.999776458409387e-06, "loss": 0.0017, "num_tokens": 6919138.0, "reward": 0.7317308187484741, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7317308187484741, "rewards/rollout_reward_func/std": 0.3408556878566742, "sampling/importance_sampling_ratio/max": 0.9982012510299683, "sampling/importance_sampling_ratio/mean": 0.9914509057998657, "sampling/importance_sampling_ratio/min": 0.977492094039917, "sampling/sampling_logp_difference/max": 0.010949943214654922, "sampling/sampling_logp_difference/mean": 0.0019267038442194462, "step": 731, "step_time": 4.293066339989309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07222120929509401, "epoch": 0.00732, "grad_norm": 0.000728268176317215, "kl": 0.4859689176082611, "learning_rate": 9.999775814667854e-06, "loss": 0.0017, "step": 732, "step_time": 2.0390290310024284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 165.40625, "completions/mean_terminated_length": 165.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5869998103007674, "epoch": 0.00733, "frac_reward_zero_std": 0.75, "grad_norm": 0.00821780413389206, "kl": 0.5699105970561504, "learning_rate": 9.999775170000777e-06, "loss": -0.0366, "num_tokens": 6936503.0, "reward": 1.0000962018966675, "reward_std": 0.06130071356892586, "rewards/rollout_reward_func/mean": 1.0000962018966675, "rewards/rollout_reward_func/std": 0.4266221523284912, "sampling/importance_sampling_ratio/max": 1.0010641813278198, "sampling/importance_sampling_ratio/mean": 0.958638072013855, "sampling/importance_sampling_ratio/min": 1.0299240347181953e-20, "sampling/sampling_logp_difference/max": 3.4266788959503174, "sampling/sampling_logp_difference/mean": 0.21552802622318268, "step": 733, "step_time": 5.311707906992524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5823481604456902, "epoch": 0.00734, "grad_norm": 0.009186931885778904, "kl": 0.575159378349781, "learning_rate": 9.999774524408155e-06, "loss": -0.0366, "step": 734, "step_time": 2.0508853219871526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5617186743766069, "epoch": 0.00735, "frac_reward_zero_std": 0.5, "grad_norm": 1.7776962518692017, "kl": 0.6726420074701309, "learning_rate": 9.99977387788999e-06, "loss": 0.0001, "num_tokens": 6953775.0, "reward": 0.5515096187591553, "reward_std": 0.33099910616874695, "rewards/rollout_reward_func/mean": 0.5515096187591553, "rewards/rollout_reward_func/std": 0.9648192524909973, "sampling/importance_sampling_ratio/max": 1.0366647243499756, "sampling/importance_sampling_ratio/mean": 0.9204356670379639, "sampling/importance_sampling_ratio/min": 7.096189074218273e-05, "sampling/sampling_logp_difference/max": 2.8557896614074707, "sampling/sampling_logp_difference/mean": 0.11471323668956757, "step": 735, "step_time": 4.35974089299998 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "entropy": 0.5349251851439476, "epoch": 0.00736, "grad_norm": 0.0536775216460228, "kl": 0.6752902790904045, "learning_rate": 9.99977323044628e-06, "loss": -0.0028, "step": 736, "step_time": 2.0438164419974783 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7340248199179769, "epoch": 0.00737, "frac_reward_zero_std": 0.25, "grad_norm": 0.2741067707538605, "kl": 0.5825763307511806, "learning_rate": 9.99977258207703e-06, "loss": -0.0565, "num_tokens": 6971759.0, "reward": 0.6414614915847778, "reward_std": 0.017949629575014114, "rewards/rollout_reward_func/mean": 0.6414614915847778, "rewards/rollout_reward_func/std": 0.16271758079528809, "sampling/importance_sampling_ratio/max": 1.1390868425369263, "sampling/importance_sampling_ratio/mean": 0.8076183795928955, "sampling/importance_sampling_ratio/min": 4.2165628627577334e-11, "sampling/sampling_logp_difference/max": 13.32427978515625, "sampling/sampling_logp_difference/mean": 0.20018447935581207, "step": 737, "step_time": 4.450267252010235 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 0.7415440632030368, "epoch": 0.00738, "grad_norm": 0.0626945048570633, "kl": 0.5837773494422436, "learning_rate": 9.999771932782234e-06, "loss": -0.0569, "step": 738, "step_time": 2.034026295012154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 99.90625, "completions/mean_terminated_length": 99.90625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.28665051609277725, "epoch": 0.00739, "frac_reward_zero_std": 0.75, "grad_norm": 0.03497757390141487, "kl": 0.5512785501778126, "learning_rate": 9.999771282561895e-06, "loss": 0.0206, "num_tokens": 6987820.0, "reward": 0.7654327154159546, "reward_std": 0.019717399030923843, "rewards/rollout_reward_func/mean": 0.7654327154159546, "rewards/rollout_reward_func/std": 0.22932249307632446, "sampling/importance_sampling_ratio/max": 1.0071368217468262, "sampling/importance_sampling_ratio/mean": 0.9684053659439087, "sampling/importance_sampling_ratio/min": 0.006307181902229786, "sampling/sampling_logp_difference/max": 2.1340854167938232, "sampling/sampling_logp_difference/mean": 0.041692301630973816, "step": 739, "step_time": 4.66307120599231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2848251797258854, "epoch": 0.0074, "grad_norm": 0.03628513589501381, "kl": 0.5400401651859283, "learning_rate": 9.999770631416015e-06, "loss": 0.0205, "step": 740, "step_time": 2.0031313129948103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.054978542029857635, "epoch": 0.00741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006039009895175695, "kl": 0.45833273231983185, "learning_rate": 9.99976997934459e-06, "loss": 0.0018, "num_tokens": 7007252.0, "reward": 0.8558076620101929, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8558076620101929, "rewards/rollout_reward_func/std": 0.3611586093902588, "sampling/importance_sampling_ratio/max": 1.0079792737960815, "sampling/importance_sampling_ratio/mean": 0.9962319135665894, "sampling/importance_sampling_ratio/min": 0.9900215268135071, "sampling/sampling_logp_difference/max": 0.012217916548252106, "sampling/sampling_logp_difference/mean": 0.0013565490953624249, "step": 741, "step_time": 4.563444632993196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0560819786041975, "epoch": 0.00742, "grad_norm": 0.0006600136985071003, "kl": 0.458060909062624, "learning_rate": 9.999769326347624e-06, "loss": 0.0018, "step": 742, "step_time": 2.0508200480107917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 103.4375, "completions/mean_terminated_length": 103.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28693859465420246, "epoch": 0.00743, "frac_reward_zero_std": 0.75, "grad_norm": 0.029919613152742386, "kl": 0.5755419284105301, "learning_rate": 9.999768672425116e-06, "loss": -0.0176, "num_tokens": 7023858.0, "reward": 0.5836538672447205, "reward_std": 0.024476774036884308, "rewards/rollout_reward_func/mean": 0.5836538672447205, "rewards/rollout_reward_func/std": 0.08978671580553055, "sampling/importance_sampling_ratio/max": 1.2240172624588013, "sampling/importance_sampling_ratio/mean": 0.9927273988723755, "sampling/importance_sampling_ratio/min": 0.020616702735424042, "sampling/sampling_logp_difference/max": 1.7804046869277954, "sampling/sampling_logp_difference/mean": 0.03947693854570389, "step": 743, "step_time": 3.771684233994165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28736876230686903, "epoch": 0.00744, "grad_norm": 0.036272082477808, "kl": 0.5694590881466866, "learning_rate": 9.999768017577065e-06, "loss": -0.0176, "step": 744, "step_time": 2.4566365919890814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26234476175159216, "epoch": 0.00745, "frac_reward_zero_std": 0.75, "grad_norm": 0.02291148155927658, "kl": 0.5606066361069679, "learning_rate": 9.999767361803471e-06, "loss": -0.0268, "num_tokens": 7041310.0, "reward": 0.7532211542129517, "reward_std": 0.022437043488025665, "rewards/rollout_reward_func/mean": 0.7532211542129517, "rewards/rollout_reward_func/std": 0.10955625772476196, "sampling/importance_sampling_ratio/max": 0.9999902248382568, "sampling/importance_sampling_ratio/mean": 0.9646879434585571, "sampling/importance_sampling_ratio/min": 0.0003464969340711832, "sampling/sampling_logp_difference/max": 2.1113226413726807, "sampling/sampling_logp_difference/mean": 0.05035601556301117, "step": 745, "step_time": 4.781833799002925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2644689613953233, "epoch": 0.00746, "grad_norm": 0.022780658677220345, "kl": 0.5611625537276268, "learning_rate": 9.999766705104336e-06, "loss": -0.0268, "step": 746, "step_time": 2.0424805359944003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.21293358178809285, "epoch": 0.00747, "frac_reward_zero_std": 0.75, "grad_norm": 0.004462528042495251, "kl": 0.4916588366031647, "learning_rate": 9.999766047479658e-06, "loss": -0.0172, "num_tokens": 7060342.0, "reward": 0.7149423360824585, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.7149423360824585, "rewards/rollout_reward_func/std": 0.4531472623348236, "sampling/importance_sampling_ratio/max": 1.0193058252334595, "sampling/importance_sampling_ratio/mean": 0.9683248996734619, "sampling/importance_sampling_ratio/min": 0.007899188436567783, "sampling/sampling_logp_difference/max": 2.453786849975586, "sampling/sampling_logp_difference/mean": 0.03206000104546547, "step": 747, "step_time": 4.596289928012993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21244566049426794, "epoch": 0.00748, "grad_norm": 0.0054801832884550095, "kl": 0.49712103977799416, "learning_rate": 9.99976538892944e-06, "loss": -0.0172, "step": 748, "step_time": 2.0489125729945954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 100.5625, "completions/mean_terminated_length": 100.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3195541501045227, "epoch": 0.00749, "frac_reward_zero_std": 0.75, "grad_norm": 0.1544867753982544, "kl": 0.6581261456012726, "learning_rate": 9.99976472945368e-06, "loss": -0.0051, "num_tokens": 7076760.0, "reward": 0.6937211751937866, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.6937211751937866, "rewards/rollout_reward_func/std": 0.15953700244426727, "sampling/importance_sampling_ratio/max": 1.0044418573379517, "sampling/importance_sampling_ratio/mean": 0.9601712226867676, "sampling/importance_sampling_ratio/min": 0.1547730416059494, "sampling/sampling_logp_difference/max": 1.2686060667037964, "sampling/sampling_logp_difference/mean": 0.02377956733107567, "step": 749, "step_time": 4.137631175995921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31854370795190334, "epoch": 0.0075, "grad_norm": 0.14960405230522156, "kl": 0.6608842387795448, "learning_rate": 9.999764069052378e-06, "loss": -0.0052, "step": 750, "step_time": 2.5766212289890973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 199.96875, "completions/mean_terminated_length": 199.96875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5672802785411477, "epoch": 0.00751, "frac_reward_zero_std": 0.5, "grad_norm": 0.03455379605293274, "kl": 0.5982917994260788, "learning_rate": 9.999763407725536e-06, "loss": -0.0534, "num_tokens": 7095615.0, "reward": 0.8322836756706238, "reward_std": 0.03879569470882416, "rewards/rollout_reward_func/mean": 0.8322836756706238, "rewards/rollout_reward_func/std": 0.27122369408607483, "sampling/importance_sampling_ratio/max": 1.0163878202438354, "sampling/importance_sampling_ratio/mean": 0.9129247665405273, "sampling/importance_sampling_ratio/min": 4.9991245759883896e-05, "sampling/sampling_logp_difference/max": 2.6399810314178467, "sampling/sampling_logp_difference/mean": 0.10879163444042206, "step": 751, "step_time": 5.052755678007088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.57264085393399, "epoch": 0.00752, "grad_norm": 0.034946076571941376, "kl": 0.6006041057407856, "learning_rate": 9.999762745473153e-06, "loss": -0.0534, "step": 752, "step_time": 2.0569953420053935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.04343463201075792, "epoch": 0.00753, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035495104384608567, "kl": 0.43901053071022034, "learning_rate": 9.999762082295227e-06, "loss": 0.0015, "num_tokens": 7112991.0, "reward": 0.7812691926956177, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7812691926956177, "rewards/rollout_reward_func/std": 0.17421554028987885, "sampling/importance_sampling_ratio/max": 1.0048598051071167, "sampling/importance_sampling_ratio/mean": 0.9972927570343018, "sampling/importance_sampling_ratio/min": 0.9931395053863525, "sampling/sampling_logp_difference/max": 0.007696516811847687, "sampling/sampling_logp_difference/mean": 0.001215996453538537, "step": 753, "step_time": 4.310060103998694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04268712783232331, "epoch": 0.00754, "grad_norm": 0.0003371075144968927, "kl": 0.439113050699234, "learning_rate": 9.999761418191762e-06, "loss": 0.0015, "step": 754, "step_time": 2.0385041479967185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 100.15625, "completions/mean_terminated_length": 100.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8017022218555212, "epoch": 0.00755, "frac_reward_zero_std": 0.5, "grad_norm": 0.012712127529084682, "kl": 0.5875293910503387, "learning_rate": 9.999760753162758e-06, "loss": -0.027, "num_tokens": 7128268.0, "reward": 0.6197596192359924, "reward_std": 0.0496334508061409, "rewards/rollout_reward_func/mean": 0.6197596192359924, "rewards/rollout_reward_func/std": 0.19126330316066742, "sampling/importance_sampling_ratio/max": 1.0131793022155762, "sampling/importance_sampling_ratio/mean": 0.9348534345626831, "sampling/importance_sampling_ratio/min": 6.414003421258972e-35, "sampling/sampling_logp_difference/max": 4.19273042678833, "sampling/sampling_logp_difference/mean": 0.41899538040161133, "step": 755, "step_time": 4.801250920994789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7995501891709864, "epoch": 0.00756, "grad_norm": 0.012901615351438522, "kl": 0.5826267749071121, "learning_rate": 9.999760087208213e-06, "loss": -0.027, "step": 756, "step_time": 2.040252878978208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05456479359418154, "epoch": 0.00757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006856332765892148, "kl": 0.45939650014042854, "learning_rate": 9.999759420328126e-06, "loss": 0.0017, "num_tokens": 7146868.0, "reward": 0.768384575843811, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.768384575843811, "rewards/rollout_reward_func/std": 0.3011271059513092, "sampling/importance_sampling_ratio/max": 1.0115104913711548, "sampling/importance_sampling_ratio/mean": 1.0002168416976929, "sampling/importance_sampling_ratio/min": 0.9937780499458313, "sampling/sampling_logp_difference/max": 0.013231560587882996, "sampling/sampling_logp_difference/mean": 0.0017702628392726183, "step": 757, "step_time": 4.862035622005351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05411397200077772, "epoch": 0.00758, "grad_norm": 0.000683603691868484, "kl": 0.4594443663954735, "learning_rate": 9.999758752522502e-06, "loss": 0.0017, "step": 758, "step_time": 2.047814219004067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 157.09375, "completions/mean_terminated_length": 157.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.36824065912514925, "epoch": 0.00759, "frac_reward_zero_std": 0.75, "grad_norm": 0.027482250705361366, "kl": 0.5807994864881039, "learning_rate": 9.999758083791337e-06, "loss": -0.0176, "num_tokens": 7164863.0, "reward": 0.7694711685180664, "reward_std": 0.01291829627007246, "rewards/rollout_reward_func/mean": 0.7694711685180664, "rewards/rollout_reward_func/std": 0.2964172661304474, "sampling/importance_sampling_ratio/max": 1.0145742893218994, "sampling/importance_sampling_ratio/mean": 0.9675426483154297, "sampling/importance_sampling_ratio/min": 9.573178431310225e-06, "sampling/sampling_logp_difference/max": 2.5662379264831543, "sampling/sampling_logp_difference/mean": 0.07186554372310638, "step": 759, "step_time": 4.355639190995134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.368709787260741, "epoch": 0.0076, "grad_norm": 0.02801736630499363, "kl": 0.5878086350858212, "learning_rate": 9.999757414134631e-06, "loss": -0.0176, "step": 760, "step_time": 2.0282940469987807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04758853930979967, "epoch": 0.00761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004345474299043417, "kl": 0.4496220052242279, "learning_rate": 9.999756743552387e-06, "loss": 0.0015, "num_tokens": 7182911.0, "reward": 0.8518460988998413, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8518460988998413, "rewards/rollout_reward_func/std": 0.3842533528804779, "sampling/importance_sampling_ratio/max": 1.0049532651901245, "sampling/importance_sampling_ratio/mean": 0.9981929063796997, "sampling/importance_sampling_ratio/min": 0.9940099716186523, "sampling/sampling_logp_difference/max": 0.006941515952348709, "sampling/sampling_logp_difference/mean": 0.001286619808524847, "step": 761, "step_time": 4.865439717999834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04734619287773967, "epoch": 0.00762, "grad_norm": 0.00043285751598887146, "kl": 0.449634350836277, "learning_rate": 9.999756072044602e-06, "loss": 0.0015, "step": 762, "step_time": 2.0223314159884467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0490763895213604, "epoch": 0.00763, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043949714745394886, "kl": 0.4279319755733013, "learning_rate": 9.99975539961128e-06, "loss": 0.0013, "num_tokens": 7200279.0, "reward": 0.8056923151016235, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8056923151016235, "rewards/rollout_reward_func/std": 0.3987840712070465, "sampling/importance_sampling_ratio/max": 1.0200368165969849, "sampling/importance_sampling_ratio/mean": 1.0017642974853516, "sampling/importance_sampling_ratio/min": 0.9958070516586304, "sampling/sampling_logp_difference/max": 0.021873783320188522, "sampling/sampling_logp_difference/mean": 0.0018717485945671797, "step": 763, "step_time": 4.930634053001995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04856975609436631, "epoch": 0.00764, "grad_norm": 0.0004414272552821785, "kl": 0.42799365893006325, "learning_rate": 9.999754726252418e-06, "loss": 0.0013, "step": 764, "step_time": 2.0298794010086567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24487532209604979, "epoch": 0.00765, "frac_reward_zero_std": 0.75, "grad_norm": 0.007829957641661167, "kl": 0.6733703799545765, "learning_rate": 9.999754051968017e-06, "loss": -0.0173, "num_tokens": 7216459.0, "reward": 0.6565384864807129, "reward_std": 0.027196412906050682, "rewards/rollout_reward_func/mean": 0.6565384864807129, "rewards/rollout_reward_func/std": 0.15023401379585266, "sampling/importance_sampling_ratio/max": 1.0042568445205688, "sampling/importance_sampling_ratio/mean": 0.9639676809310913, "sampling/importance_sampling_ratio/min": 0.011715593747794628, "sampling/sampling_logp_difference/max": 2.078385829925537, "sampling/sampling_logp_difference/mean": 0.03615724667906761, "step": 765, "step_time": 4.091983920996427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24364803452044725, "epoch": 0.00766, "grad_norm": 0.007264953572303057, "kl": 0.6795686483383179, "learning_rate": 9.999753376758078e-06, "loss": -0.0173, "step": 766, "step_time": 2.0342870999957086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 148.0, "completions/mean_terminated_length": 148.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05870850011706352, "epoch": 0.00767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006153348949737847, "kl": 0.5548103749752045, "learning_rate": 9.9997527006226e-06, "loss": 0.0017, "num_tokens": 7233139.0, "reward": 0.5426923036575317, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5426923036575317, "rewards/rollout_reward_func/std": 0.08030007779598236, "sampling/importance_sampling_ratio/max": 1.0152959823608398, "sampling/importance_sampling_ratio/mean": 1.0011366605758667, "sampling/importance_sampling_ratio/min": 0.9930604100227356, "sampling/sampling_logp_difference/max": 0.016860932111740112, "sampling/sampling_logp_difference/mean": 0.001953843981027603, "step": 767, "step_time": 4.75920058999327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05817727372050285, "epoch": 0.00768, "grad_norm": 0.0005948017933405936, "kl": 0.5549318566918373, "learning_rate": 9.999752023561584e-06, "loss": 0.0017, "step": 768, "step_time": 2.042669038011809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4648272795602679, "epoch": 0.00769, "frac_reward_zero_std": 0.5, "grad_norm": 0.00977549608796835, "kl": 0.4874034821987152, "learning_rate": 9.999751345575029e-06, "loss": -0.0557, "num_tokens": 7252285.0, "reward": 0.9209327101707458, "reward_std": 0.04065863788127899, "rewards/rollout_reward_func/mean": 0.9209327101707458, "rewards/rollout_reward_func/std": 0.3124628961086273, "sampling/importance_sampling_ratio/max": 1.013991355895996, "sampling/importance_sampling_ratio/mean": 0.9400625228881836, "sampling/importance_sampling_ratio/min": 4.82460054627154e-05, "sampling/sampling_logp_difference/max": 2.351267099380493, "sampling/sampling_logp_difference/mean": 0.0750003308057785, "step": 769, "step_time": 4.808589631007635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46696670539677143, "epoch": 0.0077, "grad_norm": 0.009389986284077168, "kl": 0.4856483042240143, "learning_rate": 9.999750666662938e-06, "loss": -0.0557, "step": 770, "step_time": 2.0332104500121204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.65625, "completions/mean_terminated_length": 165.65625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1879445225931704, "epoch": 0.00771, "frac_reward_zero_std": 0.75, "grad_norm": 0.012304186820983887, "kl": 0.5709483698010445, "learning_rate": 9.999749986825307e-06, "loss": -0.0268, "num_tokens": 7269578.0, "reward": 0.7719903588294983, "reward_std": 0.029508113861083984, "rewards/rollout_reward_func/mean": 0.7719903588294983, "rewards/rollout_reward_func/std": 0.12101143598556519, "sampling/importance_sampling_ratio/max": 1.0251671075820923, "sampling/importance_sampling_ratio/mean": 0.9733816981315613, "sampling/importance_sampling_ratio/min": 0.020669618621468544, "sampling/sampling_logp_difference/max": 1.5492758750915527, "sampling/sampling_logp_difference/mean": 0.021731378510594368, "step": 771, "step_time": 4.815962712003966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18693971913307905, "epoch": 0.00772, "grad_norm": 0.012925458140671253, "kl": 0.5702352747321129, "learning_rate": 9.999749306062141e-06, "loss": -0.0268, "step": 772, "step_time": 2.0471723830123665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5719374464824796, "epoch": 0.00773, "frac_reward_zero_std": 0.5, "grad_norm": 0.022446952760219574, "kl": 0.6804586909711361, "learning_rate": 9.999748624373435e-06, "loss": -0.0458, "num_tokens": 7286402.0, "reward": 0.414903849363327, "reward_std": 0.17976829409599304, "rewards/rollout_reward_func/mean": 0.414903849363327, "rewards/rollout_reward_func/std": 0.30804160237312317, "sampling/importance_sampling_ratio/max": 1.0144879817962646, "sampling/importance_sampling_ratio/mean": 0.9368901252746582, "sampling/importance_sampling_ratio/min": 5.378998557639301e-16, "sampling/sampling_logp_difference/max": 10.44909954071045, "sampling/sampling_logp_difference/mean": 0.27681392431259155, "step": 773, "step_time": 5.053478065005038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5717543261125684, "epoch": 0.00774, "grad_norm": 0.022067135199904442, "kl": 0.6777187772095203, "learning_rate": 9.999747941759192e-06, "loss": -0.0458, "step": 774, "step_time": 2.5093281699810177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.051330807618796825, "epoch": 0.00775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004471412976272404, "kl": 0.5315388701856136, "learning_rate": 9.999747258219414e-06, "loss": 0.0015, "num_tokens": 7302930.0, "reward": 0.7157691717147827, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7157691717147827, "rewards/rollout_reward_func/std": 0.22431230545043945, "sampling/importance_sampling_ratio/max": 1.0226118564605713, "sampling/importance_sampling_ratio/mean": 1.0033226013183594, "sampling/importance_sampling_ratio/min": 0.992998480796814, "sampling/sampling_logp_difference/max": 0.02071240544319153, "sampling/sampling_logp_difference/mean": 0.0023441642988473177, "step": 775, "step_time": 4.311424965999322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05141653819009662, "epoch": 0.00776, "grad_norm": 0.0004473782319109887, "kl": 0.5315160304307938, "learning_rate": 9.999746573754097e-06, "loss": 0.0015, "step": 776, "step_time": 2.0178816219995497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05881767626851797, "epoch": 0.00777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006189952837303281, "kl": 0.4899386204779148, "learning_rate": 9.999745888363244e-06, "loss": 0.0014, "num_tokens": 7319282.0, "reward": 0.6892307996749878, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6892307996749878, "rewards/rollout_reward_func/std": 0.15053102374076843, "sampling/importance_sampling_ratio/max": 1.0148959159851074, "sampling/importance_sampling_ratio/mean": 1.0015881061553955, "sampling/importance_sampling_ratio/min": 0.9938479661941528, "sampling/sampling_logp_difference/max": 0.017025385051965714, "sampling/sampling_logp_difference/mean": 0.0021869842894375324, "step": 777, "step_time": 4.0587571319993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.059163241647183895, "epoch": 0.00778, "grad_norm": 0.000620249891653657, "kl": 0.48986391723155975, "learning_rate": 9.999745202046853e-06, "loss": 0.0014, "step": 778, "step_time": 2.0222278390065185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 215.46875, "completions/mean_terminated_length": 215.46875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.35556449787691236, "epoch": 0.00779, "frac_reward_zero_std": 0.5, "grad_norm": 0.6677530407905579, "kl": 0.5636951923370361, "learning_rate": 9.999744514804925e-06, "loss": -0.0439, "num_tokens": 7338681.0, "reward": 0.8535961508750916, "reward_std": 0.025776660069823265, "rewards/rollout_reward_func/mean": 0.8535961508750916, "rewards/rollout_reward_func/std": 0.29374781250953674, "sampling/importance_sampling_ratio/max": 1.010684847831726, "sampling/importance_sampling_ratio/mean": 0.9306033253669739, "sampling/importance_sampling_ratio/min": 5.102228639586315e-10, "sampling/sampling_logp_difference/max": 11.065295219421387, "sampling/sampling_logp_difference/mean": 0.10360623896121979, "step": 779, "step_time": 5.1296225449987105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0416666679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.4641166105866432, "epoch": 0.0078, "grad_norm": 0.011275513097643852, "kl": 0.5497795268893242, "learning_rate": 9.999743826637464e-06, "loss": -0.0458, "step": 780, "step_time": 2.5241496540038497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.10878885257989168, "epoch": 0.00781, "frac_reward_zero_std": 0.75, "grad_norm": 0.014553384855389595, "kl": 0.611416082829237, "learning_rate": 9.999743137544465e-06, "loss": -0.0251, "num_tokens": 7357623.0, "reward": 0.8910096287727356, "reward_std": 0.04947028309106827, "rewards/rollout_reward_func/mean": 0.8910096287727356, "rewards/rollout_reward_func/std": 0.3039686679840088, "sampling/importance_sampling_ratio/max": 1.0157454013824463, "sampling/importance_sampling_ratio/mean": 0.9708251953125, "sampling/importance_sampling_ratio/min": 0.09234470129013062, "sampling/sampling_logp_difference/max": 2.2634174823760986, "sampling/sampling_logp_difference/mean": 0.017007431015372276, "step": 781, "step_time": 4.67210483198869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10879738675430417, "epoch": 0.00782, "grad_norm": 0.01670524664223194, "kl": 0.6184930168092251, "learning_rate": 9.999742447525931e-06, "loss": -0.0251, "step": 782, "step_time": 2.0288261329988018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 229.84375, "completions/mean_terminated_length": 229.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22837736504152417, "epoch": 0.00783, "frac_reward_zero_std": 0.75, "grad_norm": 0.008829329162836075, "kl": 0.5491015836596489, "learning_rate": 9.99974175658186e-06, "loss": -0.0362, "num_tokens": 7377154.0, "reward": 0.8033894300460815, "reward_std": 0.030228814110159874, "rewards/rollout_reward_func/mean": 0.8033894300460815, "rewards/rollout_reward_func/std": 0.29947328567504883, "sampling/importance_sampling_ratio/max": 1.0047838687896729, "sampling/importance_sampling_ratio/mean": 0.9661059975624084, "sampling/importance_sampling_ratio/min": 0.002019256819039583, "sampling/sampling_logp_difference/max": 2.486557960510254, "sampling/sampling_logp_difference/mean": 0.02999807521700859, "step": 783, "step_time": 4.53830670901516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22918316256254911, "epoch": 0.00784, "grad_norm": 0.0092212138697505, "kl": 0.5518978200852871, "learning_rate": 9.999741064712254e-06, "loss": -0.0362, "step": 784, "step_time": 2.04125149299216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11021666135638952, "epoch": 0.00785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007613211637362838, "kl": 0.8062503337860107, "learning_rate": 9.999740371917113e-06, "loss": 0.0014, "num_tokens": 7390146.0, "reward": 0.7699999809265137, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7699999809265137, "rewards/rollout_reward_func/std": 0.014890315942466259, "sampling/importance_sampling_ratio/max": 1.0177638530731201, "sampling/importance_sampling_ratio/mean": 0.996843695640564, "sampling/importance_sampling_ratio/min": 0.9772284030914307, "sampling/sampling_logp_difference/max": 0.01955880969762802, "sampling/sampling_logp_difference/mean": 0.004476648755371571, "step": 785, "step_time": 4.057385994987271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11066481471061707, "epoch": 0.00786, "grad_norm": 0.000766492506954819, "kl": 0.8061844184994698, "learning_rate": 9.999739678196437e-06, "loss": 0.0014, "step": 786, "step_time": 2.4806068589969072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.10221279924735427, "epoch": 0.00787, "frac_reward_zero_std": 1.0, "grad_norm": 0.046800658106803894, "kl": 0.6102414391934872, "learning_rate": 9.999738983550224e-06, "loss": 0.0024, "num_tokens": 7409570.0, "reward": 0.5824230909347534, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5824230909347534, "rewards/rollout_reward_func/std": 0.2462160885334015, "sampling/importance_sampling_ratio/max": 1.0169163942337036, "sampling/importance_sampling_ratio/mean": 0.9759256839752197, "sampling/importance_sampling_ratio/min": 0.11894861608743668, "sampling/sampling_logp_difference/max": 1.5831732749938965, "sampling/sampling_logp_difference/mean": 0.013456527143716812, "step": 787, "step_time": 4.564679447983508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10276684165000916, "epoch": 0.00788, "grad_norm": 0.046949613839387894, "kl": 0.6092802733182907, "learning_rate": 9.999738287978477e-06, "loss": 0.0024, "step": 788, "step_time": 2.0406384209927637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09348419960588217, "epoch": 0.00789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005735917948186398, "kl": 0.6417467221617699, "learning_rate": 9.999737591481196e-06, "loss": 0.0015, "num_tokens": 7426170.0, "reward": 0.9267692565917969, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9267692565917969, "rewards/rollout_reward_func/std": 0.24584540724754333, "sampling/importance_sampling_ratio/max": 1.0024712085723877, "sampling/importance_sampling_ratio/mean": 0.991368293762207, "sampling/importance_sampling_ratio/min": 0.9740568399429321, "sampling/sampling_logp_difference/max": 0.018510516732931137, "sampling/sampling_logp_difference/mean": 0.0025766738690435886, "step": 789, "step_time": 4.190127214991662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09430988505482674, "epoch": 0.0079, "grad_norm": 0.000590045761782676, "kl": 0.6415953561663628, "learning_rate": 9.999736894058379e-06, "loss": 0.0015, "step": 790, "step_time": 2.5184516989975236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 141.90625, "completions/mean_terminated_length": 141.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.35339345782995224, "epoch": 0.00791, "frac_reward_zero_std": 0.75, "grad_norm": 0.01988901197910309, "kl": 0.5957599729299545, "learning_rate": 9.999736195710027e-06, "loss": -0.0269, "num_tokens": 7442783.0, "reward": 0.8339422941207886, "reward_std": 0.02474873699247837, "rewards/rollout_reward_func/mean": 0.8339422941207886, "rewards/rollout_reward_func/std": 0.2517957389354706, "sampling/importance_sampling_ratio/max": 1.0020105838775635, "sampling/importance_sampling_ratio/mean": 0.9652345180511475, "sampling/importance_sampling_ratio/min": 1.3020794540352654e-06, "sampling/sampling_logp_difference/max": 2.622288465499878, "sampling/sampling_logp_difference/mean": 0.08395814895629883, "step": 791, "step_time": 4.595268340992334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35384215600788593, "epoch": 0.00792, "grad_norm": 0.020312245935201645, "kl": 0.6018579490482807, "learning_rate": 9.999735496436145e-06, "loss": -0.0268, "step": 792, "step_time": 2.025317924009869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 137.15625, "completions/mean_terminated_length": 137.15625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5926713012158871, "epoch": 0.00793, "frac_reward_zero_std": 0.25, "grad_norm": 0.03201569244265556, "kl": 0.7807380929589272, "learning_rate": 9.999734796236725e-06, "loss": -0.0739, "num_tokens": 7459700.0, "reward": 0.6198701858520508, "reward_std": 0.23099273443222046, "rewards/rollout_reward_func/mean": 0.6198701858520508, "rewards/rollout_reward_func/std": 0.43276143074035645, "sampling/importance_sampling_ratio/max": 0.9988707304000854, "sampling/importance_sampling_ratio/mean": 0.9022443294525146, "sampling/importance_sampling_ratio/min": 0.004041644278913736, "sampling/sampling_logp_difference/max": 2.2533676624298096, "sampling/sampling_logp_difference/mean": 0.09754366427659988, "step": 793, "step_time": 4.1844290619919775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5897656409069896, "epoch": 0.00794, "grad_norm": 0.03088696300983429, "kl": 0.7676459327340126, "learning_rate": 9.999734095111773e-06, "loss": -0.0739, "step": 794, "step_time": 2.041995482002676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.43256147112697363, "epoch": 0.00795, "frac_reward_zero_std": 0.5, "grad_norm": 0.00919558946043253, "kl": 0.5989341214299202, "learning_rate": 9.999733393061286e-06, "loss": -0.046, "num_tokens": 7477266.0, "reward": 1.070115327835083, "reward_std": 0.02447676658630371, "rewards/rollout_reward_func/mean": 1.070115327835083, "rewards/rollout_reward_func/std": 0.21785655617713928, "sampling/importance_sampling_ratio/max": 1.0023387670516968, "sampling/importance_sampling_ratio/mean": 0.9329115748405457, "sampling/importance_sampling_ratio/min": 0.0016810307279229164, "sampling/sampling_logp_difference/max": 2.204718828201294, "sampling/sampling_logp_difference/mean": 0.08202825486660004, "step": 795, "step_time": 4.673832018008397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42861572187393904, "epoch": 0.00796, "grad_norm": 0.009553714655339718, "kl": 0.5968006961047649, "learning_rate": 9.999732690085267e-06, "loss": -0.046, "step": 796, "step_time": 2.5578719389959588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.10375636350363493, "epoch": 0.00797, "frac_reward_zero_std": 1.0, "grad_norm": 0.006747572682797909, "kl": 0.5389275811612606, "learning_rate": 9.999731986183711e-06, "loss": 0.0013, "num_tokens": 7493906.0, "reward": 0.680384635925293, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.680384635925293, "rewards/rollout_reward_func/std": 0.05489024519920349, "sampling/importance_sampling_ratio/max": 1.030920147895813, "sampling/importance_sampling_ratio/mean": 0.9804919958114624, "sampling/importance_sampling_ratio/min": 0.6439587473869324, "sampling/sampling_logp_difference/max": 0.43065810203552246, "sampling/sampling_logp_difference/mean": 0.01181541383266449, "step": 797, "step_time": 4.200659584006644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10275151394307613, "epoch": 0.00798, "grad_norm": 0.00613736966624856, "kl": 0.5406363755464554, "learning_rate": 9.999731281356627e-06, "loss": 0.0014, "step": 798, "step_time": 1.9952185899892356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3378728209063411, "epoch": 0.00799, "frac_reward_zero_std": 0.75, "grad_norm": 0.0052392566576600075, "kl": 0.5069085359573364, "learning_rate": 9.999730575604006e-06, "loss": -0.0271, "num_tokens": 7510926.0, "reward": 0.9304038286209106, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.9304038286209106, "rewards/rollout_reward_func/std": 0.3321099579334259, "sampling/importance_sampling_ratio/max": 1.0192210674285889, "sampling/importance_sampling_ratio/mean": 0.9695160388946533, "sampling/importance_sampling_ratio/min": 4.977924322746574e-11, "sampling/sampling_logp_difference/max": 15.090997695922852, "sampling/sampling_logp_difference/mean": 0.14820967614650726, "step": 799, "step_time": 4.179459694998513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33482864312827587, "epoch": 0.008, "grad_norm": 0.004760375712066889, "kl": 0.5010694228112698, "learning_rate": 9.999729868925855e-06, "loss": -0.0271, "step": 800, "step_time": 2.047696204012027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.3125, "completions/mean_terminated_length": 143.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3021934758871794, "epoch": 0.00801, "frac_reward_zero_std": 0.5, "grad_norm": 0.011867828667163849, "kl": 0.5898523554205894, "learning_rate": 9.99972916132217e-06, "loss": -0.0654, "num_tokens": 7527536.0, "reward": 0.9175129532814026, "reward_std": 0.02618064358830452, "rewards/rollout_reward_func/mean": 0.9175129532814026, "rewards/rollout_reward_func/std": 0.24119111895561218, "sampling/importance_sampling_ratio/max": 1.0025177001953125, "sampling/importance_sampling_ratio/mean": 0.9312081336975098, "sampling/importance_sampling_ratio/min": 3.60049316749933e-11, "sampling/sampling_logp_difference/max": 20.813961029052734, "sampling/sampling_logp_difference/mean": 0.18215039372444153, "step": 801, "step_time": 4.736911411993788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3008739911019802, "epoch": 0.00802, "grad_norm": 0.01182637456804514, "kl": 0.5890650600194931, "learning_rate": 9.999728452792951e-06, "loss": -0.0654, "step": 802, "step_time": 2.0543475930026034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 126.25, "completions/mean_terminated_length": 126.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05711535830050707, "epoch": 0.00803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005304087535478175, "kl": 0.48169204220175743, "learning_rate": 9.999727743338202e-06, "loss": 0.0013, "num_tokens": 7544336.0, "reward": 0.8478077054023743, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8478077054023743, "rewards/rollout_reward_func/std": 0.20396773517131805, "sampling/importance_sampling_ratio/max": 1.0055116415023804, "sampling/importance_sampling_ratio/mean": 0.998684287071228, "sampling/importance_sampling_ratio/min": 0.9932592511177063, "sampling/sampling_logp_difference/max": 0.008808311074972153, "sampling/sampling_logp_difference/mean": 0.0016439375467598438, "step": 803, "step_time": 4.516195882984903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.056781523395329714, "epoch": 0.00804, "grad_norm": 0.0005204658955335617, "kl": 0.4817669838666916, "learning_rate": 9.99972703295792e-06, "loss": 0.0013, "step": 804, "step_time": 1.9990140399968368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 146.40625, "completions/mean_terminated_length": 146.40625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8344134436920285, "epoch": 0.00805, "frac_reward_zero_std": 0.5, "grad_norm": 0.014671452343463898, "kl": 0.5739657133817673, "learning_rate": 9.999726321652106e-06, "loss": -0.0557, "num_tokens": 7561197.0, "reward": 0.5305706858634949, "reward_std": 0.1825600266456604, "rewards/rollout_reward_func/mean": 0.5305706858634949, "rewards/rollout_reward_func/std": 0.3608057498931885, "sampling/importance_sampling_ratio/max": 1.1155201196670532, "sampling/importance_sampling_ratio/mean": 0.9257422685623169, "sampling/importance_sampling_ratio/min": 1.9435421114167157e-16, "sampling/sampling_logp_difference/max": 8.577910423278809, "sampling/sampling_logp_difference/mean": 0.24976152181625366, "step": 805, "step_time": 4.452221844992891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8310700664296746, "epoch": 0.00806, "grad_norm": 0.014536059461534023, "kl": 0.5692535787820816, "learning_rate": 9.999725609420761e-06, "loss": -0.0557, "step": 806, "step_time": 2.038573315978283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06399986334145069, "epoch": 0.00807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006862773443572223, "kl": 0.4512898214161396, "learning_rate": 9.999724896263882e-06, "loss": 0.0015, "num_tokens": 7579197.0, "reward": 0.7228845953941345, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7228845953941345, "rewards/rollout_reward_func/std": 0.31601467728614807, "sampling/importance_sampling_ratio/max": 1.007335901260376, "sampling/importance_sampling_ratio/mean": 0.997682511806488, "sampling/importance_sampling_ratio/min": 0.9924694895744324, "sampling/sampling_logp_difference/max": 0.0056895483285188675, "sampling/sampling_logp_difference/mean": 0.0015635709278285503, "step": 807, "step_time": 4.767128647996287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06401184434071183, "epoch": 0.00808, "grad_norm": 0.0006862974842078984, "kl": 0.4513041637837887, "learning_rate": 9.999724182181473e-06, "loss": 0.0015, "step": 808, "step_time": 2.472894954000367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 203.0625, "completions/mean_terminated_length": 203.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.398679519072175, "epoch": 0.00809, "frac_reward_zero_std": 0.75, "grad_norm": 0.13856515288352966, "kl": 0.47756868600845337, "learning_rate": 9.999723467173534e-06, "loss": 0.0373, "num_tokens": 7598607.0, "reward": 0.970166802406311, "reward_std": 0.02790348045527935, "rewards/rollout_reward_func/mean": 0.970166802406311, "rewards/rollout_reward_func/std": 0.29064124822616577, "sampling/importance_sampling_ratio/max": 1.0287187099456787, "sampling/importance_sampling_ratio/mean": 0.937545895576477, "sampling/importance_sampling_ratio/min": 3.5327096270520997e-07, "sampling/sampling_logp_difference/max": 2.6118576526641846, "sampling/sampling_logp_difference/mean": 0.08673780411481857, "step": 809, "step_time": 4.807295792983496 }, { "clip_ratio/high_max": 0.03303571464493871, "clip_ratio/high_mean": 0.016517857322469354, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016517857322469354, "entropy": 0.3799394373781979, "epoch": 0.0081, "grad_norm": 0.06810833513736725, "kl": 0.45634011924266815, "learning_rate": 9.999722751240062e-06, "loss": 0.037, "step": 810, "step_time": 2.073843036989274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 103.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6004348555579782, "epoch": 0.00811, "frac_reward_zero_std": 0.5, "grad_norm": 0.35418832302093506, "kl": 0.6082273386418819, "learning_rate": 9.999722034381061e-06, "loss": 0.0177, "num_tokens": 7614379.0, "reward": 0.6220817565917969, "reward_std": 0.0725192278623581, "rewards/rollout_reward_func/mean": 0.6220817565917969, "rewards/rollout_reward_func/std": 0.22643230855464935, "sampling/importance_sampling_ratio/max": 1.0423656702041626, "sampling/importance_sampling_ratio/mean": 0.9376611709594727, "sampling/importance_sampling_ratio/min": 9.027812097883725e-07, "sampling/sampling_logp_difference/max": 3.2970714569091797, "sampling/sampling_logp_difference/mean": 0.14677780866622925, "step": 811, "step_time": 4.398228201003803 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "entropy": 0.544532704167068, "epoch": 0.00812, "grad_norm": 0.009845816530287266, "kl": 0.5929336212575436, "learning_rate": 9.999721316596529e-06, "loss": 0.0167, "step": 812, "step_time": 2.0384629259933718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 243.53125, "completions/mean_terminated_length": 243.53125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.39381208876147866, "epoch": 0.00813, "frac_reward_zero_std": 0.75, "grad_norm": 0.013720512390136719, "kl": 0.4041705057024956, "learning_rate": 9.999720597886464e-06, "loss": -0.0356, "num_tokens": 7634132.0, "reward": 0.9611202478408813, "reward_std": 0.013050204142928123, "rewards/rollout_reward_func/mean": 0.9611202478408813, "rewards/rollout_reward_func/std": 0.12367349117994308, "sampling/importance_sampling_ratio/max": 1.0085561275482178, "sampling/importance_sampling_ratio/mean": 0.9346498847007751, "sampling/importance_sampling_ratio/min": 0.0008740086341276765, "sampling/sampling_logp_difference/max": 2.347477436065674, "sampling/sampling_logp_difference/mean": 0.0498509556055069, "step": 813, "step_time": 5.4979007380097755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3919723625294864, "epoch": 0.00814, "grad_norm": 0.01319202221930027, "kl": 0.40446652099490166, "learning_rate": 9.99971987825087e-06, "loss": -0.0356, "step": 814, "step_time": 2.494769863980764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07977608172222972, "epoch": 0.00815, "frac_reward_zero_std": 1.0, "grad_norm": 0.002177849179133773, "kl": 0.4387410581111908, "learning_rate": 9.999719157689747e-06, "loss": 0.0018, "num_tokens": 7653012.0, "reward": 0.7786922454833984, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7786922454833984, "rewards/rollout_reward_func/std": 0.4269687533378601, "sampling/importance_sampling_ratio/max": 1.0771054029464722, "sampling/importance_sampling_ratio/mean": 0.9951076507568359, "sampling/importance_sampling_ratio/min": 0.9272874593734741, "sampling/sampling_logp_difference/max": 0.22389769554138184, "sampling/sampling_logp_difference/mean": 0.0038766602519899607, "step": 815, "step_time": 4.655102544005786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08031109534204006, "epoch": 0.00816, "grad_norm": 0.002325671259313822, "kl": 0.438099704682827, "learning_rate": 9.999718436203094e-06, "loss": 0.0018, "step": 816, "step_time": 2.0495052299957024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.31915694009512663, "epoch": 0.00817, "frac_reward_zero_std": 0.75, "grad_norm": 0.01505633071064949, "kl": 0.6214557737112045, "learning_rate": 9.999717713790909e-06, "loss": -0.0267, "num_tokens": 7669064.0, "reward": 0.5990384817123413, "reward_std": 0.05167318880558014, "rewards/rollout_reward_func/mean": 0.5990384817123413, "rewards/rollout_reward_func/std": 0.17135760188102722, "sampling/importance_sampling_ratio/max": 0.9997591972351074, "sampling/importance_sampling_ratio/mean": 0.9625767469406128, "sampling/importance_sampling_ratio/min": 0.007270278409123421, "sampling/sampling_logp_difference/max": 1.5970090627670288, "sampling/sampling_logp_difference/mean": 0.03333583101630211, "step": 817, "step_time": 4.075370295999164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3161790454760194, "epoch": 0.00818, "grad_norm": 0.01617618277668953, "kl": 0.6180068403482437, "learning_rate": 9.999716990453195e-06, "loss": -0.0267, "step": 818, "step_time": 2.0196151030031615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07291264738887548, "epoch": 0.00819, "frac_reward_zero_std": 1.0, "grad_norm": 0.000825963041279465, "kl": 0.5153171308338642, "learning_rate": 9.999716266189952e-06, "loss": 0.0017, "num_tokens": 7687184.0, "reward": 0.7093461751937866, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7093461751937866, "rewards/rollout_reward_func/std": 0.2459312528371811, "sampling/importance_sampling_ratio/max": 1.007265329360962, "sampling/importance_sampling_ratio/mean": 0.9959096908569336, "sampling/importance_sampling_ratio/min": 0.9878761768341064, "sampling/sampling_logp_difference/max": 0.010420192033052444, "sampling/sampling_logp_difference/mean": 0.0016092164441943169, "step": 819, "step_time": 4.753631780004071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07304707262665033, "epoch": 0.0082, "grad_norm": 0.0008472788613289595, "kl": 0.5152863748371601, "learning_rate": 9.99971554100118e-06, "loss": 0.0017, "step": 820, "step_time": 2.4695317200021236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06304021086543798, "epoch": 0.00821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005967738688923419, "kl": 0.5354128628969193, "learning_rate": 9.99971481488688e-06, "loss": 0.0017, "num_tokens": 7703928.0, "reward": 0.6611538529396057, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6611538529396057, "rewards/rollout_reward_func/std": 0.19303028285503387, "sampling/importance_sampling_ratio/max": 1.0023696422576904, "sampling/importance_sampling_ratio/mean": 0.9959160685539246, "sampling/importance_sampling_ratio/min": 0.9917483925819397, "sampling/sampling_logp_difference/max": 0.007461462169885635, "sampling/sampling_logp_difference/mean": 0.0016496042953804135, "step": 821, "step_time": 4.234062778996304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06345280446112156, "epoch": 0.00822, "grad_norm": 0.0006172290304675698, "kl": 0.5353375002741814, "learning_rate": 9.99971408784705e-06, "loss": 0.0017, "step": 822, "step_time": 2.0063092190102907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21034253062680364, "epoch": 0.00823, "frac_reward_zero_std": 0.75, "grad_norm": 0.03996925428509712, "kl": 0.5841947831213474, "learning_rate": 9.99971335988169e-06, "loss": -0.0159, "num_tokens": 7719884.0, "reward": 0.4361538290977478, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.4361538290977478, "rewards/rollout_reward_func/std": 0.20295055210590363, "sampling/importance_sampling_ratio/max": 1.0025768280029297, "sampling/importance_sampling_ratio/mean": 0.9644196033477783, "sampling/importance_sampling_ratio/min": 0.06288782507181168, "sampling/sampling_logp_difference/max": 1.4329864978790283, "sampling/sampling_logp_difference/mean": 0.018856529146432877, "step": 823, "step_time": 4.038370520996978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2118730405345559, "epoch": 0.00824, "grad_norm": 0.041412632912397385, "kl": 0.5849714130163193, "learning_rate": 9.999712630990802e-06, "loss": -0.0159, "step": 824, "step_time": 2.503914139007975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1271075652912259, "epoch": 0.00825, "frac_reward_zero_std": 1.0, "grad_norm": 0.024491839110851288, "kl": 0.5813149660825729, "learning_rate": 9.999711901174385e-06, "loss": 0.0017, "num_tokens": 7736844.0, "reward": 0.6401153802871704, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6401153802871704, "rewards/rollout_reward_func/std": 0.2035870999097824, "sampling/importance_sampling_ratio/max": 1.4843659400939941, "sampling/importance_sampling_ratio/mean": 1.042081356048584, "sampling/importance_sampling_ratio/min": 0.2911195456981659, "sampling/sampling_logp_difference/max": 0.9856550693511963, "sampling/sampling_logp_difference/mean": 0.016199931502342224, "step": 825, "step_time": 4.537331851002818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1234201192855835, "epoch": 0.00826, "grad_norm": 0.02268097922205925, "kl": 0.5756053179502487, "learning_rate": 9.999711170432441e-06, "loss": 0.0016, "step": 826, "step_time": 2.4833402339863824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2862175181508064, "epoch": 0.00827, "frac_reward_zero_std": 0.25, "grad_norm": 0.42351043224334717, "kl": 0.523941945284605, "learning_rate": 9.999710438764968e-06, "loss": 0.0503, "num_tokens": 7755246.0, "reward": 0.6911009550094604, "reward_std": 0.028558805584907532, "rewards/rollout_reward_func/mean": 0.6911009550094604, "rewards/rollout_reward_func/std": 0.2692355811595917, "sampling/importance_sampling_ratio/max": 1.0932867527008057, "sampling/importance_sampling_ratio/mean": 0.8916785717010498, "sampling/importance_sampling_ratio/min": 0.15327896177768707, "sampling/sampling_logp_difference/max": 1.5836430788040161, "sampling/sampling_logp_difference/mean": 0.04777591675519943, "step": 827, "step_time": 4.368316808002419 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "entropy": 0.2789023816585541, "epoch": 0.00828, "grad_norm": 0.1780337244272232, "kl": 0.5176284834742546, "learning_rate": 9.999709706171968e-06, "loss": 0.0493, "step": 828, "step_time": 2.038357855992217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 233.03125, "completions/mean_terminated_length": 233.03125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.7279171012341976, "epoch": 0.00829, "frac_reward_zero_std": 0.5, "grad_norm": 0.15698762238025665, "kl": 0.40712349116802216, "learning_rate": 9.999708972653441e-06, "loss": -0.0761, "num_tokens": 7774335.0, "reward": 0.8168076872825623, "reward_std": 0.16588762402534485, "rewards/rollout_reward_func/mean": 0.8168076872825623, "rewards/rollout_reward_func/std": 0.3133774697780609, "sampling/importance_sampling_ratio/max": 1.5030441284179688, "sampling/importance_sampling_ratio/mean": 0.8710284233093262, "sampling/importance_sampling_ratio/min": 1.579393028805498e-06, "sampling/sampling_logp_difference/max": 2.132969617843628, "sampling/sampling_logp_difference/mean": 0.13202516734600067, "step": 829, "step_time": 4.71555495299981 }, { "clip_ratio/high_max": 0.016741071827709675, "clip_ratio/high_mean": 0.008370535913854837, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 0.6964020915329456, "epoch": 0.0083, "grad_norm": 0.24483831226825714, "kl": 0.40215105935931206, "learning_rate": 9.999708238209385e-06, "loss": -0.0756, "step": 830, "step_time": 2.540330775991606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4374280031770468, "epoch": 0.00831, "frac_reward_zero_std": 0.75, "grad_norm": 0.010750457644462585, "kl": 0.4336305074393749, "learning_rate": 9.999707502839802e-06, "loss": -0.0366, "num_tokens": 7792905.0, "reward": 1.1343317031860352, "reward_std": 0.05334576964378357, "rewards/rollout_reward_func/mean": 1.1343317031860352, "rewards/rollout_reward_func/std": 0.14282841980457306, "sampling/importance_sampling_ratio/max": 1.0068049430847168, "sampling/importance_sampling_ratio/mean": 0.9658924341201782, "sampling/importance_sampling_ratio/min": 9.874291545700682e-18, "sampling/sampling_logp_difference/max": 3.7352089881896973, "sampling/sampling_logp_difference/mean": 0.16197898983955383, "step": 831, "step_time": 5.068648681990453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43821599753573537, "epoch": 0.00832, "grad_norm": 0.010771448723971844, "kl": 0.43078210949897766, "learning_rate": 9.999706766544692e-06, "loss": -0.0366, "step": 832, "step_time": 2.0310609599837335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.90625, "completions/mean_terminated_length": 189.90625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.1908945757895708, "epoch": 0.00833, "frac_reward_zero_std": 0.5, "grad_norm": 0.5185715556144714, "kl": 0.5429900512099266, "learning_rate": 9.999706029324055e-06, "loss": -0.0054, "num_tokens": 7811294.0, "reward": 0.9584518671035767, "reward_std": 0.021945461630821228, "rewards/rollout_reward_func/mean": 0.9584518671035767, "rewards/rollout_reward_func/std": 0.27331942319869995, "sampling/importance_sampling_ratio/max": 1.4641735553741455, "sampling/importance_sampling_ratio/mean": 1.0071958303451538, "sampling/importance_sampling_ratio/min": 0.05180308222770691, "sampling/sampling_logp_difference/max": 1.3724387884140015, "sampling/sampling_logp_difference/mean": 0.023149482905864716, "step": 833, "step_time": 4.402230084000621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18791814427822828, "epoch": 0.00834, "grad_norm": 0.20192214846611023, "kl": 0.5445484779775143, "learning_rate": 9.999705291177891e-06, "loss": -0.0074, "step": 834, "step_time": 2.0349787199957063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5170535482466221, "epoch": 0.00835, "frac_reward_zero_std": 0.25, "grad_norm": 0.14999519288539886, "kl": 0.9437958747148514, "learning_rate": 9.999704552106202e-06, "loss": -0.0117, "num_tokens": 7826982.0, "reward": 0.8088605999946594, "reward_std": 0.03920362889766693, "rewards/rollout_reward_func/mean": 0.8088605999946594, "rewards/rollout_reward_func/std": 0.17630718648433685, "sampling/importance_sampling_ratio/max": 0.9965561628341675, "sampling/importance_sampling_ratio/mean": 0.9111373424530029, "sampling/importance_sampling_ratio/min": 0.0037611545994877815, "sampling/sampling_logp_difference/max": 2.52626895904541, "sampling/sampling_logp_difference/mean": 0.10426772385835648, "step": 835, "step_time": 4.397121175992652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5126263527199626, "epoch": 0.00836, "grad_norm": 0.15263821184635162, "kl": 0.9277568683028221, "learning_rate": 9.999703812108984e-06, "loss": -0.0116, "step": 836, "step_time": 2.524585817991465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.09765372890979052, "epoch": 0.00837, "frac_reward_zero_std": 1.0, "grad_norm": 0.008900608867406845, "kl": 0.43232735991477966, "learning_rate": 9.999703071186241e-06, "loss": 0.0017, "num_tokens": 7846470.0, "reward": 0.6947307586669922, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6947307586669922, "rewards/rollout_reward_func/std": 0.45467543601989746, "sampling/importance_sampling_ratio/max": 1.0504142045974731, "sampling/importance_sampling_ratio/mean": 0.9990224242210388, "sampling/importance_sampling_ratio/min": 0.7119683623313904, "sampling/sampling_logp_difference/max": 0.33450639247894287, "sampling/sampling_logp_difference/mean": 0.005426753778010607, "step": 837, "step_time": 4.96898283000337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09743308555334806, "epoch": 0.00838, "grad_norm": 0.004934805445373058, "kl": 0.42716849967837334, "learning_rate": 9.999702329337973e-06, "loss": 0.0017, "step": 838, "step_time": 2.026235491000989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2094206102192402, "epoch": 0.00839, "frac_reward_zero_std": 0.5, "grad_norm": 0.48326680064201355, "kl": 0.5135747380554676, "learning_rate": 9.999701586564176e-06, "loss": -0.0639, "num_tokens": 7864112.0, "reward": 1.0055527687072754, "reward_std": 0.062073878943920135, "rewards/rollout_reward_func/mean": 1.0055527687072754, "rewards/rollout_reward_func/std": 0.17670129239559174, "sampling/importance_sampling_ratio/max": 1.007318377494812, "sampling/importance_sampling_ratio/mean": 0.8853681087493896, "sampling/importance_sampling_ratio/min": 0.20840494334697723, "sampling/sampling_logp_difference/max": 1.3416682481765747, "sampling/sampling_logp_difference/mean": 0.03703319653868675, "step": 839, "step_time": 4.410770606002188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02864583395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02864583395421505, "entropy": 0.2371637523174286, "epoch": 0.0084, "grad_norm": 0.10918887704610825, "kl": 0.5307427197694778, "learning_rate": 9.999700842864858e-06, "loss": -0.065, "step": 840, "step_time": 2.0401759430096718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 218.09375, "completions/mean_terminated_length": 218.09375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3095542429946363, "epoch": 0.00841, "frac_reward_zero_std": 0.5, "grad_norm": 0.19141927361488342, "kl": 0.3998360112309456, "learning_rate": 9.999700098240011e-06, "loss": -0.0539, "num_tokens": 7883603.0, "reward": 0.8787499666213989, "reward_std": 0.071091428399086, "rewards/rollout_reward_func/mean": 0.8787499666213989, "rewards/rollout_reward_func/std": 0.43576765060424805, "sampling/importance_sampling_ratio/max": 1.017917513847351, "sampling/importance_sampling_ratio/mean": 0.9419111013412476, "sampling/importance_sampling_ratio/min": 0.04743245616555214, "sampling/sampling_logp_difference/max": 1.3035551309585571, "sampling/sampling_logp_difference/mean": 0.029117155820131302, "step": 841, "step_time": 5.069530547007162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.3243110114708543, "epoch": 0.00842, "grad_norm": 0.14862771332263947, "kl": 0.41335613280534744, "learning_rate": 9.999699352689638e-06, "loss": -0.0543, "step": 842, "step_time": 2.537237352989905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3922344362363219, "epoch": 0.00843, "frac_reward_zero_std": 0.75, "grad_norm": 0.10146322101354599, "kl": 0.4188529774546623, "learning_rate": 9.999698606213743e-06, "loss": 0.0266, "num_tokens": 7902571.0, "reward": 0.6736442446708679, "reward_std": 0.007479018531739712, "rewards/rollout_reward_func/mean": 0.6736442446708679, "rewards/rollout_reward_func/std": 0.1201007068157196, "sampling/importance_sampling_ratio/max": 2.076525926589966, "sampling/importance_sampling_ratio/mean": 0.9487936496734619, "sampling/importance_sampling_ratio/min": 0.00021917687263339758, "sampling/sampling_logp_difference/max": 3.164938449859619, "sampling/sampling_logp_difference/mean": 0.10232064872980118, "step": 843, "step_time": 4.647603910991165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.05625000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05625000037252903, "entropy": 0.4978570407256484, "epoch": 0.00844, "grad_norm": 0.07742339372634888, "kl": 0.4441653713583946, "learning_rate": 9.999697858812321e-06, "loss": 0.0269, "step": 844, "step_time": 2.021431292996567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4107222887687385, "epoch": 0.00845, "frac_reward_zero_std": 0.75, "grad_norm": 0.07676831632852554, "kl": 0.704888429492712, "learning_rate": 9.999697110485375e-06, "loss": -0.0204, "num_tokens": 7919914.0, "reward": 0.596552848815918, "reward_std": 0.03878072649240494, "rewards/rollout_reward_func/mean": 0.596552848815918, "rewards/rollout_reward_func/std": 0.2901214361190796, "sampling/importance_sampling_ratio/max": 1.041690468788147, "sampling/importance_sampling_ratio/mean": 0.942440390586853, "sampling/importance_sampling_ratio/min": 0.00020074188068974763, "sampling/sampling_logp_difference/max": 2.0874452590942383, "sampling/sampling_logp_difference/mean": 0.07467003911733627, "step": 845, "step_time": 4.466686993000621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4200358148664236, "epoch": 0.00846, "grad_norm": 0.0692308098077774, "kl": 0.67469821870327, "learning_rate": 9.999696361232904e-06, "loss": -0.0205, "step": 846, "step_time": 2.030338116994244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.062426199205219746, "epoch": 0.00847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008741099736653268, "kl": 0.43201741948723793, "learning_rate": 9.999695611054908e-06, "loss": 0.0016, "num_tokens": 7938298.0, "reward": 0.6093461513519287, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6093461513519287, "rewards/rollout_reward_func/std": 0.42185720801353455, "sampling/importance_sampling_ratio/max": 1.0276827812194824, "sampling/importance_sampling_ratio/mean": 1.000791311264038, "sampling/importance_sampling_ratio/min": 0.9903615117073059, "sampling/sampling_logp_difference/max": 0.030064821243286133, "sampling/sampling_logp_difference/mean": 0.0021111811511218548, "step": 847, "step_time": 5.343300635016931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06222018878906965, "epoch": 0.00848, "grad_norm": 0.0008559246198274195, "kl": 0.4321186766028404, "learning_rate": 9.99969485995139e-06, "loss": 0.0016, "step": 848, "step_time": 2.5085237930034054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.060034702997654676, "epoch": 0.00849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009108443045988679, "kl": 0.4024211913347244, "learning_rate": 9.999694107922345e-06, "loss": 0.0016, "num_tokens": 7958018.0, "reward": 0.8042691946029663, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8042691946029663, "rewards/rollout_reward_func/std": 0.36758244037628174, "sampling/importance_sampling_ratio/max": 1.0246649980545044, "sampling/importance_sampling_ratio/mean": 0.9997186660766602, "sampling/importance_sampling_ratio/min": 0.9905945062637329, "sampling/sampling_logp_difference/max": 0.02670067548751831, "sampling/sampling_logp_difference/mean": 0.0017097238451242447, "step": 849, "step_time": 5.126252525005839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06029536761343479, "epoch": 0.0085, "grad_norm": 0.0008890130557119846, "kl": 0.4023999162018299, "learning_rate": 9.999693354967777e-06, "loss": 0.0016, "step": 850, "step_time": 2.0461789359978866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 195.09375, "completions/mean_terminated_length": 195.09375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2977218134328723, "epoch": 0.00851, "frac_reward_zero_std": 0.75, "grad_norm": 0.011460207402706146, "kl": 0.4531771019101143, "learning_rate": 9.999692601087686e-06, "loss": -0.0367, "num_tokens": 7976445.0, "reward": 0.8060336709022522, "reward_std": 0.039067648351192474, "rewards/rollout_reward_func/mean": 0.8060336709022522, "rewards/rollout_reward_func/std": 0.11458253115415573, "sampling/importance_sampling_ratio/max": 1.0083905458450317, "sampling/importance_sampling_ratio/mean": 0.9667558670043945, "sampling/importance_sampling_ratio/min": 0.0005217883153818548, "sampling/sampling_logp_difference/max": 1.9830296039581299, "sampling/sampling_logp_difference/mean": 0.0364798866212368, "step": 851, "step_time": 4.833851092000259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29992940416559577, "epoch": 0.00852, "grad_norm": 0.010422713123261929, "kl": 0.45107772946357727, "learning_rate": 9.999691846282073e-06, "loss": -0.0367, "step": 852, "step_time": 2.063120850994892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.06976718781515956, "epoch": 0.00853, "frac_reward_zero_std": 1.0, "grad_norm": 0.002058392623439431, "kl": 0.4484175629913807, "learning_rate": 9.999691090550936e-06, "loss": 0.0019, "num_tokens": 7996733.0, "reward": 0.6346538662910461, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6346538662910461, "rewards/rollout_reward_func/std": 0.3533623516559601, "sampling/importance_sampling_ratio/max": 1.0038076639175415, "sampling/importance_sampling_ratio/mean": 0.9910475015640259, "sampling/importance_sampling_ratio/min": 0.9551634192466736, "sampling/sampling_logp_difference/max": 0.027924805879592896, "sampling/sampling_logp_difference/mean": 0.0019698385149240494, "step": 853, "step_time": 5.1053773569947225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07008530618622899, "epoch": 0.00854, "grad_norm": 0.002252157311886549, "kl": 0.4482443071901798, "learning_rate": 9.999690333894273e-06, "loss": 0.0019, "step": 854, "step_time": 2.5106376330077183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 196.28125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6269969190470874, "epoch": 0.00855, "frac_reward_zero_std": 0.25, "grad_norm": 0.15534809231758118, "kl": 0.5050919316709042, "learning_rate": 9.99968957631209e-06, "loss": -0.0967, "num_tokens": 8014590.0, "reward": 0.8578269481658936, "reward_std": 0.19430366158485413, "rewards/rollout_reward_func/mean": 0.8578269481658936, "rewards/rollout_reward_func/std": 0.4133877754211426, "sampling/importance_sampling_ratio/max": 0.9967610239982605, "sampling/importance_sampling_ratio/mean": 0.8561202883720398, "sampling/importance_sampling_ratio/min": 0.005239482503384352, "sampling/sampling_logp_difference/max": 1.848440170288086, "sampling/sampling_logp_difference/mean": 0.08239279687404633, "step": 855, "step_time": 4.580313087011746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.64202372264117, "epoch": 0.00856, "grad_norm": 0.15179023146629333, "kl": 0.5157766677439213, "learning_rate": 9.999688817804385e-06, "loss": -0.0968, "step": 856, "step_time": 2.0642516629959573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1951238885521889, "epoch": 0.00857, "frac_reward_zero_std": 0.25, "grad_norm": 0.1954914629459381, "kl": 0.6712345592677593, "learning_rate": 9.999688058371155e-06, "loss": -0.0101, "num_tokens": 8030990.0, "reward": 0.6839257478713989, "reward_std": 0.10008642077445984, "rewards/rollout_reward_func/mean": 0.6839257478713989, "rewards/rollout_reward_func/std": 0.30957740545272827, "sampling/importance_sampling_ratio/max": 1.0216221809387207, "sampling/importance_sampling_ratio/mean": 0.8121217489242554, "sampling/importance_sampling_ratio/min": 2.0581428236789154e-18, "sampling/sampling_logp_difference/max": 4.5013909339904785, "sampling/sampling_logp_difference/mean": 0.3466518521308899, "step": 857, "step_time": 4.598360370997398 }, { "clip_ratio/high_max": 0.035416667349636555, "clip_ratio/high_mean": 0.017708333674818277, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017708333674818277, "entropy": 1.182961874641478, "epoch": 0.00858, "grad_norm": 0.21598146855831146, "kl": 0.633332334458828, "learning_rate": 9.999687298012404e-06, "loss": -0.0104, "step": 858, "step_time": 2.515754782005388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 228.5625, "completions/mean_terminated_length": 228.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.46156499488279223, "epoch": 0.00859, "frac_reward_zero_std": 0.75, "grad_norm": 0.48660898208618164, "kl": 0.41518739983439445, "learning_rate": 9.999686536728131e-06, "loss": 0.0269, "num_tokens": 8050920.0, "reward": 0.7251538038253784, "reward_std": 0.04423673823475838, "rewards/rollout_reward_func/mean": 0.7251538038253784, "rewards/rollout_reward_func/std": 0.4301164150238037, "sampling/importance_sampling_ratio/max": 1.0026768445968628, "sampling/importance_sampling_ratio/mean": 0.951617419719696, "sampling/importance_sampling_ratio/min": 3.5640931385876906e-20, "sampling/sampling_logp_difference/max": 4.40946102142334, "sampling/sampling_logp_difference/mean": 0.19510290026664734, "step": 859, "step_time": 4.63109739599895 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.4553321120329201, "epoch": 0.0086, "grad_norm": 0.12107274681329727, "kl": 0.4172791615128517, "learning_rate": 9.999685774518335e-06, "loss": 0.0263, "step": 860, "step_time": 2.550651259989536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.332757793366909, "epoch": 0.00861, "frac_reward_zero_std": 1.0, "grad_norm": 0.012628672644495964, "kl": 0.5752709247171879, "learning_rate": 9.999685011383017e-06, "loss": 0.0019, "num_tokens": 8067828.0, "reward": 0.680384635925293, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.680384635925293, "rewards/rollout_reward_func/std": 0.13700973987579346, "sampling/importance_sampling_ratio/max": 1.0587934255599976, "sampling/importance_sampling_ratio/mean": 0.9651198983192444, "sampling/importance_sampling_ratio/min": 0.00018714804900810122, "sampling/sampling_logp_difference/max": 2.306971549987793, "sampling/sampling_logp_difference/mean": 0.05581716075539589, "step": 861, "step_time": 4.126219878016855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3381122937425971, "epoch": 0.00862, "grad_norm": 0.013166899792850018, "kl": 0.5714859776198864, "learning_rate": 9.999684247322179e-06, "loss": 0.0019, "step": 862, "step_time": 2.0277720259982743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 184.03125, "completions/mean_terminated_length": 184.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 1.170385385863483, "epoch": 0.00863, "frac_reward_zero_std": 0.25, "grad_norm": 0.08813850581645966, "kl": 0.5128969214856625, "learning_rate": 9.999683482335817e-06, "loss": -0.0742, "num_tokens": 8085765.0, "reward": 0.6196202039718628, "reward_std": 0.13629484176635742, "rewards/rollout_reward_func/mean": 0.6196202039718628, "rewards/rollout_reward_func/std": 0.18488556146621704, "sampling/importance_sampling_ratio/max": 1.01554274559021, "sampling/importance_sampling_ratio/mean": 0.8967279195785522, "sampling/importance_sampling_ratio/min": 4.580702376565395e-19, "sampling/sampling_logp_difference/max": 3.6378140449523926, "sampling/sampling_logp_difference/mean": 0.420864999294281, "step": 863, "step_time": 4.775397009005246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 1.1714501045644283, "epoch": 0.00864, "grad_norm": 0.08940732479095459, "kl": 0.512485895305872, "learning_rate": 9.999682716423937e-06, "loss": -0.0742, "step": 864, "step_time": 2.6041018040123163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0096726194024086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0096726194024086, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 153.15625, "completions/mean_terminated_length": 153.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8196515152230859, "epoch": 0.00865, "frac_reward_zero_std": 0.5, "grad_norm": 0.7973894476890564, "kl": 0.4559711739420891, "learning_rate": 9.999681949586533e-06, "loss": -0.0292, "num_tokens": 8102762.0, "reward": 0.7228846549987793, "reward_std": 0.0602555014193058, "rewards/rollout_reward_func/mean": 0.7228846549987793, "rewards/rollout_reward_func/std": 0.3884298503398895, "sampling/importance_sampling_ratio/max": 2.553645610809326, "sampling/importance_sampling_ratio/mean": 0.844221830368042, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.52471923828125, "sampling/sampling_logp_difference/mean": 0.17617449164390564, "step": 865, "step_time": 4.387959703002707 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.1190476231276989, "clip_ratio/low_min": 0.008928571827709675, "clip_ratio/region_mean": 0.12529762322083116, "entropy": 1.2246729196049273, "epoch": 0.00866, "grad_norm": 0.06493120640516281, "kl": 0.4862568937242031, "learning_rate": 9.999681181823611e-06, "loss": -0.032, "step": 866, "step_time": 2.5177073320010095 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 104.5, "completions/mean_terminated_length": 104.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.582830186933279, "epoch": 0.00867, "frac_reward_zero_std": 0.25, "grad_norm": 0.43282073736190796, "kl": 0.7611788511276245, "learning_rate": 9.999680413135167e-06, "loss": -0.0524, "num_tokens": 8118674.0, "reward": 0.45519229769706726, "reward_std": 0.12871712446212769, "rewards/rollout_reward_func/mean": 0.45519229769706726, "rewards/rollout_reward_func/std": 0.20305012166500092, "sampling/importance_sampling_ratio/max": 1.0298216342926025, "sampling/importance_sampling_ratio/mean": 0.846673309803009, "sampling/importance_sampling_ratio/min": 4.23785513061154e-23, "sampling/sampling_logp_difference/max": 4.604095458984375, "sampling/sampling_logp_difference/mean": 0.705298662185669, "step": 867, "step_time": 4.189432368999405 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 1.5694252150133252, "epoch": 0.00868, "grad_norm": 0.024529218673706055, "kl": 0.7739523202180862, "learning_rate": 9.9996796435212e-06, "loss": -0.053, "step": 868, "step_time": 2.050108419993194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05856772465631366, "epoch": 0.00869, "frac_reward_zero_std": 1.0, "grad_norm": 0.001340551651082933, "kl": 0.404448214918375, "learning_rate": 9.999678872981717e-06, "loss": 0.0016, "num_tokens": 8137762.0, "reward": 1.2238845825195312, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.2238845825195312, "rewards/rollout_reward_func/std": 0.24685068428516388, "sampling/importance_sampling_ratio/max": 1.0323172807693481, "sampling/importance_sampling_ratio/mean": 0.9942002892494202, "sampling/importance_sampling_ratio/min": 0.9733961224555969, "sampling/sampling_logp_difference/max": 0.034033991396427155, "sampling/sampling_logp_difference/mean": 0.0018297327915206552, "step": 869, "step_time": 4.786346114997286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05642598029226065, "epoch": 0.0087, "grad_norm": 0.0010847196681424975, "kl": 0.4050228223204613, "learning_rate": 9.999678101516712e-06, "loss": 0.0016, "step": 870, "step_time": 2.538088665009127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 230.1875, "completions/mean_terminated_length": 230.1875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.42187750339508057, "epoch": 0.00871, "frac_reward_zero_std": 0.25, "grad_norm": 1.6562175750732422, "kl": 0.7404337599873543, "learning_rate": 9.999677329126187e-06, "loss": -0.0072, "num_tokens": 8157032.0, "reward": 0.6856250166893005, "reward_std": 0.10984031856060028, "rewards/rollout_reward_func/mean": 0.6856250166893005, "rewards/rollout_reward_func/std": 0.483832448720932, "sampling/importance_sampling_ratio/max": 1.157301425933838, "sampling/importance_sampling_ratio/mean": 0.8888599276542664, "sampling/importance_sampling_ratio/min": 0.03849909082055092, "sampling/sampling_logp_difference/max": 2.161421775817871, "sampling/sampling_logp_difference/mean": 0.08475802093744278, "step": 871, "step_time": 4.686474267007725 }, { "clip_ratio/high_max": 0.09166666865348816, "clip_ratio/high_mean": 0.04583333432674408, "clip_ratio/low_mean": 0.05000000074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09583333414047956, "entropy": 0.37520834896713495, "epoch": 0.00872, "grad_norm": 0.2622084319591522, "kl": 0.6398925930261612, "learning_rate": 9.999676555810143e-06, "loss": -0.0117, "step": 872, "step_time": 2.4990271420028876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.16030005039647222, "epoch": 0.00873, "frac_reward_zero_std": 0.75, "grad_norm": 0.006638623774051666, "kl": 0.4362022131681442, "learning_rate": 9.999675781568578e-06, "loss": -0.0365, "num_tokens": 8176000.0, "reward": 0.5110369920730591, "reward_std": 0.011861715465784073, "rewards/rollout_reward_func/mean": 0.5110369920730591, "rewards/rollout_reward_func/std": 0.34843355417251587, "sampling/importance_sampling_ratio/max": 1.0079761743545532, "sampling/importance_sampling_ratio/mean": 0.9664410352706909, "sampling/importance_sampling_ratio/min": 0.001799307414330542, "sampling/sampling_logp_difference/max": 2.6964938640594482, "sampling/sampling_logp_difference/mean": 0.02900315448641777, "step": 873, "step_time": 4.711744654006907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16107898112386465, "epoch": 0.00874, "grad_norm": 0.006678921170532703, "kl": 0.43477120250463486, "learning_rate": 9.999675006401496e-06, "loss": -0.0366, "step": 874, "step_time": 2.0718629940020037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22160129016265273, "epoch": 0.00875, "frac_reward_zero_std": 0.75, "grad_norm": 0.004114637617021799, "kl": 0.4370543844997883, "learning_rate": 9.999674230308893e-06, "loss": -0.027, "num_tokens": 8194432.0, "reward": 0.8858365416526794, "reward_std": 0.008838837966322899, "rewards/rollout_reward_func/mean": 0.8858365416526794, "rewards/rollout_reward_func/std": 0.348878413438797, "sampling/importance_sampling_ratio/max": 1.0183823108673096, "sampling/importance_sampling_ratio/mean": 0.9660475850105286, "sampling/importance_sampling_ratio/min": 0.0030747957061976194, "sampling/sampling_logp_difference/max": 1.8713443279266357, "sampling/sampling_logp_difference/mean": 0.030771557241678238, "step": 875, "step_time": 4.527018474989745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2167928470298648, "epoch": 0.00876, "grad_norm": 0.0041971332393586636, "kl": 0.43676699697971344, "learning_rate": 9.999673453290772e-06, "loss": -0.027, "step": 876, "step_time": 2.5142475410102634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 98.25, "completions/mean_terminated_length": 98.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2084312867373228, "epoch": 0.00877, "frac_reward_zero_std": 0.75, "grad_norm": 0.23350666463375092, "kl": 0.656673975288868, "learning_rate": 9.999672675347131e-06, "loss": -0.0147, "num_tokens": 8209328.0, "reward": 0.5824663639068604, "reward_std": 0.010429824702441692, "rewards/rollout_reward_func/mean": 0.5824663639068604, "rewards/rollout_reward_func/std": 0.21110938489437103, "sampling/importance_sampling_ratio/max": 1.0067096948623657, "sampling/importance_sampling_ratio/mean": 0.9650167226791382, "sampling/importance_sampling_ratio/min": 0.09870598465204239, "sampling/sampling_logp_difference/max": 1.1198182106018066, "sampling/sampling_logp_difference/mean": 0.017701109871268272, "step": 877, "step_time": 4.508169216001988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.2237971080467105, "epoch": 0.00878, "grad_norm": 0.08995082229375839, "kl": 0.6574929505586624, "learning_rate": 9.999671896477973e-06, "loss": -0.0151, "step": 878, "step_time": 2.052461043014773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 169.84375, "completions/mean_terminated_length": 169.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5502608269453049, "epoch": 0.00879, "frac_reward_zero_std": 0.5, "grad_norm": 0.005796638783067465, "kl": 0.6330269873142242, "learning_rate": 9.999671116683296e-06, "loss": -0.0561, "num_tokens": 8227219.0, "reward": 1.088649034500122, "reward_std": 0.06449456512928009, "rewards/rollout_reward_func/mean": 1.088649034500122, "rewards/rollout_reward_func/std": 0.266554594039917, "sampling/importance_sampling_ratio/max": 0.9998940229415894, "sampling/importance_sampling_ratio/mean": 0.9010967016220093, "sampling/importance_sampling_ratio/min": 0.0026087018195539713, "sampling/sampling_logp_difference/max": 2.193031072616577, "sampling/sampling_logp_difference/mean": 0.1078057512640953, "step": 879, "step_time": 4.620072943995183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5568311475217342, "epoch": 0.0088, "grad_norm": 0.006176486611366272, "kl": 0.639727272093296, "learning_rate": 9.9996703359631e-06, "loss": -0.0561, "step": 880, "step_time": 2.0631842099974165 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 233.90625, "completions/mean_terminated_length": 233.90625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6612868066877127, "epoch": 0.00881, "frac_reward_zero_std": 0.5, "grad_norm": 0.027765560895204544, "kl": 0.5388445183634758, "learning_rate": 9.999669554317389e-06, "loss": -0.0642, "num_tokens": 8246960.0, "reward": 0.8646634817123413, "reward_std": 0.09598741680383682, "rewards/rollout_reward_func/mean": 0.8646634817123413, "rewards/rollout_reward_func/std": 0.4295889437198639, "sampling/importance_sampling_ratio/max": 1.010137915611267, "sampling/importance_sampling_ratio/mean": 0.9026265740394592, "sampling/importance_sampling_ratio/min": 7.159837366000829e-14, "sampling/sampling_logp_difference/max": 14.604781150817871, "sampling/sampling_logp_difference/mean": 0.19937095046043396, "step": 881, "step_time": 5.166470825002762 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.6554968184791505, "epoch": 0.00882, "grad_norm": 0.02831316739320755, "kl": 0.5452810414135456, "learning_rate": 9.999668771746158e-06, "loss": -0.0642, "step": 882, "step_time": 2.069109042000491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 198.8064422607422, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5891160648316145, "epoch": 0.00883, "frac_reward_zero_std": 0.5, "grad_norm": 0.020760059356689453, "kl": 0.5527499876916409, "learning_rate": 9.99966798824941e-06, "loss": -0.0453, "num_tokens": 8265609.0, "reward": 0.902423083782196, "reward_std": 0.08322103321552277, "rewards/rollout_reward_func/mean": 0.902423083782196, "rewards/rollout_reward_func/std": 0.4123563766479492, "sampling/importance_sampling_ratio/max": 1.0117815732955933, "sampling/importance_sampling_ratio/mean": 0.9368067383766174, "sampling/importance_sampling_ratio/min": 1.8199004675141914e-22, "sampling/sampling_logp_difference/max": 3.875047206878662, "sampling/sampling_logp_difference/mean": 0.22751355171203613, "step": 883, "step_time": 5.1156123480031965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5899351444095373, "epoch": 0.00884, "grad_norm": 0.019945869222283363, "kl": 0.5563981607556343, "learning_rate": 9.999667203827144e-06, "loss": -0.0453, "step": 884, "step_time": 2.0715701839944813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5611791759729385, "epoch": 0.00885, "frac_reward_zero_std": 0.5, "grad_norm": 0.0071929749101400375, "kl": 0.590234886854887, "learning_rate": 9.999666418479359e-06, "loss": -0.0544, "num_tokens": 8283353.0, "reward": 0.7526105642318726, "reward_std": 0.026339732110500336, "rewards/rollout_reward_func/mean": 0.7526105642318726, "rewards/rollout_reward_func/std": 0.19195549190044403, "sampling/importance_sampling_ratio/max": 1.2346583604812622, "sampling/importance_sampling_ratio/mean": 0.9447344541549683, "sampling/importance_sampling_ratio/min": 1.7287807851827952e-23, "sampling/sampling_logp_difference/max": 9.341339111328125, "sampling/sampling_logp_difference/mean": 0.2969784140586853, "step": 885, "step_time": 4.2536566559938365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5599749605171382, "epoch": 0.00886, "grad_norm": 0.007043230812996626, "kl": 0.589926689863205, "learning_rate": 9.999665632206059e-06, "loss": -0.0544, "step": 886, "step_time": 2.0528713389867335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 103.0, "completions/mean_terminated_length": 103.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06637815339490771, "epoch": 0.00887, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039641911280341446, "kl": 0.6564351096749306, "learning_rate": 9.999664845007243e-06, "loss": 0.0015, "num_tokens": 8299665.0, "reward": 0.8889614939689636, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8889614939689636, "rewards/rollout_reward_func/std": 0.22062861919403076, "sampling/importance_sampling_ratio/max": 1.0117642879486084, "sampling/importance_sampling_ratio/mean": 0.9975931644439697, "sampling/importance_sampling_ratio/min": 0.9838861227035522, "sampling/sampling_logp_difference/max": 0.01525915414094925, "sampling/sampling_logp_difference/mean": 0.0023691554088145494, "step": 887, "step_time": 4.511265992005065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06580934440717101, "epoch": 0.00888, "grad_norm": 0.0004117914941161871, "kl": 0.656411737203598, "learning_rate": 9.99966405688291e-06, "loss": 0.0015, "step": 888, "step_time": 2.047980836010538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 279.90625, "completions/mean_terminated_length": 279.90625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.18326288182288408, "epoch": 0.00889, "frac_reward_zero_std": 0.5, "grad_norm": 0.33526933193206787, "kl": 0.4399650767445564, "learning_rate": 9.99966326783306e-06, "loss": -0.0473, "num_tokens": 8320334.0, "reward": 0.8705913424491882, "reward_std": 0.19965305924415588, "rewards/rollout_reward_func/mean": 0.8705913424491882, "rewards/rollout_reward_func/std": 0.5582543611526489, "sampling/importance_sampling_ratio/max": 1.2399704456329346, "sampling/importance_sampling_ratio/mean": 1.0040632486343384, "sampling/importance_sampling_ratio/min": 0.002444114303216338, "sampling/sampling_logp_difference/max": 2.0181212425231934, "sampling/sampling_logp_difference/mean": 0.029528848826885223, "step": 889, "step_time": 5.182157053990522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18182839266955853, "epoch": 0.0089, "grad_norm": 0.48364195227622986, "kl": 0.4387183152139187, "learning_rate": 9.999662477857692e-06, "loss": -0.0487, "step": 890, "step_time": 2.0363003490056144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.30706761218607426, "epoch": 0.00891, "frac_reward_zero_std": 0.5, "grad_norm": 0.8235399723052979, "kl": 0.4913969039916992, "learning_rate": 9.99966168695681e-06, "loss": -0.0295, "num_tokens": 8337880.0, "reward": 0.6257355809211731, "reward_std": 0.020846042782068253, "rewards/rollout_reward_func/mean": 0.6257355809211731, "rewards/rollout_reward_func/std": 0.18678408861160278, "sampling/importance_sampling_ratio/max": 1.081130027770996, "sampling/importance_sampling_ratio/mean": 0.9882831573486328, "sampling/importance_sampling_ratio/min": 0.0009443601011298597, "sampling/sampling_logp_difference/max": 1.9564019441604614, "sampling/sampling_logp_difference/mean": 0.04941113665699959, "step": 891, "step_time": 4.371147304009355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30769283743575215, "epoch": 0.00892, "grad_norm": 1.0574486255645752, "kl": 0.4936494790017605, "learning_rate": 9.999660895130413e-06, "loss": -0.0284, "step": 892, "step_time": 2.041173052006343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 99.21875, "completions/mean_terminated_length": 99.21875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8776137297973037, "epoch": 0.00893, "frac_reward_zero_std": 0.25, "grad_norm": 0.08532270789146423, "kl": 0.8691011071205139, "learning_rate": 9.9996601023785e-06, "loss": 0.0602, "num_tokens": 8353367.0, "reward": 0.5540865659713745, "reward_std": 0.022437050938606262, "rewards/rollout_reward_func/mean": 0.5540865659713745, "rewards/rollout_reward_func/std": 0.1255563497543335, "sampling/importance_sampling_ratio/max": 1.011734127998352, "sampling/importance_sampling_ratio/mean": 0.9058141708374023, "sampling/importance_sampling_ratio/min": 5.7534191000740975e-05, "sampling/sampling_logp_difference/max": 3.844470739364624, "sampling/sampling_logp_difference/mean": 0.20396144688129425, "step": 893, "step_time": 4.331071542997961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8763228384777904, "epoch": 0.00894, "grad_norm": 0.07770726829767227, "kl": 0.8473782241344452, "learning_rate": 9.999659308701071e-06, "loss": 0.0601, "step": 894, "step_time": 2.011980029012193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.041943850461393595, "epoch": 0.00895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004892533179372549, "kl": 0.400910884141922, "learning_rate": 9.999658514098125e-06, "loss": 0.0016, "num_tokens": 8372703.0, "reward": 1.200115442276001, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.200115442276001, "rewards/rollout_reward_func/std": 0.14329874515533447, "sampling/importance_sampling_ratio/max": 0.9981308579444885, "sampling/importance_sampling_ratio/mean": 0.9949750900268555, "sampling/importance_sampling_ratio/min": 0.9910646080970764, "sampling/sampling_logp_difference/max": 0.004348892718553543, "sampling/sampling_logp_difference/mean": 0.001075733220204711, "step": 895, "step_time": 4.9794379269951605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0422617057338357, "epoch": 0.00896, "grad_norm": 0.0004962851526215672, "kl": 0.4008560851216316, "learning_rate": 9.999657718569665e-06, "loss": 0.0016, "step": 896, "step_time": 2.026648908002244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 156.53125, "completions/mean_terminated_length": 156.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.286016752012074, "epoch": 0.00897, "frac_reward_zero_std": 0.5, "grad_norm": 0.023998063057661057, "kl": 0.4960264004766941, "learning_rate": 9.99965692211569e-06, "loss": -0.0322, "num_tokens": 8390544.0, "reward": 0.6527355909347534, "reward_std": 0.051080700010061264, "rewards/rollout_reward_func/mean": 0.6527355909347534, "rewards/rollout_reward_func/std": 0.21591737866401672, "sampling/importance_sampling_ratio/max": 1.0124752521514893, "sampling/importance_sampling_ratio/mean": 0.9703414440155029, "sampling/importance_sampling_ratio/min": 0.0003277253999840468, "sampling/sampling_logp_difference/max": 2.312746047973633, "sampling/sampling_logp_difference/mean": 0.04756831005215645, "step": 897, "step_time": 4.271234523999738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2857234519906342, "epoch": 0.00898, "grad_norm": 0.027186548337340355, "kl": 0.49095984175801277, "learning_rate": 9.999656124736203e-06, "loss": -0.0321, "step": 898, "step_time": 2.4763161549853976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 119.90625, "completions/mean_terminated_length": 119.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23936975095421076, "epoch": 0.00899, "frac_reward_zero_std": 0.75, "grad_norm": 0.01853298582136631, "kl": 0.5551042035222054, "learning_rate": 9.9996553264312e-06, "loss": 0.0208, "num_tokens": 8406429.0, "reward": 0.700817346572876, "reward_std": 0.019717399030923843, "rewards/rollout_reward_func/mean": 0.700817346572876, "rewards/rollout_reward_func/std": 0.31275856494903564, "sampling/importance_sampling_ratio/max": 1.0053373575210571, "sampling/importance_sampling_ratio/mean": 0.9648586511611938, "sampling/importance_sampling_ratio/min": 0.003940679132938385, "sampling/sampling_logp_difference/max": 2.232926368713379, "sampling/sampling_logp_difference/mean": 0.03614422678947449, "step": 899, "step_time": 4.029320756999368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24460846791043878, "epoch": 0.009, "grad_norm": 0.019214635714888573, "kl": 0.5596550703048706, "learning_rate": 9.999654527200682e-06, "loss": 0.0208, "step": 900, "step_time": 2.4524478900129907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1923578483983874, "epoch": 0.00901, "frac_reward_zero_std": 0.25, "grad_norm": 0.5642706751823425, "kl": 0.9027378335595131, "learning_rate": 9.999653727044649e-06, "loss": -0.0107, "num_tokens": 8423285.0, "reward": 0.5809326767921448, "reward_std": 0.118705615401268, "rewards/rollout_reward_func/mean": 0.5809326767921448, "rewards/rollout_reward_func/std": 0.3761555254459381, "sampling/importance_sampling_ratio/max": 1.736363410949707, "sampling/importance_sampling_ratio/mean": 0.8792210817337036, "sampling/importance_sampling_ratio/min": 0.002253859071061015, "sampling/sampling_logp_difference/max": 2.653688430786133, "sampling/sampling_logp_difference/mean": 0.28142738342285156, "step": 901, "step_time": 3.894485700999212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 1.2352062072604895, "epoch": 0.00902, "grad_norm": 0.3468717038631439, "kl": 0.9915257915854454, "learning_rate": 9.999652925963103e-06, "loss": -0.0121, "step": 902, "step_time": 2.033521080011269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5247799367643893, "epoch": 0.00903, "frac_reward_zero_std": 0.5, "grad_norm": 0.007083533797413111, "kl": 0.5528696104884148, "learning_rate": 9.999652123956045e-06, "loss": -0.0653, "num_tokens": 8441807.0, "reward": 1.0938270092010498, "reward_std": 0.11732534319162369, "rewards/rollout_reward_func/mean": 1.0938270092010498, "rewards/rollout_reward_func/std": 0.2660749852657318, "sampling/importance_sampling_ratio/max": 1.005613088607788, "sampling/importance_sampling_ratio/mean": 0.9345001578330994, "sampling/importance_sampling_ratio/min": 0.00016680698900017887, "sampling/sampling_logp_difference/max": 2.4140400886535645, "sampling/sampling_logp_difference/mean": 0.09579509496688843, "step": 903, "step_time": 4.600073713983875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5225099371746182, "epoch": 0.00904, "grad_norm": 0.007249101065099239, "kl": 0.5561617277562618, "learning_rate": 9.99965132102347e-06, "loss": -0.0653, "step": 904, "step_time": 2.4809614669939037 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 224.90625, "completions/mean_terminated_length": 224.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.30819529108703136, "epoch": 0.00905, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591244459152222, "kl": 0.45575184002518654, "learning_rate": 9.999650517165385e-06, "loss": 0.0192, "num_tokens": 8461156.0, "reward": 0.8447500467300415, "reward_std": 0.03399552032351494, "rewards/rollout_reward_func/mean": 0.8447500467300415, "rewards/rollout_reward_func/std": 0.4055364727973938, "sampling/importance_sampling_ratio/max": 1.0057965517044067, "sampling/importance_sampling_ratio/mean": 0.9569652676582336, "sampling/importance_sampling_ratio/min": 0.0019479155307635665, "sampling/sampling_logp_difference/max": 2.0711612701416016, "sampling/sampling_logp_difference/mean": 0.03483663871884346, "step": 905, "step_time": 4.651569315006782 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013392857741564512, "entropy": 0.32063817931339145, "epoch": 0.00906, "grad_norm": 0.03964867815375328, "kl": 0.4731341749429703, "learning_rate": 9.999649712381786e-06, "loss": 0.0172, "step": 906, "step_time": 2.5332550250022905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14520208537578583, "epoch": 0.00907, "frac_reward_zero_std": 0.75, "grad_norm": 0.5528523325920105, "kl": 0.532177958637476, "learning_rate": 9.999648906672674e-06, "loss": -0.0032, "num_tokens": 8476816.0, "reward": 0.4778076708316803, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.4778076708316803, "rewards/rollout_reward_func/std": 0.1809646636247635, "sampling/importance_sampling_ratio/max": 1.0216063261032104, "sampling/importance_sampling_ratio/mean": 0.9803811311721802, "sampling/importance_sampling_ratio/min": 0.49241161346435547, "sampling/sampling_logp_difference/max": 0.5001367926597595, "sampling/sampling_logp_difference/mean": 0.009504670277237892, "step": 907, "step_time": 4.383624084992334 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.01875000074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02656250074505806, "entropy": 0.18894072528928518, "epoch": 0.00908, "grad_norm": 0.03940722346305847, "kl": 0.531526155769825, "learning_rate": 9.999648100038048e-06, "loss": -0.0056, "step": 908, "step_time": 2.0304449509931146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.31523323943838477, "epoch": 0.00909, "frac_reward_zero_std": 0.75, "grad_norm": 0.009302672930061817, "kl": 0.45876747369766235, "learning_rate": 9.999647292477912e-06, "loss": -0.0368, "num_tokens": 8493702.0, "reward": 0.9572741985321045, "reward_std": 0.03686430677771568, "rewards/rollout_reward_func/mean": 0.9572741985321045, "rewards/rollout_reward_func/std": 0.340093731880188, "sampling/importance_sampling_ratio/max": 1.0217912197113037, "sampling/importance_sampling_ratio/mean": 0.9714584350585938, "sampling/importance_sampling_ratio/min": 1.5756848408508828e-13, "sampling/sampling_logp_difference/max": 13.773690223693848, "sampling/sampling_logp_difference/mean": 0.14367206394672394, "step": 909, "step_time": 4.695506750009372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3149532782845199, "epoch": 0.0091, "grad_norm": 0.00831273291260004, "kl": 0.45898884907364845, "learning_rate": 9.999646483992262e-06, "loss": -0.0368, "step": 910, "step_time": 2.591156904003583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5465993597172201, "epoch": 0.00911, "frac_reward_zero_std": 0.75, "grad_norm": 0.013013801537454128, "kl": 0.4090523310005665, "learning_rate": 9.9996456745811e-06, "loss": -0.0152, "num_tokens": 8511356.0, "reward": 0.8330288529396057, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.8330288529396057, "rewards/rollout_reward_func/std": 0.3451460301876068, "sampling/importance_sampling_ratio/max": 1.002204179763794, "sampling/importance_sampling_ratio/mean": 0.9348136782646179, "sampling/importance_sampling_ratio/min": 2.5212579221540816e-17, "sampling/sampling_logp_difference/max": 2.706658363342285, "sampling/sampling_logp_difference/mean": 0.2096332162618637, "step": 911, "step_time": 4.388454000996717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5538133052177727, "epoch": 0.00912, "grad_norm": 0.015110256150364876, "kl": 0.4187667667865753, "learning_rate": 9.999644864244428e-06, "loss": -0.0151, "step": 912, "step_time": 2.5446087999953306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 94.3125, "completions/mean_terminated_length": 94.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.53884765971452, "epoch": 0.00913, "frac_reward_zero_std": 0.5, "grad_norm": 0.08727841079235077, "kl": 0.649520069360733, "learning_rate": 9.999644052982243e-06, "loss": -0.0471, "num_tokens": 8526502.0, "reward": 0.7550961375236511, "reward_std": 0.04215443879365921, "rewards/rollout_reward_func/mean": 0.7550961375236511, "rewards/rollout_reward_func/std": 0.28708815574645996, "sampling/importance_sampling_ratio/max": 1.0507829189300537, "sampling/importance_sampling_ratio/mean": 0.9370533227920532, "sampling/importance_sampling_ratio/min": 2.8415858555064233e-09, "sampling/sampling_logp_difference/max": 14.821351051330566, "sampling/sampling_logp_difference/mean": 0.24647153913974762, "step": 913, "step_time": 4.497278955976071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5379471918568015, "epoch": 0.00914, "grad_norm": 0.07584920525550842, "kl": 0.6474310457706451, "learning_rate": 9.999643240794546e-06, "loss": -0.0472, "step": 914, "step_time": 2.0415825140080415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 101.15625, "completions/mean_terminated_length": 101.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4010646012611687, "epoch": 0.00915, "frac_reward_zero_std": 0.75, "grad_norm": 0.017386458814144135, "kl": 0.6340970620512962, "learning_rate": 9.999642427681338e-06, "loss": -0.0178, "num_tokens": 8541955.0, "reward": 0.7570673227310181, "reward_std": 0.022437043488025665, "rewards/rollout_reward_func/mean": 0.7570673227310181, "rewards/rollout_reward_func/std": 0.20299509167671204, "sampling/importance_sampling_ratio/max": 1.0200377702713013, "sampling/importance_sampling_ratio/mean": 0.9654022455215454, "sampling/importance_sampling_ratio/min": 4.860515900872997e-07, "sampling/sampling_logp_difference/max": 2.448190212249756, "sampling/sampling_logp_difference/mean": 0.08727794885635376, "step": 915, "step_time": 4.836870815997827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40123469941318035, "epoch": 0.00916, "grad_norm": 0.01620044931769371, "kl": 0.6361470744013786, "learning_rate": 9.99964161364262e-06, "loss": -0.0178, "step": 916, "step_time": 2.03301196800021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06143828760832548, "epoch": 0.00917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005778616177849472, "kl": 0.4320349022746086, "learning_rate": 9.999640798678389e-06, "loss": 0.0012, "num_tokens": 8558755.0, "reward": 0.7334614992141724, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7334614992141724, "rewards/rollout_reward_func/std": 0.1904403418302536, "sampling/importance_sampling_ratio/max": 0.9994169473648071, "sampling/importance_sampling_ratio/mean": 0.9955606460571289, "sampling/importance_sampling_ratio/min": 0.9913148283958435, "sampling/sampling_logp_difference/max": 0.005383724346756935, "sampling/sampling_logp_difference/mean": 0.0012821187265217304, "step": 917, "step_time": 4.283548290986801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061219817493110895, "epoch": 0.00918, "grad_norm": 0.000574064499232918, "kl": 0.4320778511464596, "learning_rate": 9.999639982788647e-06, "loss": 0.0012, "step": 918, "step_time": 2.5010306470139767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16974514536559582, "epoch": 0.00919, "frac_reward_zero_std": 0.75, "grad_norm": 1.0903725624084473, "kl": 0.5179873071610928, "learning_rate": 9.999639165973397e-06, "loss": 0.0121, "num_tokens": 8575003.0, "reward": 1.023432731628418, "reward_std": 0.0388161838054657, "rewards/rollout_reward_func/mean": 1.023432731628418, "rewards/rollout_reward_func/std": 0.2637842893600464, "sampling/importance_sampling_ratio/max": 1.1000337600708008, "sampling/importance_sampling_ratio/mean": 0.9653797149658203, "sampling/importance_sampling_ratio/min": 0.5378011465072632, "sampling/sampling_logp_difference/max": 0.4260527491569519, "sampling/sampling_logp_difference/mean": 0.011562798172235489, "step": 919, "step_time": 4.302770248999877 }, { "clip_ratio/high_max": 0.06562500074505806, "clip_ratio/high_mean": 0.03281250037252903, "clip_ratio/low_mean": 0.06562500074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09843750111758709, "entropy": 0.23021178413182497, "epoch": 0.0092, "grad_norm": 0.02986862137913704, "kl": 0.49491893500089645, "learning_rate": 9.999638348232636e-06, "loss": 0.0065, "step": 920, "step_time": 2.023096089011233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2936600623652339, "epoch": 0.00921, "frac_reward_zero_std": 0.75, "grad_norm": 0.006465977057814598, "kl": 0.4772749952971935, "learning_rate": 9.999637529566364e-06, "loss": -0.027, "num_tokens": 8592323.0, "reward": 0.7087981104850769, "reward_std": 0.008838829584419727, "rewards/rollout_reward_func/mean": 0.7087981104850769, "rewards/rollout_reward_func/std": 0.28263676166534424, "sampling/importance_sampling_ratio/max": 1.0132802724838257, "sampling/importance_sampling_ratio/mean": 0.9640580415725708, "sampling/importance_sampling_ratio/min": 0.001224601175636053, "sampling/sampling_logp_difference/max": 1.745326280593872, "sampling/sampling_logp_difference/mean": 0.042294614017009735, "step": 921, "step_time": 5.136056361996452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3033571974374354, "epoch": 0.00922, "grad_norm": 0.006540271919220686, "kl": 0.4772118330001831, "learning_rate": 9.999636709974583e-06, "loss": -0.027, "step": 922, "step_time": 2.030444192008872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4716881616041064, "epoch": 0.00923, "frac_reward_zero_std": 0.75, "grad_norm": 0.03801235556602478, "kl": 0.7089874148368835, "learning_rate": 9.999635889457293e-06, "loss": 0.0244, "num_tokens": 8609899.0, "reward": 0.8806442022323608, "reward_std": 0.01375033799558878, "rewards/rollout_reward_func/mean": 0.8806442022323608, "rewards/rollout_reward_func/std": 0.36745157837867737, "sampling/importance_sampling_ratio/max": 1.0151945352554321, "sampling/importance_sampling_ratio/mean": 0.9347931742668152, "sampling/importance_sampling_ratio/min": 1.7050711903721094e-05, "sampling/sampling_logp_difference/max": 2.6338658332824707, "sampling/sampling_logp_difference/mean": 0.09236881881952286, "step": 923, "step_time": 5.24433604600199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4658217541873455, "epoch": 0.00924, "grad_norm": 0.03880477324128151, "kl": 0.7144867815077305, "learning_rate": 9.999635068014492e-06, "loss": 0.0244, "step": 924, "step_time": 2.0376850749962614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.45721957506611943, "epoch": 0.00925, "frac_reward_zero_std": 0.5, "grad_norm": 0.024804115295410156, "kl": 0.571630384773016, "learning_rate": 9.999634245646181e-06, "loss": -0.037, "num_tokens": 8625911.0, "reward": 0.7466826438903809, "reward_std": 0.015909895300865173, "rewards/rollout_reward_func/mean": 0.7466826438903809, "rewards/rollout_reward_func/std": 0.23698383569717407, "sampling/importance_sampling_ratio/max": 1.038185715675354, "sampling/importance_sampling_ratio/mean": 0.9428719282150269, "sampling/importance_sampling_ratio/min": 0.01086376141756773, "sampling/sampling_logp_difference/max": 1.9840201139450073, "sampling/sampling_logp_difference/mean": 0.06890197098255157, "step": 925, "step_time": 4.034022810010356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45334840565919876, "epoch": 0.00926, "grad_norm": 0.0245908685028553, "kl": 0.570985272526741, "learning_rate": 9.999633422352361e-06, "loss": -0.037, "step": 926, "step_time": 2.002731243002927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 121.40625, "completions/mean_terminated_length": 121.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35321936942636967, "epoch": 0.00927, "frac_reward_zero_std": 0.75, "grad_norm": 0.009145388379693031, "kl": 0.5369541421532631, "learning_rate": 9.999632598133035e-06, "loss": -0.0177, "num_tokens": 8641868.0, "reward": 0.8253365159034729, "reward_std": 0.019717400893568993, "rewards/rollout_reward_func/mean": 0.8253365159034729, "rewards/rollout_reward_func/std": 0.23036523163318634, "sampling/importance_sampling_ratio/max": 1.021061897277832, "sampling/importance_sampling_ratio/mean": 0.9664728045463562, "sampling/importance_sampling_ratio/min": 0.0003630596911534667, "sampling/sampling_logp_difference/max": 2.0586133003234863, "sampling/sampling_logp_difference/mean": 0.06239338964223862, "step": 927, "step_time": 4.506131822003226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35385179426521063, "epoch": 0.00928, "grad_norm": 0.00919543020427227, "kl": 0.5383722260594368, "learning_rate": 9.999631772988198e-06, "loss": -0.0177, "step": 928, "step_time": 2.020626593999623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 148.9375, "completions/mean_terminated_length": 148.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4625723725184798, "epoch": 0.00929, "frac_reward_zero_std": 0.75, "grad_norm": 0.022460276260972023, "kl": 0.5895033329725266, "learning_rate": 9.999630946917853e-06, "loss": -0.0272, "num_tokens": 8658570.0, "reward": 0.5868750214576721, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.5868750214576721, "rewards/rollout_reward_func/std": 0.19870930910110474, "sampling/importance_sampling_ratio/max": 1.0172125101089478, "sampling/importance_sampling_ratio/mean": 0.9643235206604004, "sampling/importance_sampling_ratio/min": 9.009679395773074e-15, "sampling/sampling_logp_difference/max": 3.7905633449554443, "sampling/sampling_logp_difference/mean": 0.20428280532360077, "step": 929, "step_time": 5.037839311000425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4637847635895014, "epoch": 0.0093, "grad_norm": 0.022682011127471924, "kl": 0.5892852284014225, "learning_rate": 9.999630119922e-06, "loss": -0.0272, "step": 930, "step_time": 2.061341035005171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 107.21875, "completions/mean_terminated_length": 107.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0094041116535664, "epoch": 0.00931, "frac_reward_zero_std": 0.25, "grad_norm": 0.040501918643713, "kl": 0.7732903137803078, "learning_rate": 9.99962929200064e-06, "loss": -0.0084, "num_tokens": 8673857.0, "reward": 0.6521298289299011, "reward_std": 0.12079286575317383, "rewards/rollout_reward_func/mean": 0.6521298289299011, "rewards/rollout_reward_func/std": 0.19046847522258759, "sampling/importance_sampling_ratio/max": 1.0203745365142822, "sampling/importance_sampling_ratio/mean": 0.9059237837791443, "sampling/importance_sampling_ratio/min": 3.915094177580526e-19, "sampling/sampling_logp_difference/max": 4.070131301879883, "sampling/sampling_logp_difference/mean": 0.4329611659049988, "step": 931, "step_time": 4.3154665019974345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.0063246618956327, "epoch": 0.00932, "grad_norm": 0.03445185720920563, "kl": 0.7644564360380173, "learning_rate": 9.99962846315377e-06, "loss": -0.0085, "step": 932, "step_time": 2.545796407983289 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.22746851155534387, "epoch": 0.00933, "frac_reward_zero_std": 0.75, "grad_norm": 0.36379408836364746, "kl": 0.5149252116680145, "learning_rate": 9.999627633381394e-06, "loss": 0.0078, "num_tokens": 8690451.0, "reward": 0.8128076791763306, "reward_std": 0.0391034372150898, "rewards/rollout_reward_func/mean": 0.8128076791763306, "rewards/rollout_reward_func/std": 0.47767534852027893, "sampling/importance_sampling_ratio/max": 1.039060115814209, "sampling/importance_sampling_ratio/mean": 0.9537520408630371, "sampling/importance_sampling_ratio/min": 0.03284076228737831, "sampling/sampling_logp_difference/max": 1.395117998123169, "sampling/sampling_logp_difference/mean": 0.025484522804617882, "step": 933, "step_time": 4.583057833988278 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 0.22190040908753872, "epoch": 0.00934, "grad_norm": 0.35345685482025146, "kl": 0.5086344368755817, "learning_rate": 9.99962680268351e-06, "loss": 0.0067, "step": 934, "step_time": 2.5452888600266306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1780003528110683, "epoch": 0.00935, "frac_reward_zero_std": 0.75, "grad_norm": 0.3418287932872772, "kl": 0.43364062160253525, "learning_rate": 9.99962597106012e-06, "loss": -0.0308, "num_tokens": 8707063.0, "reward": 0.6934422850608826, "reward_std": 0.06218257546424866, "rewards/rollout_reward_func/mean": 0.6934422850608826, "rewards/rollout_reward_func/std": 0.2922743558883667, "sampling/importance_sampling_ratio/max": 1.2044739723205566, "sampling/importance_sampling_ratio/mean": 0.9709324836730957, "sampling/importance_sampling_ratio/min": 0.09644786268472672, "sampling/sampling_logp_difference/max": 1.524747371673584, "sampling/sampling_logp_difference/mean": 0.028325911611318588, "step": 935, "step_time": 4.3048073679965455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20809256378561258, "epoch": 0.00936, "grad_norm": 0.32406380772590637, "kl": 0.43803178519010544, "learning_rate": 9.99962513851122e-06, "loss": -0.0315, "step": 936, "step_time": 2.0634566549997544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 142.03125, "completions/mean_terminated_length": 142.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24681207351386547, "epoch": 0.00937, "frac_reward_zero_std": 0.75, "grad_norm": 0.015033337287604809, "kl": 0.5428211279213428, "learning_rate": 9.999624305036816e-06, "loss": 0.0301, "num_tokens": 8723736.0, "reward": 0.6000480651855469, "reward_std": 0.015365973114967346, "rewards/rollout_reward_func/mean": 0.6000480651855469, "rewards/rollout_reward_func/std": 0.19451671838760376, "sampling/importance_sampling_ratio/max": 1.0064749717712402, "sampling/importance_sampling_ratio/mean": 0.9683672189712524, "sampling/importance_sampling_ratio/min": 0.011795803904533386, "sampling/sampling_logp_difference/max": 1.3082090616226196, "sampling/sampling_logp_difference/mean": 0.0295045655220747, "step": 937, "step_time": 4.372222393008997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24605108285322785, "epoch": 0.00938, "grad_norm": 0.01472191046923399, "kl": 0.5429388284683228, "learning_rate": 9.999623470636904e-06, "loss": 0.0301, "step": 938, "step_time": 2.4961015810113167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 96.15625, "completions/mean_terminated_length": 96.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40099481493234634, "epoch": 0.00939, "frac_reward_zero_std": 0.75, "grad_norm": 0.007506849244236946, "kl": 0.6869763508439064, "learning_rate": 9.999622635311485e-06, "loss": -0.0272, "num_tokens": 8738261.0, "reward": 0.6828845739364624, "reward_std": 0.028828196227550507, "rewards/rollout_reward_func/mean": 0.6828845739364624, "rewards/rollout_reward_func/std": 0.14886412024497986, "sampling/importance_sampling_ratio/max": 1.0018086433410645, "sampling/importance_sampling_ratio/mean": 0.9639021754264832, "sampling/importance_sampling_ratio/min": 2.760757524811197e-07, "sampling/sampling_logp_difference/max": 2.4511823654174805, "sampling/sampling_logp_difference/mean": 0.11308936774730682, "step": 939, "step_time": 4.084924990005675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.400576950982213, "epoch": 0.0094, "grad_norm": 0.007806516718119383, "kl": 0.6855113655328751, "learning_rate": 9.999621799060561e-06, "loss": -0.0272, "step": 940, "step_time": 2.553245324990712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 164.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3126332755200565, "epoch": 0.00941, "frac_reward_zero_std": 0.5, "grad_norm": 0.07545462995767593, "kl": 0.5895236618816853, "learning_rate": 9.999620961884131e-06, "loss": 0.0068, "num_tokens": 8755142.0, "reward": 0.48625481128692627, "reward_std": 0.025795795023441315, "rewards/rollout_reward_func/mean": 0.48625481128692627, "rewards/rollout_reward_func/std": 0.17034544050693512, "sampling/importance_sampling_ratio/max": 1.0240110158920288, "sampling/importance_sampling_ratio/mean": 0.9408932328224182, "sampling/importance_sampling_ratio/min": 0.000659898912999779, "sampling/sampling_logp_difference/max": 2.0613861083984375, "sampling/sampling_logp_difference/mean": 0.04378748685121536, "step": 941, "step_time": 4.636963242999627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31326996674761176, "epoch": 0.00942, "grad_norm": 0.06692709028720856, "kl": 0.584707710891962, "learning_rate": 9.999620123782196e-06, "loss": 0.0067, "step": 942, "step_time": 2.061875945008069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 101.84375, "completions/mean_terminated_length": 101.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5745566431432962, "epoch": 0.00943, "frac_reward_zero_std": 0.5, "grad_norm": 0.7376399636268616, "kl": 0.6232123784720898, "learning_rate": 9.999619284754754e-06, "loss": -0.0347, "num_tokens": 8770121.0, "reward": 0.6154327392578125, "reward_std": 0.03729705512523651, "rewards/rollout_reward_func/mean": 0.6154327392578125, "rewards/rollout_reward_func/std": 0.22691687941551208, "sampling/importance_sampling_ratio/max": 1.0001591444015503, "sampling/importance_sampling_ratio/mean": 0.9156111478805542, "sampling/importance_sampling_ratio/min": 0.005485340487211943, "sampling/sampling_logp_difference/max": 1.6994085311889648, "sampling/sampling_logp_difference/mean": 0.08404354006052017, "step": 943, "step_time": 4.514536382986989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.582419516518712, "epoch": 0.00944, "grad_norm": 0.01578633114695549, "kl": 0.654615007340908, "learning_rate": 9.999618444801806e-06, "loss": -0.0358, "step": 944, "step_time": 2.0376325880206423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.38457788713276386, "epoch": 0.00945, "frac_reward_zero_std": 0.75, "grad_norm": 0.0055159227922558784, "kl": 0.4716047942638397, "learning_rate": 9.999617603923354e-06, "loss": -0.0252, "num_tokens": 8788201.0, "reward": 0.5565577149391174, "reward_std": 0.029267434030771255, "rewards/rollout_reward_func/mean": 0.5565577149391174, "rewards/rollout_reward_func/std": 0.18192347884178162, "sampling/importance_sampling_ratio/max": 1.0032739639282227, "sampling/importance_sampling_ratio/mean": 0.9358992576599121, "sampling/importance_sampling_ratio/min": 0.003787416033446789, "sampling/sampling_logp_difference/max": 1.712430715560913, "sampling/sampling_logp_difference/mean": 0.04387946426868439, "step": 945, "step_time": 4.483273929996358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38597864331677556, "epoch": 0.00946, "grad_norm": 0.005415483843535185, "kl": 0.4711407460272312, "learning_rate": 9.999616762119397e-06, "loss": -0.0252, "step": 946, "step_time": 2.5662892479958828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6441351021640003, "epoch": 0.00947, "frac_reward_zero_std": 0.25, "grad_norm": 0.016167184337973595, "kl": 0.44856273755431175, "learning_rate": 9.999615919389935e-06, "loss": -0.018, "num_tokens": 8805731.0, "reward": 0.6462019681930542, "reward_std": 0.05806434527039528, "rewards/rollout_reward_func/mean": 0.6462019681930542, "rewards/rollout_reward_func/std": 0.28587237000465393, "sampling/importance_sampling_ratio/max": 1.0029128789901733, "sampling/importance_sampling_ratio/mean": 0.9034227132797241, "sampling/importance_sampling_ratio/min": 0.0022686298470944166, "sampling/sampling_logp_difference/max": 1.7038291692733765, "sampling/sampling_logp_difference/mean": 0.08351771533489227, "step": 947, "step_time": 4.33998571200209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6392534859478474, "epoch": 0.00948, "grad_norm": 0.015871740877628326, "kl": 0.44932033494114876, "learning_rate": 9.999615075734968e-06, "loss": -0.018, "step": 948, "step_time": 2.0483064300133265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.26255984883755445, "epoch": 0.00949, "frac_reward_zero_std": 0.75, "grad_norm": 0.009205175563693047, "kl": 0.5526152476668358, "learning_rate": 9.999614231154497e-06, "loss": -0.0171, "num_tokens": 8822791.0, "reward": 0.7664422988891602, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.7664422988891602, "rewards/rollout_reward_func/std": 0.2620137333869934, "sampling/importance_sampling_ratio/max": 1.0047855377197266, "sampling/importance_sampling_ratio/mean": 0.9649592638015747, "sampling/importance_sampling_ratio/min": 0.025378018617630005, "sampling/sampling_logp_difference/max": 1.6966710090637207, "sampling/sampling_logp_difference/mean": 0.030143529176712036, "step": 949, "step_time": 4.768472173986083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26176527235656977, "epoch": 0.0095, "grad_norm": 0.008809803985059261, "kl": 0.5557994917035103, "learning_rate": 9.999613385648523e-06, "loss": -0.0171, "step": 950, "step_time": 2.0338462560030166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.46533093880862, "epoch": 0.00951, "frac_reward_zero_std": 0.5, "grad_norm": 0.013620361685752869, "kl": 0.5025186687707901, "learning_rate": 9.999612539217044e-06, "loss": -0.0454, "num_tokens": 8839557.0, "reward": 0.6024038791656494, "reward_std": 0.02474873699247837, "rewards/rollout_reward_func/mean": 0.6024038791656494, "rewards/rollout_reward_func/std": 0.20940326154232025, "sampling/importance_sampling_ratio/max": 1.0007514953613281, "sampling/importance_sampling_ratio/mean": 0.9337089657783508, "sampling/importance_sampling_ratio/min": 0.0004364447668194771, "sampling/sampling_logp_difference/max": 1.8036017417907715, "sampling/sampling_logp_difference/mean": 0.06816121935844421, "step": 951, "step_time": 4.74582918699889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4667069613933563, "epoch": 0.00952, "grad_norm": 0.012831744737923145, "kl": 0.507246557623148, "learning_rate": 9.999611691860062e-06, "loss": -0.0454, "step": 952, "step_time": 2.04910180799925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 138.84375, "completions/mean_terminated_length": 138.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.32595023047178984, "epoch": 0.00953, "frac_reward_zero_std": 0.5, "grad_norm": 0.15837961435317993, "kl": 0.5909777916967869, "learning_rate": 9.999610843577577e-06, "loss": 0.0117, "num_tokens": 8855944.0, "reward": 0.5705816745758057, "reward_std": 0.05151554197072983, "rewards/rollout_reward_func/mean": 0.5705816745758057, "rewards/rollout_reward_func/std": 0.17972137033939362, "sampling/importance_sampling_ratio/max": 1.3626947402954102, "sampling/importance_sampling_ratio/mean": 0.934485912322998, "sampling/importance_sampling_ratio/min": 0.016959192231297493, "sampling/sampling_logp_difference/max": 2.0634078979492188, "sampling/sampling_logp_difference/mean": 0.07079710066318512, "step": 953, "step_time": 4.183094196014281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.32524970453232527, "epoch": 0.00954, "grad_norm": 0.07099618762731552, "kl": 0.5939444601535797, "learning_rate": 9.999609994369586e-06, "loss": 0.0116, "step": 954, "step_time": 2.064506208007515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4551972895860672, "epoch": 0.00955, "frac_reward_zero_std": 0.5, "grad_norm": 0.01008078083395958, "kl": 0.5867600217461586, "learning_rate": 9.999609144236094e-06, "loss": -0.0457, "num_tokens": 8873442.0, "reward": 0.6407884955406189, "reward_std": 0.04487408325076103, "rewards/rollout_reward_func/mean": 0.6407884955406189, "rewards/rollout_reward_func/std": 0.15833479166030884, "sampling/importance_sampling_ratio/max": 1.0008364915847778, "sampling/importance_sampling_ratio/mean": 0.9325751662254333, "sampling/importance_sampling_ratio/min": 0.0046594636514782906, "sampling/sampling_logp_difference/max": 1.9366800785064697, "sampling/sampling_logp_difference/mean": 0.06138545274734497, "step": 955, "step_time": 4.881436754010792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45004255743697286, "epoch": 0.00956, "grad_norm": 0.009874818846583366, "kl": 0.5925649404525757, "learning_rate": 9.999608293177099e-06, "loss": -0.0457, "step": 956, "step_time": 2.045357106006122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 149.84375, "completions/mean_terminated_length": 149.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.37578807678073645, "epoch": 0.00957, "frac_reward_zero_std": 0.5, "grad_norm": 0.05259598791599274, "kl": 0.5582252405583858, "learning_rate": 9.9996074411926e-06, "loss": -0.0416, "num_tokens": 8890197.0, "reward": 0.8966346383094788, "reward_std": 0.039434801787137985, "rewards/rollout_reward_func/mean": 0.8966346383094788, "rewards/rollout_reward_func/std": 0.2848856449127197, "sampling/importance_sampling_ratio/max": 0.9992815852165222, "sampling/importance_sampling_ratio/mean": 0.9394497871398926, "sampling/importance_sampling_ratio/min": 0.00030010484624654055, "sampling/sampling_logp_difference/max": 2.407705068588257, "sampling/sampling_logp_difference/mean": 0.062425293028354645, "step": 957, "step_time": 4.779443041006743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3742365576326847, "epoch": 0.00958, "grad_norm": 0.049425266683101654, "kl": 0.5694433823227882, "learning_rate": 9.9996065882826e-06, "loss": -0.0417, "step": 958, "step_time": 2.031734944001073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 188.5625, "completions/mean_terminated_length": 188.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.562746929936111, "epoch": 0.00959, "frac_reward_zero_std": 0.5, "grad_norm": 0.017071984708309174, "kl": 0.43355710431933403, "learning_rate": 9.999605734447097e-06, "loss": -0.0555, "num_tokens": 8908599.0, "reward": 0.6231731176376343, "reward_std": 0.02474873885512352, "rewards/rollout_reward_func/mean": 0.6231731176376343, "rewards/rollout_reward_func/std": 0.1602284014225006, "sampling/importance_sampling_ratio/max": 0.9984726905822754, "sampling/importance_sampling_ratio/mean": 0.9301775097846985, "sampling/importance_sampling_ratio/min": 7.510018162903975e-10, "sampling/sampling_logp_difference/max": 2.6087255477905273, "sampling/sampling_logp_difference/mean": 0.12929227948188782, "step": 959, "step_time": 4.411394973001734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5658291140571237, "epoch": 0.0096, "grad_norm": 0.015404959209263325, "kl": 0.43472929671406746, "learning_rate": 9.999604879686092e-06, "loss": -0.0555, "step": 960, "step_time": 2.474182573998405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 147.28125, "completions/mean_terminated_length": 147.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.26929251896217465, "epoch": 0.00961, "frac_reward_zero_std": 0.75, "grad_norm": 0.03171762824058533, "kl": 0.5320725701749325, "learning_rate": 9.999604023999587e-06, "loss": -0.0261, "num_tokens": 8925304.0, "reward": 0.5780961513519287, "reward_std": 0.0067991032265126705, "rewards/rollout_reward_func/mean": 0.5780961513519287, "rewards/rollout_reward_func/std": 0.24200783669948578, "sampling/importance_sampling_ratio/max": 1.001604676246643, "sampling/importance_sampling_ratio/mean": 0.9639323949813843, "sampling/importance_sampling_ratio/min": 0.031118901446461678, "sampling/sampling_logp_difference/max": 1.2606310844421387, "sampling/sampling_logp_difference/mean": 0.029596691951155663, "step": 961, "step_time": 4.4136734289932065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2707983301952481, "epoch": 0.00962, "grad_norm": 0.03121871128678322, "kl": 0.5361044369637966, "learning_rate": 9.999603167387578e-06, "loss": -0.0261, "step": 962, "step_time": 2.0437087419995805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 81.0625, "completions/mean_terminated_length": 81.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6167814796790481, "epoch": 0.00963, "frac_reward_zero_std": 0.5, "grad_norm": 0.0519162118434906, "kl": 1.0436095148324966, "learning_rate": 9.999602309850068e-06, "loss": -0.0363, "num_tokens": 8940610.0, "reward": 0.8288461565971375, "reward_std": 0.06962281465530396, "rewards/rollout_reward_func/mean": 0.8288461565971375, "rewards/rollout_reward_func/std": 0.28128761053085327, "sampling/importance_sampling_ratio/max": 1.0018583536148071, "sampling/importance_sampling_ratio/mean": 0.9330210089683533, "sampling/importance_sampling_ratio/min": 0.00044000521302223206, "sampling/sampling_logp_difference/max": 3.0975818634033203, "sampling/sampling_logp_difference/mean": 0.1260102540254593, "step": 963, "step_time": 4.61508810500527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6202558181248605, "epoch": 0.00964, "grad_norm": 0.04841833934187889, "kl": 1.0154372043907642, "learning_rate": 9.999601451387057e-06, "loss": -0.0364, "step": 964, "step_time": 2.0520211810071487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.053636785596609116, "epoch": 0.00965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006777324597351253, "kl": 0.41925498098134995, "learning_rate": 9.999600591998547e-06, "loss": 0.0016, "num_tokens": 8958922.0, "reward": 0.7920385003089905, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7920385003089905, "rewards/rollout_reward_func/std": 0.28448688983917236, "sampling/importance_sampling_ratio/max": 1.0008624792099, "sampling/importance_sampling_ratio/mean": 0.9939321279525757, "sampling/importance_sampling_ratio/min": 0.9873812794685364, "sampling/sampling_logp_difference/max": 0.009202826768159866, "sampling/sampling_logp_difference/mean": 0.0012189249973744154, "step": 965, "step_time": 4.3929207360124565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05338747473433614, "epoch": 0.00966, "grad_norm": 0.0006783998105674982, "kl": 0.4192969724535942, "learning_rate": 9.999599731684533e-06, "loss": 0.0016, "step": 966, "step_time": 2.4816286100030993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05431925877928734, "epoch": 0.00967, "frac_reward_zero_std": 1.0, "grad_norm": 0.000614497228525579, "kl": 0.5624131672084332, "learning_rate": 9.99959887044502e-06, "loss": 0.0019, "num_tokens": 8975978.0, "reward": 0.5938461422920227, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5938461422920227, "rewards/rollout_reward_func/std": 0.07411862164735794, "sampling/importance_sampling_ratio/max": 1.001338005065918, "sampling/importance_sampling_ratio/mean": 0.9960700273513794, "sampling/importance_sampling_ratio/min": 0.9883155822753906, "sampling/sampling_logp_difference/max": 0.00789177231490612, "sampling/sampling_logp_difference/mean": 0.0011898870579898357, "step": 967, "step_time": 4.249203814986686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0544764157384634, "epoch": 0.00968, "grad_norm": 0.0006266175769269466, "kl": 0.562413427978754, "learning_rate": 9.999598008280007e-06, "loss": 0.0019, "step": 968, "step_time": 2.48897441799636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.04721672786399722, "epoch": 0.00969, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005051371990703046, "kl": 0.42541542649269104, "learning_rate": 9.999597145189494e-06, "loss": 0.0014, "num_tokens": 8993410.0, "reward": 0.8465385437011719, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8465385437011719, "rewards/rollout_reward_func/std": 0.29317280650138855, "sampling/importance_sampling_ratio/max": 1.000364899635315, "sampling/importance_sampling_ratio/mean": 0.9962632060050964, "sampling/importance_sampling_ratio/min": 0.9928689002990723, "sampling/sampling_logp_difference/max": 0.004681963473558426, "sampling/sampling_logp_difference/mean": 0.0008442570688202977, "step": 969, "step_time": 4.24651150599675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.046749583911150694, "epoch": 0.0097, "grad_norm": 0.0005001022946089506, "kl": 0.4255037195980549, "learning_rate": 9.999596281173482e-06, "loss": 0.0014, "step": 970, "step_time": 2.0203971099908813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34661933220922947, "epoch": 0.00971, "frac_reward_zero_std": 0.75, "grad_norm": 0.019415155053138733, "kl": 0.6983382850885391, "learning_rate": 9.999595416231968e-06, "loss": 0.0302, "num_tokens": 9009940.0, "reward": 0.8073749542236328, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 0.8073749542236328, "rewards/rollout_reward_func/std": 0.33235087990760803, "sampling/importance_sampling_ratio/max": 0.9997280836105347, "sampling/importance_sampling_ratio/mean": 0.9633699655532837, "sampling/importance_sampling_ratio/min": 0.01098591461777687, "sampling/sampling_logp_difference/max": 2.0387613773345947, "sampling/sampling_logp_difference/mean": 0.03978091850876808, "step": 971, "step_time": 4.377652474991919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34765231888741255, "epoch": 0.00972, "grad_norm": 0.019440969452261925, "kl": 0.699800156056881, "learning_rate": 9.999594550364955e-06, "loss": 0.0302, "step": 972, "step_time": 2.458800134001649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3233045656234026, "epoch": 0.00973, "frac_reward_zero_std": 0.75, "grad_norm": 0.007950665429234505, "kl": 0.4437793716788292, "learning_rate": 9.999593683572444e-06, "loss": -0.0366, "num_tokens": 9028548.0, "reward": 0.765572190284729, "reward_std": 0.01774565875530243, "rewards/rollout_reward_func/mean": 0.765572190284729, "rewards/rollout_reward_func/std": 0.3860376179218292, "sampling/importance_sampling_ratio/max": 0.9987638592720032, "sampling/importance_sampling_ratio/mean": 0.9643006324768066, "sampling/importance_sampling_ratio/min": 2.8206266506458633e-05, "sampling/sampling_logp_difference/max": 2.187386989593506, "sampling/sampling_logp_difference/mean": 0.04882294684648514, "step": 973, "step_time": 4.627723124001932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32537210639566183, "epoch": 0.00974, "grad_norm": 0.008085389621555805, "kl": 0.44753268361091614, "learning_rate": 9.999592815854433e-06, "loss": -0.0366, "step": 974, "step_time": 2.5074473329877947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3540368340909481, "epoch": 0.00975, "frac_reward_zero_std": 0.75, "grad_norm": 0.004439641255885363, "kl": 0.44484758004546165, "learning_rate": 9.999591947210923e-06, "loss": -0.0355, "num_tokens": 9046743.0, "reward": 0.6994760036468506, "reward_std": 0.0007207065937109292, "rewards/rollout_reward_func/mean": 0.6994760036468506, "rewards/rollout_reward_func/std": 0.21354001760482788, "sampling/importance_sampling_ratio/max": 1.0012999773025513, "sampling/importance_sampling_ratio/mean": 0.9654161334037781, "sampling/importance_sampling_ratio/min": 3.3484792716365064e-10, "sampling/sampling_logp_difference/max": 3.9433844089508057, "sampling/sampling_logp_difference/mean": 0.0903645008802414, "step": 975, "step_time": 4.673245305020828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35428759502246976, "epoch": 0.00976, "grad_norm": 0.0042750840075314045, "kl": 0.4447338320314884, "learning_rate": 9.999591077641915e-06, "loss": -0.0355, "step": 976, "step_time": 2.032708440994611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05036589223891497, "epoch": 0.00977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005614492692984641, "kl": 0.43740953132510185, "learning_rate": 9.999590207147407e-06, "loss": 0.0015, "num_tokens": 9064111.0, "reward": 0.7888461351394653, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7888461351394653, "rewards/rollout_reward_func/std": 0.31939697265625, "sampling/importance_sampling_ratio/max": 0.9997549653053284, "sampling/importance_sampling_ratio/mean": 0.9961063861846924, "sampling/importance_sampling_ratio/min": 0.9908985495567322, "sampling/sampling_logp_difference/max": 0.006301086395978928, "sampling/sampling_logp_difference/mean": 0.0009472398087382317, "step": 977, "step_time": 4.725345881997782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05011538555845618, "epoch": 0.00978, "grad_norm": 0.0005566739127971232, "kl": 0.4374694675207138, "learning_rate": 9.999589335727404e-06, "loss": 0.0015, "step": 978, "step_time": 2.017684354002995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5508100492879748, "epoch": 0.00979, "frac_reward_zero_std": 0.5, "grad_norm": 0.009249876253306866, "kl": 0.6655012369155884, "learning_rate": 9.9995884633819e-06, "loss": -0.0456, "num_tokens": 9080559.0, "reward": 0.6348077058792114, "reward_std": 0.061463892459869385, "rewards/rollout_reward_func/mean": 0.6348077058792114, "rewards/rollout_reward_func/std": 0.16458489000797272, "sampling/importance_sampling_ratio/max": 1.0000410079956055, "sampling/importance_sampling_ratio/mean": 0.9337310791015625, "sampling/importance_sampling_ratio/min": 2.4758537234270683e-12, "sampling/sampling_logp_difference/max": 11.620162010192871, "sampling/sampling_logp_difference/mean": 0.17219732701778412, "step": 979, "step_time": 4.314466538984561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5512317498214543, "epoch": 0.0098, "grad_norm": 0.009750492870807648, "kl": 0.6755971759557724, "learning_rate": 9.9995875901109e-06, "loss": -0.0455, "step": 980, "step_time": 2.489941202009504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06812054850161076, "epoch": 0.00981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005337304901331663, "kl": 0.5869993157684803, "learning_rate": 9.999586715914402e-06, "loss": 0.0015, "num_tokens": 9095991.0, "reward": 0.8951153755187988, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8951153755187988, "rewards/rollout_reward_func/std": 0.26807019114494324, "sampling/importance_sampling_ratio/max": 0.9995778799057007, "sampling/importance_sampling_ratio/mean": 0.9948870539665222, "sampling/importance_sampling_ratio/min": 0.9817841053009033, "sampling/sampling_logp_difference/max": 0.014594025909900665, "sampling/sampling_logp_difference/mean": 0.0013600543607026339, "step": 981, "step_time": 4.358062325001811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06777296680957079, "epoch": 0.00982, "grad_norm": 0.000521091977134347, "kl": 0.5870588682591915, "learning_rate": 9.999585840792405e-06, "loss": 0.0015, "step": 982, "step_time": 2.0433351039901027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3358013890683651, "epoch": 0.00983, "frac_reward_zero_std": 0.75, "grad_norm": 0.012414120137691498, "kl": 0.4622191749513149, "learning_rate": 9.999584964744914e-06, "loss": 0.0303, "num_tokens": 9112789.0, "reward": 0.9243268966674805, "reward_std": 0.009518738836050034, "rewards/rollout_reward_func/mean": 0.9243268966674805, "rewards/rollout_reward_func/std": 0.2041756957769394, "sampling/importance_sampling_ratio/max": 1.001843810081482, "sampling/importance_sampling_ratio/mean": 0.9666919708251953, "sampling/importance_sampling_ratio/min": 0.00017297441081609577, "sampling/sampling_logp_difference/max": 2.255746364593506, "sampling/sampling_logp_difference/mean": 0.0553460568189621, "step": 983, "step_time": 4.900658165985078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3393247751519084, "epoch": 0.00984, "grad_norm": 0.011577753350138664, "kl": 0.4578128457069397, "learning_rate": 9.999584087771923e-06, "loss": 0.0302, "step": 984, "step_time": 2.0440306199961924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 150.46875, "completions/mean_terminated_length": 150.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3379069888032973, "epoch": 0.00985, "frac_reward_zero_std": 0.5, "grad_norm": 0.10965323448181152, "kl": 1.1315801590681076, "learning_rate": 9.999583209873438e-06, "loss": -0.0536, "num_tokens": 9129428.0, "reward": 0.6517389416694641, "reward_std": 0.05197642371058464, "rewards/rollout_reward_func/mean": 0.6517389416694641, "rewards/rollout_reward_func/std": 0.22918151319026947, "sampling/importance_sampling_ratio/max": 0.99887615442276, "sampling/importance_sampling_ratio/mean": 0.9338610172271729, "sampling/importance_sampling_ratio/min": 0.004096780437976122, "sampling/sampling_logp_difference/max": 2.6883184909820557, "sampling/sampling_logp_difference/mean": 0.0645560622215271, "step": 985, "step_time": 4.3898993599868845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3399459910579026, "epoch": 0.00986, "grad_norm": 0.08736725151538849, "kl": 1.0287253186106682, "learning_rate": 9.999582331049455e-06, "loss": -0.0539, "step": 986, "step_time": 2.5165262020018417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.048320339526981115, "epoch": 0.00987, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005370485014282167, "kl": 0.42070548608899117, "learning_rate": 9.999581451299976e-06, "loss": 0.0016, "num_tokens": 9147428.0, "reward": 0.6794615387916565, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6794615387916565, "rewards/rollout_reward_func/std": 0.2971336543560028, "sampling/importance_sampling_ratio/max": 1.0010319948196411, "sampling/importance_sampling_ratio/mean": 0.9955390691757202, "sampling/importance_sampling_ratio/min": 0.991960346698761, "sampling/sampling_logp_difference/max": 0.004883691668510437, "sampling/sampling_logp_difference/mean": 0.0010265556629747152, "step": 987, "step_time": 4.353467572007503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05006434163078666, "epoch": 0.00988, "grad_norm": 0.000562503351829946, "kl": 0.42042824253439903, "learning_rate": 9.999580570625e-06, "loss": 0.0016, "step": 988, "step_time": 2.0370871760096634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19455766072496772, "epoch": 0.00989, "frac_reward_zero_std": 0.75, "grad_norm": 0.0049776192754507065, "kl": 0.46101465076208115, "learning_rate": 9.99957968902453e-06, "loss": -0.0172, "num_tokens": 9165904.0, "reward": 0.8537692427635193, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.8537692427635193, "rewards/rollout_reward_func/std": 0.42864322662353516, "sampling/importance_sampling_ratio/max": 1.0002375841140747, "sampling/importance_sampling_ratio/mean": 0.9652583003044128, "sampling/importance_sampling_ratio/min": 0.012336215935647488, "sampling/sampling_logp_difference/max": 2.2228612899780273, "sampling/sampling_logp_difference/mean": 0.027782801538705826, "step": 989, "step_time": 5.201795618988399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1967695807106793, "epoch": 0.0099, "grad_norm": 0.005072753876447678, "kl": 0.4580419473350048, "learning_rate": 9.999578806498565e-06, "loss": -0.0172, "step": 990, "step_time": 2.0358773969928734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.34375, "completions/mean_terminated_length": 165.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.25800058897584677, "epoch": 0.00991, "frac_reward_zero_std": 0.75, "grad_norm": 0.012683252803981304, "kl": 0.42170480638742447, "learning_rate": 9.999577923047102e-06, "loss": 0.0206, "num_tokens": 9183043.0, "reward": 0.6887019276618958, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.6887019276618958, "rewards/rollout_reward_func/std": 0.33682653307914734, "sampling/importance_sampling_ratio/max": 0.9996119141578674, "sampling/importance_sampling_ratio/mean": 0.9638550877571106, "sampling/importance_sampling_ratio/min": 0.0035067263524979353, "sampling/sampling_logp_difference/max": 1.868299961090088, "sampling/sampling_logp_difference/mean": 0.03629238158464432, "step": 991, "step_time": 4.704624095997133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25871960911899805, "epoch": 0.00992, "grad_norm": 0.012898429296910763, "kl": 0.4211234897375107, "learning_rate": 9.999577038670144e-06, "loss": 0.0206, "step": 992, "step_time": 2.0264671699915198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 155.8064422607422, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5908956667408347, "epoch": 0.00993, "frac_reward_zero_std": 0.75, "grad_norm": 0.012230284512043, "kl": 0.5371934734284878, "learning_rate": 9.999576153367693e-06, "loss": -0.0077, "num_tokens": 9199903.0, "reward": 0.7976441979408264, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.7976441979408264, "rewards/rollout_reward_func/std": 0.19779004156589508, "sampling/importance_sampling_ratio/max": 0.9992206692695618, "sampling/importance_sampling_ratio/mean": 0.9598910212516785, "sampling/importance_sampling_ratio/min": 3.4196829391218115e-18, "sampling/sampling_logp_difference/max": 4.211490154266357, "sampling/sampling_logp_difference/mean": 0.2547512352466583, "step": 993, "step_time": 4.3114320849999785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.593431593850255, "epoch": 0.00994, "grad_norm": 0.012677217833697796, "kl": 0.5375992804765701, "learning_rate": 9.999575267139748e-06, "loss": -0.0077, "step": 994, "step_time": 2.459031671984121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8061101362109184, "epoch": 0.00995, "frac_reward_zero_std": 0.25, "grad_norm": 0.03989708423614502, "kl": 0.5969070717692375, "learning_rate": 9.999574379986306e-06, "loss": 0.0178, "num_tokens": 9217997.0, "reward": 0.6726812124252319, "reward_std": 0.02819264680147171, "rewards/rollout_reward_func/mean": 0.6726812124252319, "rewards/rollout_reward_func/std": 0.29431042075157166, "sampling/importance_sampling_ratio/max": 1.0017207860946655, "sampling/importance_sampling_ratio/mean": 0.8721593618392944, "sampling/importance_sampling_ratio/min": 0.00010045841918326914, "sampling/sampling_logp_difference/max": 2.1742184162139893, "sampling/sampling_logp_difference/mean": 0.1381203979253769, "step": 995, "step_time": 4.513825654998072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7982591735199094, "epoch": 0.00996, "grad_norm": 0.037520881742239, "kl": 0.5910481251776218, "learning_rate": 9.99957349190737e-06, "loss": 0.0177, "step": 996, "step_time": 2.05831492201105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 174.65625, "completions/mean_terminated_length": 174.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21309462282806635, "epoch": 0.00997, "frac_reward_zero_std": 0.75, "grad_norm": 0.02045913226902485, "kl": 0.49125083163380623, "learning_rate": 9.99957260290294e-06, "loss": -0.0261, "num_tokens": 9235394.0, "reward": 0.9108365774154663, "reward_std": 0.002311693038791418, "rewards/rollout_reward_func/mean": 0.9108365774154663, "rewards/rollout_reward_func/std": 0.4048626720905304, "sampling/importance_sampling_ratio/max": 1.0045490264892578, "sampling/importance_sampling_ratio/mean": 0.9666628837585449, "sampling/importance_sampling_ratio/min": 0.03970900923013687, "sampling/sampling_logp_difference/max": 1.5787841081619263, "sampling/sampling_logp_difference/mean": 0.018515892326831818, "step": 997, "step_time": 4.77001851399109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2156939748674631, "epoch": 0.00998, "grad_norm": 0.020003046840429306, "kl": 0.49066758900880814, "learning_rate": 9.999571712973018e-06, "loss": -0.0261, "step": 998, "step_time": 2.0157073800073704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5026715639978647, "epoch": 0.00999, "frac_reward_zero_std": 0.75, "grad_norm": 0.005626144353300333, "kl": 0.48933740705251694, "learning_rate": 9.9995708221176e-06, "loss": -0.0232, "num_tokens": 9253318.0, "reward": 0.9441538453102112, "reward_std": 0.014243385754525661, "rewards/rollout_reward_func/mean": 0.9441538453102112, "rewards/rollout_reward_func/std": 0.35598739981651306, "sampling/importance_sampling_ratio/max": 1.0042346715927124, "sampling/importance_sampling_ratio/mean": 0.9351982474327087, "sampling/importance_sampling_ratio/min": 0.008233502507209778, "sampling/sampling_logp_difference/max": 2.0068962574005127, "sampling/sampling_logp_difference/mean": 0.061246998608112335, "step": 999, "step_time": 4.605162161002227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5031126709654927, "epoch": 0.01, "grad_norm": 0.005407037679105997, "kl": 0.489312332123518, "learning_rate": 9.99956993033669e-06, "loss": -0.0232, "step": 1000, "step_time": 2.5157650860055583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 213.53125, "completions/mean_terminated_length": 213.53125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.6472302237525582, "epoch": 0.01001, "frac_reward_zero_std": 0.5, "grad_norm": 0.009554068557918072, "kl": 0.4317055009305477, "learning_rate": 9.999569037630288e-06, "loss": 0.0019, "num_tokens": 9271543.0, "reward": 0.9744614958763123, "reward_std": 0.03154784440994263, "rewards/rollout_reward_func/mean": 0.9744614958763123, "rewards/rollout_reward_func/std": 0.3159627616405487, "sampling/importance_sampling_ratio/max": 1.0057196617126465, "sampling/importance_sampling_ratio/mean": 0.9350249767303467, "sampling/importance_sampling_ratio/min": 3.921523490590383e-18, "sampling/sampling_logp_difference/max": 10.934523582458496, "sampling/sampling_logp_difference/mean": 0.27144771814346313, "step": 1001, "step_time": 4.575240960002702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6463930737227201, "epoch": 0.01002, "grad_norm": 0.009858638048171997, "kl": 0.4307410456240177, "learning_rate": 9.99956814399839e-06, "loss": 0.0018, "step": 1002, "step_time": 2.039999831999012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.8601890038698912, "epoch": 0.01003, "frac_reward_zero_std": 0.25, "grad_norm": 0.027555223554372787, "kl": 0.5866316631436348, "learning_rate": 9.999567249441e-06, "loss": -0.0712, "num_tokens": 9288249.0, "reward": 0.7655288577079773, "reward_std": 0.08581102639436722, "rewards/rollout_reward_func/mean": 0.7655288577079773, "rewards/rollout_reward_func/std": 0.24047040939331055, "sampling/importance_sampling_ratio/max": 1.00131094455719, "sampling/importance_sampling_ratio/mean": 0.8703720569610596, "sampling/importance_sampling_ratio/min": 0.00022794897085987031, "sampling/sampling_logp_difference/max": 2.2796523571014404, "sampling/sampling_logp_difference/mean": 0.1397230178117752, "step": 1003, "step_time": 4.603438228012237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8601545775309205, "epoch": 0.01004, "grad_norm": 0.02715238742530346, "kl": 0.5912521816790104, "learning_rate": 9.999566353958118e-06, "loss": -0.0712, "step": 1004, "step_time": 2.014401512002223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 164.40625, "completions/mean_terminated_length": 166.258056640625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9321952071040869, "epoch": 0.01005, "frac_reward_zero_std": 0.5, "grad_norm": 0.012975446879863739, "kl": 0.5086901374161243, "learning_rate": 9.999565457549745e-06, "loss": -0.0699, "num_tokens": 9304990.0, "reward": 0.8065913915634155, "reward_std": 0.04912584275007248, "rewards/rollout_reward_func/mean": 0.8065913915634155, "rewards/rollout_reward_func/std": 0.17433354258537292, "sampling/importance_sampling_ratio/max": 1.0005382299423218, "sampling/importance_sampling_ratio/mean": 0.8804376125335693, "sampling/importance_sampling_ratio/min": 2.140212540770099e-17, "sampling/sampling_logp_difference/max": 4.150557518005371, "sampling/sampling_logp_difference/mean": 0.2207678258419037, "step": 1005, "step_time": 4.605903378003859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.933323216624558, "epoch": 0.01006, "grad_norm": 0.012612374499440193, "kl": 0.5113950558006763, "learning_rate": 9.999564560215878e-06, "loss": -0.0699, "step": 1006, "step_time": 2.5140237539890222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 203.53125, "completions/mean_terminated_length": 203.53125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.42131903301924467, "epoch": 0.01007, "frac_reward_zero_std": 0.5, "grad_norm": 0.018961794674396515, "kl": 0.5011116899549961, "learning_rate": 9.999563661956521e-06, "loss": 0.0027, "num_tokens": 9323383.0, "reward": 1.022129774093628, "reward_std": 0.015189195051789284, "rewards/rollout_reward_func/mean": 1.022129774093628, "rewards/rollout_reward_func/std": 0.20519331097602844, "sampling/importance_sampling_ratio/max": 1.0004302263259888, "sampling/importance_sampling_ratio/mean": 0.9323863983154297, "sampling/importance_sampling_ratio/min": 0.0002677200245670974, "sampling/sampling_logp_difference/max": 2.4505324363708496, "sampling/sampling_logp_difference/mean": 0.06822488456964493, "step": 1007, "step_time": 4.419637470004091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4188512936234474, "epoch": 0.01008, "grad_norm": 0.018411381170153618, "kl": 0.4997747167944908, "learning_rate": 9.999562762771671e-06, "loss": 0.0027, "step": 1008, "step_time": 2.0419828700105427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10146982595324516, "epoch": 0.01009, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012290432350710034, "kl": 0.5468728542327881, "learning_rate": 9.999561862661328e-06, "loss": 0.0017, "num_tokens": 9340447.0, "reward": 0.7251923084259033, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7251923084259033, "rewards/rollout_reward_func/std": 0.2438761591911316, "sampling/importance_sampling_ratio/max": 0.9999406337738037, "sampling/importance_sampling_ratio/mean": 0.9887344837188721, "sampling/importance_sampling_ratio/min": 0.9774584770202637, "sampling/sampling_logp_difference/max": 0.01119598001241684, "sampling/sampling_logp_difference/mean": 0.00268524675630033, "step": 1009, "step_time": 4.556701335997786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10271441470831633, "epoch": 0.0101, "grad_norm": 0.001260671648196876, "kl": 0.5465990975499153, "learning_rate": 9.999560961625496e-06, "loss": 0.0017, "step": 1010, "step_time": 2.022627799997281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 103.9375, "completions/mean_terminated_length": 103.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5347274048253894, "epoch": 0.01011, "frac_reward_zero_std": 0.75, "grad_norm": 0.02063979022204876, "kl": 0.6130469441413879, "learning_rate": 9.999560059664175e-06, "loss": -0.0101, "num_tokens": 9355821.0, "reward": 0.5700000524520874, "reward_std": 0.025450291112065315, "rewards/rollout_reward_func/mean": 0.5700000524520874, "rewards/rollout_reward_func/std": 0.14470984041690826, "sampling/importance_sampling_ratio/max": 1.001609444618225, "sampling/importance_sampling_ratio/mean": 0.9256579875946045, "sampling/importance_sampling_ratio/min": 0.015117833390831947, "sampling/sampling_logp_difference/max": 2.286466121673584, "sampling/sampling_logp_difference/mean": 0.08799567818641663, "step": 1011, "step_time": 4.585022077997564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5329947341233492, "epoch": 0.01012, "grad_norm": 0.019892597571015358, "kl": 0.6136961691081524, "learning_rate": 9.999559156777358e-06, "loss": -0.0101, "step": 1012, "step_time": 2.036721893993672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4000342022627592, "epoch": 0.01013, "frac_reward_zero_std": 0.5, "grad_norm": 0.015604376792907715, "kl": 0.502425093203783, "learning_rate": 9.999558252965055e-06, "loss": -0.0553, "num_tokens": 9374073.0, "reward": 0.8136249780654907, "reward_std": 0.022709013894200325, "rewards/rollout_reward_func/mean": 0.8136249780654907, "rewards/rollout_reward_func/std": 0.18373185396194458, "sampling/importance_sampling_ratio/max": 1.0009338855743408, "sampling/importance_sampling_ratio/mean": 0.9339480400085449, "sampling/importance_sampling_ratio/min": 0.011852099560201168, "sampling/sampling_logp_difference/max": 1.8211791515350342, "sampling/sampling_logp_difference/mean": 0.04780115932226181, "step": 1013, "step_time": 4.577463813009672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3941255370154977, "epoch": 0.01014, "grad_norm": 0.014519373886287212, "kl": 0.507741991430521, "learning_rate": 9.99955734822726e-06, "loss": -0.0553, "step": 1014, "step_time": 2.526375964996987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 121.5625, "completions/mean_terminated_length": 121.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23771504499018192, "epoch": 0.01015, "frac_reward_zero_std": 0.75, "grad_norm": 0.015095395036041737, "kl": 0.4450644329190254, "learning_rate": 9.999556442563976e-06, "loss": -0.0174, "num_tokens": 9389603.0, "reward": 0.6463942527770996, "reward_std": 0.009110790677368641, "rewards/rollout_reward_func/mean": 0.6463942527770996, "rewards/rollout_reward_func/std": 0.1564626544713974, "sampling/importance_sampling_ratio/max": 1.001703143119812, "sampling/importance_sampling_ratio/mean": 0.9638994932174683, "sampling/importance_sampling_ratio/min": 0.030819810926914215, "sampling/sampling_logp_difference/max": 1.6759823560714722, "sampling/sampling_logp_difference/mean": 0.024021204560995102, "step": 1015, "step_time": 4.088594971981365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23632419668138027, "epoch": 0.01016, "grad_norm": 0.01531276199966669, "kl": 0.4442466013133526, "learning_rate": 9.9995555359752e-06, "loss": -0.0174, "step": 1016, "step_time": 2.0329147139927954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 161.65625, "completions/mean_terminated_length": 161.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9499793890863657, "epoch": 0.01017, "frac_reward_zero_std": 0.25, "grad_norm": 0.03291711211204529, "kl": 0.6408760696649551, "learning_rate": 9.999554628460936e-06, "loss": -0.0656, "num_tokens": 9406768.0, "reward": 0.6505288481712341, "reward_std": 0.06051202118396759, "rewards/rollout_reward_func/mean": 0.6505288481712341, "rewards/rollout_reward_func/std": 0.0923648327589035, "sampling/importance_sampling_ratio/max": 1.0524829626083374, "sampling/importance_sampling_ratio/mean": 0.9036310911178589, "sampling/importance_sampling_ratio/min": 3.2281559469993226e-06, "sampling/sampling_logp_difference/max": 3.2653229236602783, "sampling/sampling_logp_difference/mean": 0.20489737391471863, "step": 1017, "step_time": 4.992770089003898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9443442914634943, "epoch": 0.01018, "grad_norm": 0.031959619373083115, "kl": 0.6401074305176735, "learning_rate": 9.999553720021182e-06, "loss": -0.0657, "step": 1018, "step_time": 2.0652233739892836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 158.59375, "completions/mean_terminated_length": 158.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14597377367317677, "epoch": 0.01019, "frac_reward_zero_std": 0.75, "grad_norm": 0.04668353125452995, "kl": 1.246734518557787, "learning_rate": 9.999552810655939e-06, "loss": -0.0202, "num_tokens": 9424099.0, "reward": 0.7972307801246643, "reward_std": 0.17568883299827576, "rewards/rollout_reward_func/mean": 0.7972307801246643, "rewards/rollout_reward_func/std": 0.4746287763118744, "sampling/importance_sampling_ratio/max": 0.9999182224273682, "sampling/importance_sampling_ratio/mean": 0.9679373502731323, "sampling/importance_sampling_ratio/min": 0.26604193449020386, "sampling/sampling_logp_difference/max": 1.260497808456421, "sampling/sampling_logp_difference/mean": 0.010332784615457058, "step": 1019, "step_time": 4.434600171000056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14405733533203602, "epoch": 0.0102, "grad_norm": 0.04559019207954407, "kl": 1.2463982477784157, "learning_rate": 9.999551900365207e-06, "loss": -0.0203, "step": 1020, "step_time": 2.5172270639886847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5802056090906262, "epoch": 0.01021, "frac_reward_zero_std": 0.5, "grad_norm": 0.01567806489765644, "kl": 0.6007457561790943, "learning_rate": 9.999550989148985e-06, "loss": -0.0363, "num_tokens": 9441375.0, "reward": 0.8982884883880615, "reward_std": 0.07479014247655869, "rewards/rollout_reward_func/mean": 0.8982884883880615, "rewards/rollout_reward_func/std": 0.3736075162887573, "sampling/importance_sampling_ratio/max": 1.0038411617279053, "sampling/importance_sampling_ratio/mean": 0.9329283237457275, "sampling/importance_sampling_ratio/min": 0.0001883284712675959, "sampling/sampling_logp_difference/max": 2.1432158946990967, "sampling/sampling_logp_difference/mean": 0.10848433524370193, "step": 1021, "step_time": 4.340215390009689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5795192383229733, "epoch": 0.01022, "grad_norm": 0.013970139436423779, "kl": 0.6032629758119583, "learning_rate": 9.999550077007277e-06, "loss": -0.0363, "step": 1022, "step_time": 2.5055925980122993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.26473930990323424, "epoch": 0.01023, "frac_reward_zero_std": 0.75, "grad_norm": 0.007376368623226881, "kl": 0.45269129797816277, "learning_rate": 9.999549163940078e-06, "loss": -0.0174, "num_tokens": 9458429.0, "reward": 0.659826934337616, "reward_std": 0.020397311076521873, "rewards/rollout_reward_func/mean": 0.659826934337616, "rewards/rollout_reward_func/std": 0.2847161889076233, "sampling/importance_sampling_ratio/max": 1.0030138492584229, "sampling/importance_sampling_ratio/mean": 0.9664081335067749, "sampling/importance_sampling_ratio/min": 0.01275581307709217, "sampling/sampling_logp_difference/max": 1.8355071544647217, "sampling/sampling_logp_difference/mean": 0.023931516334414482, "step": 1023, "step_time": 4.607298563991208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2656154865399003, "epoch": 0.01024, "grad_norm": 0.006023134104907513, "kl": 0.4631698913872242, "learning_rate": 9.999548249947393e-06, "loss": -0.0174, "step": 1024, "step_time": 2.0404447880064254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 142.53125, "completions/mean_terminated_length": 142.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1926607247442007, "epoch": 0.01025, "frac_reward_zero_std": 0.75, "grad_norm": 0.01674574241042137, "kl": 0.5001520477235317, "learning_rate": 9.999547335029218e-06, "loss": -0.0266, "num_tokens": 9474574.0, "reward": 0.7197595834732056, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.7197595834732056, "rewards/rollout_reward_func/std": 0.09911546111106873, "sampling/importance_sampling_ratio/max": 1.0023797750473022, "sampling/importance_sampling_ratio/mean": 0.9671651124954224, "sampling/importance_sampling_ratio/min": 0.042101893573999405, "sampling/sampling_logp_difference/max": 1.4346156120300293, "sampling/sampling_logp_difference/mean": 0.021462664008140564, "step": 1025, "step_time": 4.3240609869899345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19134364603087306, "epoch": 0.01026, "grad_norm": 0.016532929614186287, "kl": 0.5004912614822388, "learning_rate": 9.999546419185557e-06, "loss": -0.0266, "step": 1026, "step_time": 2.4570568620038102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2702617500908673, "epoch": 0.01027, "frac_reward_zero_std": 0.5, "grad_norm": 0.027505626901984215, "kl": 0.5545465014874935, "learning_rate": 9.99954550241641e-06, "loss": -0.053, "num_tokens": 9491824.0, "reward": 0.8549230694770813, "reward_std": 0.03807498514652252, "rewards/rollout_reward_func/mean": 0.8549230694770813, "rewards/rollout_reward_func/std": 0.3559803366661072, "sampling/importance_sampling_ratio/max": 1.2634820938110352, "sampling/importance_sampling_ratio/mean": 0.993103563785553, "sampling/importance_sampling_ratio/min": 0.018211517482995987, "sampling/sampling_logp_difference/max": 2.269059181213379, "sampling/sampling_logp_difference/mean": 0.0610366128385067, "step": 1027, "step_time": 4.562868417997379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26802025688812137, "epoch": 0.01028, "grad_norm": 0.026517756283283234, "kl": 0.5517394207417965, "learning_rate": 9.999544584721775e-06, "loss": -0.053, "step": 1028, "step_time": 2.477632383008313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 166.84375, "completions/mean_terminated_length": 166.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5622444916516542, "epoch": 0.01029, "frac_reward_zero_std": 0.5, "grad_norm": 0.026143155992031097, "kl": 0.6786135956645012, "learning_rate": 9.99954366610165e-06, "loss": -0.0538, "num_tokens": 9509531.0, "reward": 0.6944279074668884, "reward_std": 0.009559541940689087, "rewards/rollout_reward_func/mean": 0.6944279074668884, "rewards/rollout_reward_func/std": 0.21268177032470703, "sampling/importance_sampling_ratio/max": 1.0014671087265015, "sampling/importance_sampling_ratio/mean": 0.9114025831222534, "sampling/importance_sampling_ratio/min": 4.1161500121233985e-05, "sampling/sampling_logp_difference/max": 3.512214183807373, "sampling/sampling_logp_difference/mean": 0.09426917135715485, "step": 1029, "step_time": 4.59913935399527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5612073563970625, "epoch": 0.0103, "grad_norm": 0.02858978509902954, "kl": 0.6785592958331108, "learning_rate": 9.999542746556042e-06, "loss": -0.0538, "step": 1030, "step_time": 2.0327106049880967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06956359650939703, "epoch": 0.01031, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006146542145870626, "kl": 0.5565061792731285, "learning_rate": 9.999541826084947e-06, "loss": 0.0015, "num_tokens": 9525483.0, "reward": 0.9887692332267761, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9887692332267761, "rewards/rollout_reward_func/std": 0.2765207886695862, "sampling/importance_sampling_ratio/max": 1.0072035789489746, "sampling/importance_sampling_ratio/mean": 0.9971880316734314, "sampling/importance_sampling_ratio/min": 0.9890609979629517, "sampling/sampling_logp_difference/max": 0.008297467604279518, "sampling/sampling_logp_difference/mean": 0.0017261694883927703, "step": 1031, "step_time": 4.960770963007235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0692220265045762, "epoch": 0.01032, "grad_norm": 0.0006032798555679619, "kl": 0.5565756149590015, "learning_rate": 9.999540904688363e-06, "loss": 0.0015, "step": 1032, "step_time": 2.031125035005971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.27119848830625415, "epoch": 0.01033, "frac_reward_zero_std": 0.75, "grad_norm": 0.006470487453043461, "kl": 0.5297772586345673, "learning_rate": 9.999539982366296e-06, "loss": -0.0364, "num_tokens": 9543035.0, "reward": 0.5628894567489624, "reward_std": 0.0030324009712785482, "rewards/rollout_reward_func/mean": 0.5628894567489624, "rewards/rollout_reward_func/std": 0.2021014541387558, "sampling/importance_sampling_ratio/max": 1.0055904388427734, "sampling/importance_sampling_ratio/mean": 0.9684923887252808, "sampling/importance_sampling_ratio/min": 0.0008389265858568251, "sampling/sampling_logp_difference/max": 2.02952241897583, "sampling/sampling_logp_difference/mean": 0.03391432762145996, "step": 1033, "step_time": 4.556236281001475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.272874987218529, "epoch": 0.01034, "grad_norm": 0.0077501144260168076, "kl": 0.5401902310550213, "learning_rate": 9.99953905911874e-06, "loss": -0.0364, "step": 1034, "step_time": 2.031109297997318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2806744067929685, "epoch": 0.01035, "frac_reward_zero_std": 0.75, "grad_norm": 0.007257014978677034, "kl": 0.5112849101424217, "learning_rate": 9.999538134945701e-06, "loss": -0.027, "num_tokens": 9560919.0, "reward": 0.7487692832946777, "reward_std": 0.05167318880558014, "rewards/rollout_reward_func/mean": 0.7487692832946777, "rewards/rollout_reward_func/std": 0.30297499895095825, "sampling/importance_sampling_ratio/max": 1.0038436651229858, "sampling/importance_sampling_ratio/mean": 0.9659422039985657, "sampling/importance_sampling_ratio/min": 0.0005198978469707072, "sampling/sampling_logp_difference/max": 2.241046905517578, "sampling/sampling_logp_difference/mean": 0.04820840433239937, "step": 1035, "step_time": 4.3036191040155245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27982144756242633, "epoch": 0.01036, "grad_norm": 0.006926164496690035, "kl": 0.5176823809742928, "learning_rate": 9.999537209847177e-06, "loss": -0.027, "step": 1036, "step_time": 2.0262460859958082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6486286437138915, "epoch": 0.01037, "frac_reward_zero_std": 0.5, "grad_norm": 0.008300726301968098, "kl": 0.5691541656851768, "learning_rate": 9.999536283823168e-06, "loss": -0.0548, "num_tokens": 9576475.0, "reward": 0.7980769276618958, "reward_std": 0.03609853982925415, "rewards/rollout_reward_func/mean": 0.7980769276618958, "rewards/rollout_reward_func/std": 0.34846529364585876, "sampling/importance_sampling_ratio/max": 1.0011322498321533, "sampling/importance_sampling_ratio/mean": 0.905137836933136, "sampling/importance_sampling_ratio/min": 0.005469200201332569, "sampling/sampling_logp_difference/max": 2.1844704151153564, "sampling/sampling_logp_difference/mean": 0.0915183275938034, "step": 1037, "step_time": 4.592736474995036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6321344887837768, "epoch": 0.01038, "grad_norm": 0.008254902437329292, "kl": 0.5722580291330814, "learning_rate": 9.999535356873673e-06, "loss": -0.0548, "step": 1038, "step_time": 2.015018825994048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0541776311583817, "epoch": 0.01039, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046607572585344315, "kl": 0.44584856927394867, "learning_rate": 9.999534428998694e-06, "loss": 0.0012, "num_tokens": 9592227.0, "reward": 0.7200000286102295, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7200000286102295, "rewards/rollout_reward_func/std": 0.26447686553001404, "sampling/importance_sampling_ratio/max": 1.0001139640808105, "sampling/importance_sampling_ratio/mean": 0.9973793029785156, "sampling/importance_sampling_ratio/min": 0.9930495023727417, "sampling/sampling_logp_difference/max": 0.0036298297345638275, "sampling/sampling_logp_difference/mean": 0.0009753627819009125, "step": 1039, "step_time": 4.468837249005446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05400705710053444, "epoch": 0.0104, "grad_norm": 0.0004604867717716843, "kl": 0.44588104262948036, "learning_rate": 9.999533500198229e-06, "loss": 0.0012, "step": 1040, "step_time": 2.031628621989512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 127.03125, "completions/mean_terminated_length": 127.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6781717808917165, "epoch": 0.01041, "frac_reward_zero_std": 0.75, "grad_norm": 0.006315798033028841, "kl": 0.6083735972642899, "learning_rate": 9.999532570472281e-06, "loss": -0.0274, "num_tokens": 9607772.0, "reward": 0.9287172555923462, "reward_std": 0.012657222338020802, "rewards/rollout_reward_func/mean": 0.9287172555923462, "rewards/rollout_reward_func/std": 0.3276354670524597, "sampling/importance_sampling_ratio/max": 1.0052229166030884, "sampling/importance_sampling_ratio/mean": 0.9356458187103271, "sampling/importance_sampling_ratio/min": 2.030685347789476e-24, "sampling/sampling_logp_difference/max": 6.838440895080566, "sampling/sampling_logp_difference/mean": 0.375669002532959, "step": 1041, "step_time": 4.086210578992905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6766580091789365, "epoch": 0.01042, "grad_norm": 0.007122888695448637, "kl": 0.6165167205035686, "learning_rate": 9.99953163982085e-06, "loss": -0.0274, "step": 1042, "step_time": 2.0347004529903643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 189.71875, "completions/mean_terminated_length": 189.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.34501351276412606, "epoch": 0.01043, "frac_reward_zero_std": 0.5, "grad_norm": 0.018493205308914185, "kl": 0.5900870561599731, "learning_rate": 9.999530708243934e-06, "loss": -0.055, "num_tokens": 9625531.0, "reward": 0.6077548265457153, "reward_std": 0.0553039126098156, "rewards/rollout_reward_func/mean": 0.6077548265457153, "rewards/rollout_reward_func/std": 0.1227862760424614, "sampling/importance_sampling_ratio/max": 1.0032306909561157, "sampling/importance_sampling_ratio/mean": 0.9344620704650879, "sampling/importance_sampling_ratio/min": 1.3426567635477227e-09, "sampling/sampling_logp_difference/max": 12.22311019897461, "sampling/sampling_logp_difference/mean": 0.1276443898677826, "step": 1043, "step_time": 4.934344886998588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34669165406376123, "epoch": 0.01044, "grad_norm": 0.017895624041557312, "kl": 0.5832725130021572, "learning_rate": 9.999529775741534e-06, "loss": -0.055, "step": 1044, "step_time": 2.041238579011406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2929128869436681, "epoch": 0.01045, "frac_reward_zero_std": 0.75, "grad_norm": 0.006537375971674919, "kl": 0.5991233885288239, "learning_rate": 9.999528842313652e-06, "loss": -0.0268, "num_tokens": 9643187.0, "reward": 0.7941057682037354, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.7941057682037354, "rewards/rollout_reward_func/std": 0.40701809525489807, "sampling/importance_sampling_ratio/max": 1.0010793209075928, "sampling/importance_sampling_ratio/mean": 0.9642227292060852, "sampling/importance_sampling_ratio/min": 0.00026280790916644037, "sampling/sampling_logp_difference/max": 2.1533589363098145, "sampling/sampling_logp_difference/mean": 0.043229032307863235, "step": 1045, "step_time": 4.830792476008355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28859948134049773, "epoch": 0.01046, "grad_norm": 0.006332087330520153, "kl": 0.599343866109848, "learning_rate": 9.999527907960287e-06, "loss": -0.0268, "step": 1046, "step_time": 2.0396790850063553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 100.1875, "completions/mean_terminated_length": 100.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2199427504092455, "epoch": 0.01047, "frac_reward_zero_std": 0.75, "grad_norm": 0.00720607116818428, "kl": 0.6316672787070274, "learning_rate": 9.999526972681438e-06, "loss": -0.0173, "num_tokens": 9657873.0, "reward": 0.6932836771011353, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.6932836771011353, "rewards/rollout_reward_func/std": 0.2820006310939789, "sampling/importance_sampling_ratio/max": 1.003796100616455, "sampling/importance_sampling_ratio/mean": 0.9664442539215088, "sampling/importance_sampling_ratio/min": 0.014381813816726208, "sampling/sampling_logp_difference/max": 2.008634567260742, "sampling/sampling_logp_difference/mean": 0.033325254917144775, "step": 1047, "step_time": 3.8571865409976454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2213567686267197, "epoch": 0.01048, "grad_norm": 0.006168148014694452, "kl": 0.6271566599607468, "learning_rate": 9.999526036477107e-06, "loss": -0.0173, "step": 1048, "step_time": 2.475993889005622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.09836462140083313, "epoch": 0.01049, "frac_reward_zero_std": 0.75, "grad_norm": 0.023974627256393433, "kl": 0.7721774280071259, "learning_rate": 9.999525099347293e-06, "loss": -0.0313, "num_tokens": 9675723.0, "reward": 0.7677451968193054, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.7677451968193054, "rewards/rollout_reward_func/std": 0.37704935669898987, "sampling/importance_sampling_ratio/max": 1.0019224882125854, "sampling/importance_sampling_ratio/mean": 0.970941960811615, "sampling/importance_sampling_ratio/min": 0.1536371260881424, "sampling/sampling_logp_difference/max": 1.7477600574493408, "sampling/sampling_logp_difference/mean": 0.009447665885090828, "step": 1049, "step_time": 4.482862990000285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09790196921676397, "epoch": 0.0105, "grad_norm": 0.02424018271267414, "kl": 0.7720134519040585, "learning_rate": 9.999524161291997e-06, "loss": -0.0313, "step": 1050, "step_time": 2.4707201730125234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.17426930228248239, "epoch": 0.01051, "frac_reward_zero_std": 0.75, "grad_norm": 0.020644865930080414, "kl": 0.4380851313471794, "learning_rate": 9.99952322231122e-06, "loss": 0.0202, "num_tokens": 9692797.0, "reward": 0.5110769271850586, "reward_std": 0.024476774036884308, "rewards/rollout_reward_func/mean": 0.5110769271850586, "rewards/rollout_reward_func/std": 0.18088527023792267, "sampling/importance_sampling_ratio/max": 1.004369854927063, "sampling/importance_sampling_ratio/mean": 0.9678804278373718, "sampling/importance_sampling_ratio/min": 0.026442190632224083, "sampling/sampling_logp_difference/max": 1.341221570968628, "sampling/sampling_logp_difference/mean": 0.019894199445843697, "step": 1051, "step_time": 4.241627265000716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17559034517034888, "epoch": 0.01052, "grad_norm": 0.021326448768377304, "kl": 0.44473784789443016, "learning_rate": 9.99952228240496e-06, "loss": 0.0203, "step": 1052, "step_time": 2.0274779130049865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 27.96875, "completions/mean_terminated_length": 27.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3451079782098532, "epoch": 0.01053, "frac_reward_zero_std": 0.75, "grad_norm": 0.016121838241815567, "kl": 0.9450449794530869, "learning_rate": 9.999521341573221e-06, "loss": 0.0111, "num_tokens": 9705804.0, "reward": 0.6913461685180664, "reward_std": 0.013598205521702766, "rewards/rollout_reward_func/mean": 0.6913461685180664, "rewards/rollout_reward_func/std": 0.1374896913766861, "sampling/importance_sampling_ratio/max": 1.001473069190979, "sampling/importance_sampling_ratio/mean": 0.9621217846870422, "sampling/importance_sampling_ratio/min": 0.0014649060321971774, "sampling/sampling_logp_difference/max": 2.5045909881591797, "sampling/sampling_logp_difference/mean": 0.06814099848270416, "step": 1053, "step_time": 3.5654136540033505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3440041597932577, "epoch": 0.01054, "grad_norm": 0.01698819175362587, "kl": 0.950714722275734, "learning_rate": 9.999520399815998e-06, "loss": 0.0111, "step": 1054, "step_time": 2.4218462699936936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 172.0625, "completions/mean_terminated_length": 172.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3407588843256235, "epoch": 0.01055, "frac_reward_zero_std": 0.75, "grad_norm": 0.006474251393228769, "kl": 0.48520367220044136, "learning_rate": 9.999519457133295e-06, "loss": -0.0176, "num_tokens": 9723222.0, "reward": 0.6883845925331116, "reward_std": 0.017949629575014114, "rewards/rollout_reward_func/mean": 0.6883845925331116, "rewards/rollout_reward_func/std": 0.287502646446228, "sampling/importance_sampling_ratio/max": 1.0011292695999146, "sampling/importance_sampling_ratio/mean": 0.9662191271781921, "sampling/importance_sampling_ratio/min": 6.193621084094048e-05, "sampling/sampling_logp_difference/max": 2.7525274753570557, "sampling/sampling_logp_difference/mean": 0.07513101398944855, "step": 1055, "step_time": 4.547832808995736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3393716919235885, "epoch": 0.01056, "grad_norm": 0.006475072354078293, "kl": 0.4846251532435417, "learning_rate": 9.999518513525112e-06, "loss": -0.0176, "step": 1056, "step_time": 2.469630548999703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.39765203883871436, "epoch": 0.01057, "frac_reward_zero_std": 0.75, "grad_norm": 0.004657596815377474, "kl": 0.4216076396405697, "learning_rate": 9.999517568991448e-06, "loss": -0.0274, "num_tokens": 9740088.0, "reward": 0.9706250429153442, "reward_std": 0.0265165027230978, "rewards/rollout_reward_func/mean": 0.9706250429153442, "rewards/rollout_reward_func/std": 0.18988282978534698, "sampling/importance_sampling_ratio/max": 1.0008904933929443, "sampling/importance_sampling_ratio/mean": 0.9672774076461792, "sampling/importance_sampling_ratio/min": 3.675552971791479e-13, "sampling/sampling_logp_difference/max": 3.149332046508789, "sampling/sampling_logp_difference/mean": 0.13506321609020233, "step": 1057, "step_time": 4.410855450980307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39693148620426655, "epoch": 0.01058, "grad_norm": 0.004741901997476816, "kl": 0.42155591771006584, "learning_rate": 9.999516623532303e-06, "loss": -0.0274, "step": 1058, "step_time": 2.059080072984216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 172.22579956054688, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2999912756495178, "epoch": 0.01059, "frac_reward_zero_std": 0.75, "grad_norm": 0.017562557011842728, "kl": 0.5086987465620041, "learning_rate": 9.99951567714768e-06, "loss": -0.0236, "num_tokens": 9757120.0, "reward": 1.024827003479004, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 1.024827003479004, "rewards/rollout_reward_func/std": 0.25390341877937317, "sampling/importance_sampling_ratio/max": 1.0040104389190674, "sampling/importance_sampling_ratio/mean": 0.9422109723091125, "sampling/importance_sampling_ratio/min": 0.0050460658967494965, "sampling/sampling_logp_difference/max": 1.6131680011749268, "sampling/sampling_logp_difference/mean": 0.04473302140831947, "step": 1059, "step_time": 4.327568262007844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30047428933903575, "epoch": 0.0106, "grad_norm": 0.018191132694482803, "kl": 0.5137805230915546, "learning_rate": 9.999514729837577e-06, "loss": -0.0236, "step": 1060, "step_time": 2.5006981159895076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 161.84375, "completions/mean_terminated_length": 161.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11438354244455695, "epoch": 0.01061, "frac_reward_zero_std": 0.75, "grad_norm": 0.02369179017841816, "kl": 0.7784518003463745, "learning_rate": 9.999513781601992e-06, "loss": -0.0232, "num_tokens": 9774291.0, "reward": 0.865788459777832, "reward_std": 0.1416933238506317, "rewards/rollout_reward_func/mean": 0.865788459777832, "rewards/rollout_reward_func/std": 0.48086312413215637, "sampling/importance_sampling_ratio/max": 1.0031136274337769, "sampling/importance_sampling_ratio/mean": 0.9695805311203003, "sampling/importance_sampling_ratio/min": 0.16533814370632172, "sampling/sampling_logp_difference/max": 1.7376419305801392, "sampling/sampling_logp_difference/mean": 0.010766924358904362, "step": 1061, "step_time": 4.728109178984596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1150887911207974, "epoch": 0.01062, "grad_norm": 0.02337092161178589, "kl": 0.7786352671682835, "learning_rate": 9.99951283244093e-06, "loss": -0.0232, "step": 1062, "step_time": 2.0443127419930534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 98.75, "completions/mean_terminated_length": 98.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3043160382658243, "epoch": 0.01063, "frac_reward_zero_std": 0.75, "grad_norm": 0.013159863650798798, "kl": 0.48890917003154755, "learning_rate": 9.999511882354389e-06, "loss": -0.0156, "num_tokens": 9789035.0, "reward": 0.6111057996749878, "reward_std": 0.008838837966322899, "rewards/rollout_reward_func/mean": 0.6111057996749878, "rewards/rollout_reward_func/std": 0.2396232783794403, "sampling/importance_sampling_ratio/max": 1.0024012327194214, "sampling/importance_sampling_ratio/mean": 0.9420678615570068, "sampling/importance_sampling_ratio/min": 0.009867476299405098, "sampling/sampling_logp_difference/max": 2.3117244243621826, "sampling/sampling_logp_difference/mean": 0.04921853169798851, "step": 1063, "step_time": 3.799416461995861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3081684815697372, "epoch": 0.01064, "grad_norm": 0.012183251790702343, "kl": 0.494498897343874, "learning_rate": 9.999510931342367e-06, "loss": -0.0155, "step": 1064, "step_time": 2.0044517599963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 219.46875, "completions/mean_terminated_length": 219.46875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.22642750712111592, "epoch": 0.01065, "frac_reward_zero_std": 0.75, "grad_norm": 0.005792289972305298, "kl": 0.41273434832692146, "learning_rate": 9.999509979404867e-06, "loss": -0.0366, "num_tokens": 9807594.0, "reward": 1.1237452030181885, "reward_std": 0.02791712060570717, "rewards/rollout_reward_func/mean": 1.1237452030181885, "rewards/rollout_reward_func/std": 0.2266058325767517, "sampling/importance_sampling_ratio/max": 1.0024710893630981, "sampling/importance_sampling_ratio/mean": 0.9662823677062988, "sampling/importance_sampling_ratio/min": 0.005530448164790869, "sampling/sampling_logp_difference/max": 1.9087574481964111, "sampling/sampling_logp_difference/mean": 0.024454444646835327, "step": 1065, "step_time": 4.591176968002401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23049897328019142, "epoch": 0.01066, "grad_norm": 0.0056883711367845535, "kl": 0.4125015214085579, "learning_rate": 9.999509026541889e-06, "loss": -0.0366, "step": 1066, "step_time": 2.48700683300558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6830194042995572, "epoch": 0.01067, "frac_reward_zero_std": 0.75, "grad_norm": 0.009755787439644337, "kl": 0.7185267060995102, "learning_rate": 9.999508072753433e-06, "loss": -0.0258, "num_tokens": 9822886.0, "reward": 0.6352115273475647, "reward_std": 0.009952865540981293, "rewards/rollout_reward_func/mean": 0.6352115273475647, "rewards/rollout_reward_func/std": 0.15270791947841644, "sampling/importance_sampling_ratio/max": 1.004030704498291, "sampling/importance_sampling_ratio/mean": 0.9047238826751709, "sampling/importance_sampling_ratio/min": 0.0154954819008708, "sampling/sampling_logp_difference/max": 1.8286023139953613, "sampling/sampling_logp_difference/mean": 0.09793207049369812, "step": 1067, "step_time": 4.601608929006034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6859376272186637, "epoch": 0.01068, "grad_norm": 0.008764917962253094, "kl": 0.710744246840477, "learning_rate": 9.999507118039498e-06, "loss": -0.0257, "step": 1068, "step_time": 2.0374817429910763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 73.5, "completions/mean_terminated_length": 73.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0881383316591382, "epoch": 0.01069, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006451690569519997, "kl": 0.6978110373020172, "learning_rate": 9.999506162400088e-06, "loss": 0.0015, "num_tokens": 9836718.0, "reward": 0.75, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.75, "rewards/rollout_reward_func/std": 0.05649289861321449, "sampling/importance_sampling_ratio/max": 0.9999688267707825, "sampling/importance_sampling_ratio/mean": 0.9936131238937378, "sampling/importance_sampling_ratio/min": 0.9841511845588684, "sampling/sampling_logp_difference/max": 0.01238194853067398, "sampling/sampling_logp_difference/mean": 0.00198800815269351, "step": 1069, "step_time": 3.807709350992809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0879064742475748, "epoch": 0.0107, "grad_norm": 0.0006418412667699158, "kl": 0.6977997273206711, "learning_rate": 9.999505205835198e-06, "loss": 0.0015, "step": 1070, "step_time": 2.0106761130009545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 239.46875, "completions/mean_terminated_length": 239.46875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4300023955293, "epoch": 0.01071, "frac_reward_zero_std": 0.5, "grad_norm": 0.006939026992768049, "kl": 0.44849396497011185, "learning_rate": 9.999504248344831e-06, "loss": -0.0748, "num_tokens": 9855933.0, "reward": 0.8349807858467102, "reward_std": 0.037476662546396255, "rewards/rollout_reward_func/mean": 0.8349807858467102, "rewards/rollout_reward_func/std": 0.4186232089996338, "sampling/importance_sampling_ratio/max": 1.0003418922424316, "sampling/importance_sampling_ratio/mean": 0.9338171482086182, "sampling/importance_sampling_ratio/min": 1.532239548396319e-05, "sampling/sampling_logp_difference/max": 2.481064796447754, "sampling/sampling_logp_difference/mean": 0.06736297905445099, "step": 1071, "step_time": 5.240120618014771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4261147379875183, "epoch": 0.01072, "grad_norm": 0.006608685478568077, "kl": 0.44789107516407967, "learning_rate": 9.99950328992899e-06, "loss": -0.0749, "step": 1072, "step_time": 2.042191651999019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07367887254804373, "epoch": 0.01073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008234719862230122, "kl": 0.45161133259534836, "learning_rate": 9.99950233058767e-06, "loss": 0.0014, "num_tokens": 9872117.0, "reward": 0.5900000333786011, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5900000333786011, "rewards/rollout_reward_func/std": 0.0914400964975357, "sampling/importance_sampling_ratio/max": 1.0035662651062012, "sampling/importance_sampling_ratio/mean": 0.9965395927429199, "sampling/importance_sampling_ratio/min": 0.9909082055091858, "sampling/sampling_logp_difference/max": 0.0046162791550159454, "sampling/sampling_logp_difference/mean": 0.0011574261588975787, "step": 1073, "step_time": 4.67730962800124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07420379389077425, "epoch": 0.01074, "grad_norm": 0.0008254032582044601, "kl": 0.45149926841259, "learning_rate": 9.999501370320872e-06, "loss": 0.0014, "step": 1074, "step_time": 2.0122518249991117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5116148022934794, "epoch": 0.01075, "frac_reward_zero_std": 0.75, "grad_norm": 0.004727616906166077, "kl": 0.5526121817529202, "learning_rate": 9.999500409128599e-06, "loss": -0.0182, "num_tokens": 9888017.0, "reward": 0.7024999856948853, "reward_std": 0.03528054431080818, "rewards/rollout_reward_func/mean": 0.7024999856948853, "rewards/rollout_reward_func/std": 0.31577590107917786, "sampling/importance_sampling_ratio/max": 1.0013805627822876, "sampling/importance_sampling_ratio/mean": 0.9051001071929932, "sampling/importance_sampling_ratio/min": 0.00025906949304044247, "sampling/sampling_logp_difference/max": 2.2053704261779785, "sampling/sampling_logp_difference/mean": 0.08781927824020386, "step": 1075, "step_time": 4.101972354997997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5121159739792347, "epoch": 0.01076, "grad_norm": 0.004973116796463728, "kl": 0.5462468452751637, "learning_rate": 9.99949944701085e-06, "loss": -0.0182, "step": 1076, "step_time": 2.040830484002072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8833058658055961, "epoch": 0.01077, "frac_reward_zero_std": 0.5, "grad_norm": 0.017876051366329193, "kl": 0.5190519765019417, "learning_rate": 9.999498483967625e-06, "loss": 0.0094, "num_tokens": 9903709.0, "reward": 0.6600143909454346, "reward_std": 0.026561658829450607, "rewards/rollout_reward_func/mean": 0.6600143909454346, "rewards/rollout_reward_func/std": 0.20711706578731537, "sampling/importance_sampling_ratio/max": 1.0036327838897705, "sampling/importance_sampling_ratio/mean": 0.9026211500167847, "sampling/importance_sampling_ratio/min": 8.213842939142069e-09, "sampling/sampling_logp_difference/max": 4.112858772277832, "sampling/sampling_logp_difference/mean": 0.19255219399929047, "step": 1077, "step_time": 4.830082160005986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8799557723104954, "epoch": 0.01078, "grad_norm": 0.01721428707242012, "kl": 0.5167399123311043, "learning_rate": 9.999497519998923e-06, "loss": 0.0093, "step": 1078, "step_time": 2.49182605399983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24523924849927425, "epoch": 0.01079, "frac_reward_zero_std": 0.75, "grad_norm": 0.021085165441036224, "kl": 0.584008451551199, "learning_rate": 9.99949655510475e-06, "loss": 0.0113, "num_tokens": 9921313.0, "reward": 0.8336970806121826, "reward_std": 0.04494207724928856, "rewards/rollout_reward_func/mean": 0.8336970806121826, "rewards/rollout_reward_func/std": 0.3331433832645416, "sampling/importance_sampling_ratio/max": 0.9997045397758484, "sampling/importance_sampling_ratio/mean": 0.9642174243927002, "sampling/importance_sampling_ratio/min": 0.0024328194558620453, "sampling/sampling_logp_difference/max": 2.0628557205200195, "sampling/sampling_logp_difference/mean": 0.03755737841129303, "step": 1079, "step_time": 4.384937411996361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24370945803821087, "epoch": 0.0108, "grad_norm": 0.023188484832644463, "kl": 0.5835608243942261, "learning_rate": 9.9994955892851e-06, "loss": 0.0113, "step": 1080, "step_time": 2.0357248399959644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 154.53125, "completions/mean_terminated_length": 154.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8614889346063137, "epoch": 0.01081, "frac_reward_zero_std": 0.25, "grad_norm": 0.011367907747626305, "kl": 0.5805503129959106, "learning_rate": 9.999494622539973e-06, "loss": -0.0262, "num_tokens": 9937698.0, "reward": 0.7642307877540588, "reward_std": 0.03399552404880524, "rewards/rollout_reward_func/mean": 0.7642307877540588, "rewards/rollout_reward_func/std": 0.22714003920555115, "sampling/importance_sampling_ratio/max": 0.9988961219787598, "sampling/importance_sampling_ratio/mean": 0.9013383388519287, "sampling/importance_sampling_ratio/min": 3.8637404031760525e-06, "sampling/sampling_logp_difference/max": 2.269158124923706, "sampling/sampling_logp_difference/mean": 0.17203614115715027, "step": 1081, "step_time": 4.314949263003655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8567674327641726, "epoch": 0.01082, "grad_norm": 0.010677025653421879, "kl": 0.5688170194625854, "learning_rate": 9.999493654869373e-06, "loss": -0.0262, "step": 1082, "step_time": 2.0542694450050476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 93.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09502724464982748, "epoch": 0.01083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008600326254963875, "kl": 0.5435485355556011, "learning_rate": 9.999492686273298e-06, "loss": 0.0013, "num_tokens": 9952450.0, "reward": 0.6553846597671509, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6553846597671509, "rewards/rollout_reward_func/std": 0.07485660165548325, "sampling/importance_sampling_ratio/max": 1.0036243200302124, "sampling/importance_sampling_ratio/mean": 0.9960314035415649, "sampling/importance_sampling_ratio/min": 0.9812634587287903, "sampling/sampling_logp_difference/max": 0.014937978237867355, "sampling/sampling_logp_difference/mean": 0.0017709534149616957, "step": 1083, "step_time": 4.2687719040040975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0970124825835228, "epoch": 0.01084, "grad_norm": 0.0008786047692410648, "kl": 0.5431601703166962, "learning_rate": 9.99949171675175e-06, "loss": 0.0013, "step": 1084, "step_time": 2.4365817030047765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4305186029523611, "epoch": 0.01085, "frac_reward_zero_std": 0.75, "grad_norm": 0.007548968307673931, "kl": 0.5033247135579586, "learning_rate": 9.999490746304727e-06, "loss": -0.0172, "num_tokens": 9970040.0, "reward": 0.6516057252883911, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.6516057252883911, "rewards/rollout_reward_func/std": 0.10533320903778076, "sampling/importance_sampling_ratio/max": 1.003656029701233, "sampling/importance_sampling_ratio/mean": 0.9643051028251648, "sampling/importance_sampling_ratio/min": 1.424402791354827e-14, "sampling/sampling_logp_difference/max": 10.88527774810791, "sampling/sampling_logp_difference/mean": 0.15502415597438812, "step": 1085, "step_time": 4.4163105149855255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43026882968842983, "epoch": 0.01086, "grad_norm": 0.007089380640536547, "kl": 0.5026081949472427, "learning_rate": 9.999489774932232e-06, "loss": -0.0172, "step": 1086, "step_time": 2.0374554860172793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5561155900359154, "epoch": 0.01087, "frac_reward_zero_std": 0.5, "grad_norm": 0.01418228354305029, "kl": 0.5617305338382721, "learning_rate": 9.999488802634262e-06, "loss": -0.0542, "num_tokens": 9986912.0, "reward": 0.8175961375236511, "reward_std": 0.06202410161495209, "rewards/rollout_reward_func/mean": 0.8175961375236511, "rewards/rollout_reward_func/std": 0.3341582715511322, "sampling/importance_sampling_ratio/max": 1.0018703937530518, "sampling/importance_sampling_ratio/mean": 0.9048382639884949, "sampling/importance_sampling_ratio/min": 0.013302919454872608, "sampling/sampling_logp_difference/max": 2.310818672180176, "sampling/sampling_logp_difference/mean": 0.06985218077898026, "step": 1087, "step_time": 4.192008163015998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5631599510088563, "epoch": 0.01088, "grad_norm": 0.01366395689547062, "kl": 0.5594988986849785, "learning_rate": 9.999487829410819e-06, "loss": -0.0542, "step": 1088, "step_time": 2.0154477990145097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 218.46875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.408061645925045, "epoch": 0.01089, "frac_reward_zero_std": 0.75, "grad_norm": 0.010424559935927391, "kl": 0.4425330385565758, "learning_rate": 9.999486855261904e-06, "loss": -0.032, "num_tokens": 10005263.0, "reward": 0.9113735556602478, "reward_std": 0.04001395404338837, "rewards/rollout_reward_func/mean": 0.9113735556602478, "rewards/rollout_reward_func/std": 0.4906027913093567, "sampling/importance_sampling_ratio/max": 1.0004457235336304, "sampling/importance_sampling_ratio/mean": 0.9314489364624023, "sampling/importance_sampling_ratio/min": 0.0007101981318555772, "sampling/sampling_logp_difference/max": 1.9280354976654053, "sampling/sampling_logp_difference/mean": 0.05513381585478783, "step": 1089, "step_time": 5.1925055739993695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40566192334517837, "epoch": 0.0109, "grad_norm": 0.010076633654534817, "kl": 0.44044120609760284, "learning_rate": 9.999485880187515e-06, "loss": -0.032, "step": 1090, "step_time": 2.5316730360063957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.308052578009665, "epoch": 0.01091, "frac_reward_zero_std": 0.5, "grad_norm": 0.026025019586086273, "kl": 0.4394812509417534, "learning_rate": 9.999484904187655e-06, "loss": -0.0632, "num_tokens": 10023539.0, "reward": 0.8137620091438293, "reward_std": 0.03189460188150406, "rewards/rollout_reward_func/mean": 0.8137620091438293, "rewards/rollout_reward_func/std": 0.4126550257205963, "sampling/importance_sampling_ratio/max": 1.0003952980041504, "sampling/importance_sampling_ratio/mean": 0.9348280429840088, "sampling/importance_sampling_ratio/min": 7.062675285851583e-05, "sampling/sampling_logp_difference/max": 2.354876756668091, "sampling/sampling_logp_difference/mean": 0.05498090386390686, "step": 1091, "step_time": 4.703460140000971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30697415256872773, "epoch": 0.01092, "grad_norm": 0.027886029332876205, "kl": 0.43746092170476913, "learning_rate": 9.99948392726232e-06, "loss": -0.0632, "step": 1092, "step_time": 2.050767247012118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.21875, "completions/mean_terminated_length": 191.21875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.28287492925301194, "epoch": 0.01093, "frac_reward_zero_std": 0.75, "grad_norm": 0.004358069505542517, "kl": 0.47684314101934433, "learning_rate": 9.999482949411516e-06, "loss": -0.0369, "num_tokens": 10041402.0, "reward": 0.8340192437171936, "reward_std": 0.06130071356892586, "rewards/rollout_reward_func/mean": 0.8340192437171936, "rewards/rollout_reward_func/std": 0.4198867976665497, "sampling/importance_sampling_ratio/max": 1.0020359754562378, "sampling/importance_sampling_ratio/mean": 0.9651545882225037, "sampling/importance_sampling_ratio/min": 0.0002788925776258111, "sampling/sampling_logp_difference/max": 2.1937665939331055, "sampling/sampling_logp_difference/mean": 0.038117047399282455, "step": 1093, "step_time": 4.690313553997839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.281203918159008, "epoch": 0.01094, "grad_norm": 0.004133912734687328, "kl": 0.47675446420907974, "learning_rate": 9.99948197063524e-06, "loss": -0.0369, "step": 1094, "step_time": 2.47891974299273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20661736372858286, "epoch": 0.01095, "frac_reward_zero_std": 0.75, "grad_norm": 0.031039981171488762, "kl": 0.5831075049936771, "learning_rate": 9.999480990933493e-06, "loss": -0.0265, "num_tokens": 10058966.0, "reward": 0.7025192379951477, "reward_std": 0.06119193509221077, "rewards/rollout_reward_func/mean": 0.7025192379951477, "rewards/rollout_reward_func/std": 0.3145633637905121, "sampling/importance_sampling_ratio/max": 1.0032235383987427, "sampling/importance_sampling_ratio/mean": 0.9405046105384827, "sampling/importance_sampling_ratio/min": 0.022251764312386513, "sampling/sampling_logp_difference/max": 1.932469129562378, "sampling/sampling_logp_difference/mean": 0.027503520250320435, "step": 1095, "step_time": 5.059957582016068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20561645831912756, "epoch": 0.01096, "grad_norm": 0.023948153480887413, "kl": 0.5904840119183064, "learning_rate": 9.999480010306273e-06, "loss": -0.0266, "step": 1096, "step_time": 2.032394360998296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 120.09375, "completions/mean_terminated_length": 120.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3147790776565671, "epoch": 0.01097, "frac_reward_zero_std": 0.75, "grad_norm": 0.013908511027693748, "kl": 0.5582608282566071, "learning_rate": 9.999479028753583e-06, "loss": -0.0272, "num_tokens": 10074089.0, "reward": 0.5213461518287659, "reward_std": 0.09409960359334946, "rewards/rollout_reward_func/mean": 0.5213461518287659, "rewards/rollout_reward_func/std": 0.22384782135486603, "sampling/importance_sampling_ratio/max": 0.9988203644752502, "sampling/importance_sampling_ratio/mean": 0.9641070365905762, "sampling/importance_sampling_ratio/min": 0.0002873266057576984, "sampling/sampling_logp_difference/max": 2.514293909072876, "sampling/sampling_logp_difference/mean": 0.052156731486320496, "step": 1097, "step_time": 4.080632327997591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3150607030838728, "epoch": 0.01098, "grad_norm": 0.01431334763765335, "kl": 0.5698213763535023, "learning_rate": 9.999478046275422e-06, "loss": -0.0271, "step": 1098, "step_time": 2.0157393109984696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6966328234411776, "epoch": 0.01099, "frac_reward_zero_std": 0.75, "grad_norm": 0.05297272279858589, "kl": 0.8912202343344688, "learning_rate": 9.99947706287179e-06, "loss": -0.035, "num_tokens": 10091443.0, "reward": 1.0735745429992676, "reward_std": 0.07066474854946136, "rewards/rollout_reward_func/mean": 1.0735745429992676, "rewards/rollout_reward_func/std": 0.28076985478401184, "sampling/importance_sampling_ratio/max": 1.0015407800674438, "sampling/importance_sampling_ratio/mean": 0.9058207273483276, "sampling/importance_sampling_ratio/min": 4.146480847566636e-08, "sampling/sampling_logp_difference/max": 2.157886505126953, "sampling/sampling_logp_difference/mean": 0.13646994531154633, "step": 1099, "step_time": 4.46167442798469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6947634578682482, "epoch": 0.011, "grad_norm": 0.04935925081372261, "kl": 0.8688158206641674, "learning_rate": 9.999476078542688e-06, "loss": -0.0351, "step": 1100, "step_time": 2.493128969996178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 95.25, "completions/mean_terminated_length": 95.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08812347706407309, "epoch": 0.01101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006917946157045662, "kl": 0.6404397264122963, "learning_rate": 9.999475093288116e-06, "loss": 0.0015, "num_tokens": 10106619.0, "reward": 0.5911538600921631, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5911538600921631, "rewards/rollout_reward_func/std": 0.1755712628364563, "sampling/importance_sampling_ratio/max": 1.0013397932052612, "sampling/importance_sampling_ratio/mean": 0.9925418496131897, "sampling/importance_sampling_ratio/min": 0.9793375730514526, "sampling/sampling_logp_difference/max": 0.017384566366672516, "sampling/sampling_logp_difference/mean": 0.002318589948117733, "step": 1101, "step_time": 4.523499880007876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08856210578233004, "epoch": 0.01102, "grad_norm": 0.0006874366081319749, "kl": 0.6403573155403137, "learning_rate": 9.999474107108074e-06, "loss": 0.0015, "step": 1102, "step_time": 2.0244597049895674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 220.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 128.1290283203125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7450268953107297, "epoch": 0.01103, "frac_reward_zero_std": 0.75, "grad_norm": 0.026617268100380898, "kl": 0.649603333324194, "learning_rate": 9.999473120002564e-06, "loss": -0.0213, "num_tokens": 10122643.0, "reward": 0.8518269062042236, "reward_std": 0.05673444643616676, "rewards/rollout_reward_func/mean": 0.8518269062042236, "rewards/rollout_reward_func/std": 0.22984609007835388, "sampling/importance_sampling_ratio/max": 1.001212239265442, "sampling/importance_sampling_ratio/mean": 0.9350499510765076, "sampling/importance_sampling_ratio/min": 3.1486878171014764e-19, "sampling/sampling_logp_difference/max": 4.489606857299805, "sampling/sampling_logp_difference/mean": 0.29984062910079956, "step": 1103, "step_time": 4.098569583009521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.742905409540981, "epoch": 0.01104, "grad_norm": 0.023685192689299583, "kl": 0.6315727829933167, "learning_rate": 9.999472131971582e-06, "loss": -0.0214, "step": 1104, "step_time": 2.023491115003708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1083183097653091, "epoch": 0.01105, "frac_reward_zero_std": 0.75, "grad_norm": 0.02342468872666359, "kl": 0.6671711578965187, "learning_rate": 9.999471143015132e-06, "loss": -0.0225, "num_tokens": 10138347.0, "reward": 0.8306730389595032, "reward_std": 0.01767767407000065, "rewards/rollout_reward_func/mean": 0.8306730389595032, "rewards/rollout_reward_func/std": 0.18352362513542175, "sampling/importance_sampling_ratio/max": 1.003523349761963, "sampling/importance_sampling_ratio/mean": 0.9729176163673401, "sampling/importance_sampling_ratio/min": 0.15142953395843506, "sampling/sampling_logp_difference/max": 1.8209106922149658, "sampling/sampling_logp_difference/mean": 0.012754573486745358, "step": 1105, "step_time": 3.9615625380029087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10825883783400059, "epoch": 0.01106, "grad_norm": 0.023431479930877686, "kl": 0.6673465743660927, "learning_rate": 9.999470153133216e-06, "loss": -0.0225, "step": 1106, "step_time": 2.459020201997191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 138.71875, "completions/mean_terminated_length": 138.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.37462970707565546, "epoch": 0.01107, "frac_reward_zero_std": 0.5, "grad_norm": 0.032575078308582306, "kl": 0.5106714516878128, "learning_rate": 9.999469162325828e-06, "loss": -0.0445, "num_tokens": 10154474.0, "reward": 0.7992788553237915, "reward_std": 0.022709008306264877, "rewards/rollout_reward_func/mean": 0.7992788553237915, "rewards/rollout_reward_func/std": 0.17439404129981995, "sampling/importance_sampling_ratio/max": 0.9998088479042053, "sampling/importance_sampling_ratio/mean": 0.9372629523277283, "sampling/importance_sampling_ratio/min": 0.03286363556981087, "sampling/sampling_logp_difference/max": 1.770483136177063, "sampling/sampling_logp_difference/mean": 0.04293716326355934, "step": 1107, "step_time": 4.5578770200081635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3765709805302322, "epoch": 0.01108, "grad_norm": 0.03315535560250282, "kl": 0.5133900716900826, "learning_rate": 9.999468170592971e-06, "loss": -0.0445, "step": 1108, "step_time": 2.0309631019918015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1638968661427498, "epoch": 0.01109, "frac_reward_zero_std": 0.75, "grad_norm": 0.011991883628070354, "kl": 0.485750250518322, "learning_rate": 9.999467177934649e-06, "loss": -0.0272, "num_tokens": 10171962.0, "reward": 0.8810369968414307, "reward_std": 0.011861719191074371, "rewards/rollout_reward_func/mean": 0.8810369968414307, "rewards/rollout_reward_func/std": 0.3686182498931885, "sampling/importance_sampling_ratio/max": 1.0041029453277588, "sampling/importance_sampling_ratio/mean": 0.9641085863113403, "sampling/importance_sampling_ratio/min": 0.00845364574342966, "sampling/sampling_logp_difference/max": 2.550206422805786, "sampling/sampling_logp_difference/mean": 0.02584240958094597, "step": 1109, "step_time": 4.4988905919890385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.16423785965889692, "epoch": 0.0111, "grad_norm": 0.014567635022103786, "kl": 0.4858023673295975, "learning_rate": 9.999466184350858e-06, "loss": -0.0272, "step": 1110, "step_time": 2.0494488890035427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05985459918156266, "epoch": 0.01111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007861402118578553, "kl": 0.4279765821993351, "learning_rate": 9.999465189841599e-06, "loss": 0.0017, "num_tokens": 10190098.0, "reward": 1.0590769052505493, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0590769052505493, "rewards/rollout_reward_func/std": 0.40180012583732605, "sampling/importance_sampling_ratio/max": 1.0016957521438599, "sampling/importance_sampling_ratio/mean": 0.9962426424026489, "sampling/importance_sampling_ratio/min": 0.9922486543655396, "sampling/sampling_logp_difference/max": 0.005504371598362923, "sampling/sampling_logp_difference/mean": 0.000980344950221479, "step": 1111, "step_time": 5.012606127020263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05922494689002633, "epoch": 0.01112, "grad_norm": 0.0007675125962123275, "kl": 0.42809607461094856, "learning_rate": 9.999464194406873e-06, "loss": 0.0017, "step": 1112, "step_time": 2.0539068699945346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 102.40625, "completions/mean_terminated_length": 102.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6725088991224766, "epoch": 0.01113, "frac_reward_zero_std": 0.5, "grad_norm": 0.09856251627206802, "kl": 0.6838436722755432, "learning_rate": 9.99946319804668e-06, "loss": -0.0344, "num_tokens": 10205095.0, "reward": 0.7474038600921631, "reward_std": 0.05599215626716614, "rewards/rollout_reward_func/mean": 0.7474038600921631, "rewards/rollout_reward_func/std": 0.2569916844367981, "sampling/importance_sampling_ratio/max": 1.0008518695831299, "sampling/importance_sampling_ratio/mean": 0.9066437482833862, "sampling/importance_sampling_ratio/min": 7.473703730540016e-15, "sampling/sampling_logp_difference/max": 4.143052101135254, "sampling/sampling_logp_difference/mean": 0.20323625206947327, "step": 1113, "step_time": 4.685255787007918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6747171832248569, "epoch": 0.01114, "grad_norm": 0.08796820044517517, "kl": 0.675523579120636, "learning_rate": 9.999462200761019e-06, "loss": -0.0345, "step": 1114, "step_time": 2.026398533998872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3063505897298455, "epoch": 0.01115, "frac_reward_zero_std": 0.75, "grad_norm": 0.011381151154637337, "kl": 0.49465737119317055, "learning_rate": 9.999461202549894e-06, "loss": 0.03, "num_tokens": 10221331.0, "reward": 0.6056250333786011, "reward_std": 0.0017677652649581432, "rewards/rollout_reward_func/mean": 0.6056250333786011, "rewards/rollout_reward_func/std": 0.1418609917163849, "sampling/importance_sampling_ratio/max": 1.0039260387420654, "sampling/importance_sampling_ratio/mean": 0.9678133726119995, "sampling/importance_sampling_ratio/min": 0.0016519593773409724, "sampling/sampling_logp_difference/max": 1.767288327217102, "sampling/sampling_logp_difference/mean": 0.04100216552615166, "step": 1115, "step_time": 4.113663495991204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3063020291738212, "epoch": 0.01116, "grad_norm": 0.012140970677137375, "kl": 0.49967454746365547, "learning_rate": 9.9994602034133e-06, "loss": 0.03, "step": 1116, "step_time": 2.0490393360014423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 75.3125, "completions/mean_terminated_length": 75.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16551916301250458, "epoch": 0.01117, "frac_reward_zero_std": 0.75, "grad_norm": 0.007948348298668861, "kl": 0.8711561784148216, "learning_rate": 9.999459203351241e-06, "loss": -0.0257, "num_tokens": 10235501.0, "reward": 0.930437445640564, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.930437445640564, "rewards/rollout_reward_func/std": 0.20392735302448273, "sampling/importance_sampling_ratio/max": 1.0024818181991577, "sampling/importance_sampling_ratio/mean": 0.9655952453613281, "sampling/importance_sampling_ratio/min": 0.08155573904514313, "sampling/sampling_logp_difference/max": 2.4453041553497314, "sampling/sampling_logp_difference/mean": 0.01964656263589859, "step": 1117, "step_time": 4.536122124998656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.166962256655097, "epoch": 0.01118, "grad_norm": 0.0069070602767169476, "kl": 0.8745781630277634, "learning_rate": 9.999458202363715e-06, "loss": -0.0257, "step": 1118, "step_time": 2.4885011939986725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14481378300115466, "epoch": 0.01119, "frac_reward_zero_std": 0.75, "grad_norm": 0.0038873793091624975, "kl": 0.6458332575857639, "learning_rate": 9.999457200450725e-06, "loss": -0.0161, "num_tokens": 10251317.0, "reward": 0.6875481009483337, "reward_std": 0.026516500860452652, "rewards/rollout_reward_func/mean": 0.6875481009483337, "rewards/rollout_reward_func/std": 0.1971123367547989, "sampling/importance_sampling_ratio/max": 1.0022650957107544, "sampling/importance_sampling_ratio/mean": 0.9689525365829468, "sampling/importance_sampling_ratio/min": 0.08234010636806488, "sampling/sampling_logp_difference/max": 2.498976230621338, "sampling/sampling_logp_difference/mean": 0.02059614658355713, "step": 1119, "step_time": 4.01759297499666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.1440476947464049, "epoch": 0.0112, "grad_norm": 0.015077990479767323, "kl": 0.67598457634449, "learning_rate": 9.99945619761227e-06, "loss": -0.016, "step": 1120, "step_time": 2.010118607999175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 238.3125, "completions/mean_terminated_length": 238.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.7219014563597739, "epoch": 0.01121, "frac_reward_zero_std": 0.5, "grad_norm": 0.024591114372015, "kl": 0.6185241639614105, "learning_rate": 9.999455193848349e-06, "loss": -0.071, "num_tokens": 10269703.0, "reward": 1.1249278783798218, "reward_std": 0.03944006562232971, "rewards/rollout_reward_func/mean": 1.1249278783798218, "rewards/rollout_reward_func/std": 0.2883872985839844, "sampling/importance_sampling_ratio/max": 1.0012277364730835, "sampling/importance_sampling_ratio/mean": 0.8735896348953247, "sampling/importance_sampling_ratio/min": 0.0014474272029474378, "sampling/sampling_logp_difference/max": 2.378145217895508, "sampling/sampling_logp_difference/mean": 0.10443107038736343, "step": 1121, "step_time": 4.7637217180017615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7283031847327948, "epoch": 0.01122, "grad_norm": 0.021069465205073357, "kl": 0.6023825071752071, "learning_rate": 9.999454189158961e-06, "loss": -0.071, "step": 1122, "step_time": 2.057675187003042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24925018660724163, "epoch": 0.01123, "frac_reward_zero_std": 0.75, "grad_norm": 0.018459709361195564, "kl": 0.41003742814064026, "learning_rate": 9.999453183544113e-06, "loss": 0.0201, "num_tokens": 10286183.0, "reward": 0.6521346569061279, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.6521346569061279, "rewards/rollout_reward_func/std": 0.40693461894989014, "sampling/importance_sampling_ratio/max": 1.005418062210083, "sampling/importance_sampling_ratio/mean": 0.9682316780090332, "sampling/importance_sampling_ratio/min": 0.02658664993941784, "sampling/sampling_logp_difference/max": 1.4848076105117798, "sampling/sampling_logp_difference/mean": 0.029242875054478645, "step": 1123, "step_time": 5.072464136988856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2488794201053679, "epoch": 0.01124, "grad_norm": 0.01850076951086521, "kl": 0.40969614312052727, "learning_rate": 9.999452177003797e-06, "loss": 0.0201, "step": 1124, "step_time": 2.5157915270028752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.04839617060497403, "epoch": 0.01125, "frac_reward_zero_std": 1.0, "grad_norm": 0.000467722857138142, "kl": 0.41960931196808815, "learning_rate": 9.999451169538017e-06, "loss": 0.0013, "num_tokens": 10302343.0, "reward": 0.8215384483337402, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8215384483337402, "rewards/rollout_reward_func/std": 0.2727874219417572, "sampling/importance_sampling_ratio/max": 1.0044125318527222, "sampling/importance_sampling_ratio/mean": 0.9979290962219238, "sampling/importance_sampling_ratio/min": 0.9931694269180298, "sampling/sampling_logp_difference/max": 0.004726629704236984, "sampling/sampling_logp_difference/mean": 0.0010242481948807836, "step": 1125, "step_time": 4.042933314012771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04919743491336703, "epoch": 0.01126, "grad_norm": 0.00048637561849318445, "kl": 0.4194793291389942, "learning_rate": 9.999450161146776e-06, "loss": 0.0013, "step": 1126, "step_time": 2.0156485229817918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05861953739076853, "epoch": 0.01127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005466112052090466, "kl": 0.5094859376549721, "learning_rate": 9.999449151830068e-06, "loss": 0.0014, "num_tokens": 10318055.0, "reward": 0.7342307567596436, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7342307567596436, "rewards/rollout_reward_func/std": 0.22347478568553925, "sampling/importance_sampling_ratio/max": 1.001064658164978, "sampling/importance_sampling_ratio/mean": 0.9977736473083496, "sampling/importance_sampling_ratio/min": 0.9933575391769409, "sampling/sampling_logp_difference/max": 0.0037179309874773026, "sampling/sampling_logp_difference/mean": 0.0009291948517784476, "step": 1127, "step_time": 4.050455903015973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05944562470540404, "epoch": 0.01128, "grad_norm": 0.0005570415523834527, "kl": 0.5093386396765709, "learning_rate": 9.999448141587897e-06, "loss": 0.0014, "step": 1128, "step_time": 2.018976308987476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 214.15625, "completions/mean_terminated_length": 214.15625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.49460390163585544, "epoch": 0.01129, "frac_reward_zero_std": 0.5, "grad_norm": 0.011396752670407295, "kl": 0.4672303721308708, "learning_rate": 9.999447130420266e-06, "loss": -0.0655, "num_tokens": 10336628.0, "reward": 1.1077163219451904, "reward_std": 0.06150468811392784, "rewards/rollout_reward_func/mean": 1.1077163219451904, "rewards/rollout_reward_func/std": 0.16690939664840698, "sampling/importance_sampling_ratio/max": 1.002180576324463, "sampling/importance_sampling_ratio/mean": 0.9349033236503601, "sampling/importance_sampling_ratio/min": 0.00028095991001464427, "sampling/sampling_logp_difference/max": 2.1687755584716797, "sampling/sampling_logp_difference/mean": 0.06874983012676239, "step": 1129, "step_time": 5.560324755992042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4993459153920412, "epoch": 0.0113, "grad_norm": 0.011232112534344196, "kl": 0.4703385904431343, "learning_rate": 9.99944611832717e-06, "loss": -0.0655, "step": 1130, "step_time": 2.045623297999555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06280718930065632, "epoch": 0.01131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006347611779347062, "kl": 0.45662207901477814, "learning_rate": 9.99944510530861e-06, "loss": 0.0014, "num_tokens": 10352988.0, "reward": 0.8092308044433594, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8092308044433594, "rewards/rollout_reward_func/std": 0.33492764830589294, "sampling/importance_sampling_ratio/max": 1.0000309944152832, "sampling/importance_sampling_ratio/mean": 0.9962546825408936, "sampling/importance_sampling_ratio/min": 0.9898896813392639, "sampling/sampling_logp_difference/max": 0.005604512989521027, "sampling/sampling_logp_difference/mean": 0.0009745244169607759, "step": 1131, "step_time": 4.44235471898719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06319010443985462, "epoch": 0.01132, "grad_norm": 0.000651901587843895, "kl": 0.4565179608762264, "learning_rate": 9.99944409136459e-06, "loss": 0.0014, "step": 1132, "step_time": 2.006640740990406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 139.84375, "completions/mean_terminated_length": 139.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4749111160635948, "epoch": 0.01133, "frac_reward_zero_std": 0.5, "grad_norm": 0.02250026725232601, "kl": 0.6922876909375191, "learning_rate": 9.999443076495105e-06, "loss": -0.0072, "num_tokens": 10369047.0, "reward": 0.8142980933189392, "reward_std": 0.036307208240032196, "rewards/rollout_reward_func/mean": 0.8142980933189392, "rewards/rollout_reward_func/std": 0.32867783308029175, "sampling/importance_sampling_ratio/max": 1.0009039640426636, "sampling/importance_sampling_ratio/mean": 0.932398796081543, "sampling/importance_sampling_ratio/min": 0.011649709194898605, "sampling/sampling_logp_difference/max": 1.7063778638839722, "sampling/sampling_logp_difference/mean": 0.06347833573818207, "step": 1133, "step_time": 4.6674980349926045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4726931508630514, "epoch": 0.01134, "grad_norm": 0.021906778216362, "kl": 0.6775235161185265, "learning_rate": 9.999442060700163e-06, "loss": -0.0072, "step": 1134, "step_time": 2.460100259988394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20521403243765235, "epoch": 0.01135, "frac_reward_zero_std": 0.75, "grad_norm": 0.025741228833794594, "kl": 0.449001245200634, "learning_rate": 9.999441043979755e-06, "loss": 0.0298, "num_tokens": 10386864.0, "reward": 0.9266250133514404, "reward_std": 0.015365973114967346, "rewards/rollout_reward_func/mean": 0.9266250133514404, "rewards/rollout_reward_func/std": 0.4296973645687103, "sampling/importance_sampling_ratio/max": 1.000549077987671, "sampling/importance_sampling_ratio/mean": 0.9667785167694092, "sampling/importance_sampling_ratio/min": 0.03897089138627052, "sampling/sampling_logp_difference/max": 1.166858434677124, "sampling/sampling_logp_difference/mean": 0.01821555383503437, "step": 1135, "step_time": 5.150027775998751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2041069846600294, "epoch": 0.01136, "grad_norm": 0.025736704468727112, "kl": 0.4487818367779255, "learning_rate": 9.999440026333887e-06, "loss": 0.0298, "step": 1136, "step_time": 2.03983772500942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 189.71875, "completions/mean_terminated_length": 189.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 1.065476335119456, "epoch": 0.01137, "frac_reward_zero_std": 0.5, "grad_norm": 0.012667068280279636, "kl": 0.5160715021193027, "learning_rate": 9.999439007762558e-06, "loss": -0.0657, "num_tokens": 10404239.0, "reward": 0.8137259483337402, "reward_std": 0.05913589894771576, "rewards/rollout_reward_func/mean": 0.8137259483337402, "rewards/rollout_reward_func/std": 0.2962214946746826, "sampling/importance_sampling_ratio/max": 1.0018877983093262, "sampling/importance_sampling_ratio/mean": 0.8724591732025146, "sampling/importance_sampling_ratio/min": 5.311641041835761e-19, "sampling/sampling_logp_difference/max": 5.168476104736328, "sampling/sampling_logp_difference/mean": 0.2920752763748169, "step": 1137, "step_time": 4.809005183982663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0650952709838748, "epoch": 0.01138, "grad_norm": 0.01351859886199236, "kl": 0.5077312141656876, "learning_rate": 9.999437988265768e-06, "loss": -0.0657, "step": 1138, "step_time": 2.035701753986359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 98.0, "completions/mean_terminated_length": 98.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08283156249672174, "epoch": 0.01139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006562793860211968, "kl": 0.6095910742878914, "learning_rate": 9.999436967843518e-06, "loss": 0.0014, "num_tokens": 10419263.0, "reward": 0.5611538290977478, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5611538290977478, "rewards/rollout_reward_func/std": 0.2179257869720459, "sampling/importance_sampling_ratio/max": 1.0020251274108887, "sampling/importance_sampling_ratio/mean": 0.9946388602256775, "sampling/importance_sampling_ratio/min": 0.9776967763900757, "sampling/sampling_logp_difference/max": 0.018390346318483353, "sampling/sampling_logp_difference/mean": 0.0021804175339639187, "step": 1139, "step_time": 4.202532408009574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08381077647209167, "epoch": 0.0114, "grad_norm": 0.0006515005952678621, "kl": 0.6094335541129112, "learning_rate": 9.999435946495807e-06, "loss": 0.0014, "step": 1140, "step_time": 2.46410054100852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05094511900097132, "epoch": 0.01141, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006634974852204323, "kl": 0.41653721779584885, "learning_rate": 9.999434924222635e-06, "loss": 0.0016, "num_tokens": 10436911.0, "reward": 0.8012692332267761, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8012692332267761, "rewards/rollout_reward_func/std": 0.3954152762889862, "sampling/importance_sampling_ratio/max": 1.0014127492904663, "sampling/importance_sampling_ratio/mean": 0.9968610405921936, "sampling/importance_sampling_ratio/min": 0.9909063577651978, "sampling/sampling_logp_difference/max": 0.006247837096452713, "sampling/sampling_logp_difference/mean": 0.0008685713401064277, "step": 1141, "step_time": 5.0718335150086205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05094508873298764, "epoch": 0.01142, "grad_norm": 0.0006535729044117033, "kl": 0.41653355211019516, "learning_rate": 9.999433901024004e-06, "loss": 0.0016, "step": 1142, "step_time": 2.0400470170134213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.21027374593541026, "epoch": 0.01143, "frac_reward_zero_std": 0.75, "grad_norm": 0.014308884739875793, "kl": 0.4948447719216347, "learning_rate": 9.999432876899914e-06, "loss": -0.0269, "num_tokens": 10453639.0, "reward": 0.9016106128692627, "reward_std": 0.03134387731552124, "rewards/rollout_reward_func/mean": 0.9016106128692627, "rewards/rollout_reward_func/std": 0.24235761165618896, "sampling/importance_sampling_ratio/max": 1.0014501810073853, "sampling/importance_sampling_ratio/mean": 0.9679510593414307, "sampling/importance_sampling_ratio/min": 0.029157759621739388, "sampling/sampling_logp_difference/max": 1.878256916999817, "sampling/sampling_logp_difference/mean": 0.023665040731430054, "step": 1143, "step_time": 4.074456161004491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2106402558274567, "epoch": 0.01144, "grad_norm": 0.013298770412802696, "kl": 0.49773725494742393, "learning_rate": 9.999431851850363e-06, "loss": -0.0269, "step": 1144, "step_time": 2.02329397698486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2991638956591487, "epoch": 0.01145, "frac_reward_zero_std": 0.75, "grad_norm": 0.025933261960744858, "kl": 0.7497799247503281, "learning_rate": 9.999430825875353e-06, "loss": -0.0169, "num_tokens": 10469919.0, "reward": 0.8498749732971191, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.8498749732971191, "rewards/rollout_reward_func/std": 0.3206489086151123, "sampling/importance_sampling_ratio/max": 1.0048071146011353, "sampling/importance_sampling_ratio/mean": 0.9433795213699341, "sampling/importance_sampling_ratio/min": 0.00845843181014061, "sampling/sampling_logp_difference/max": 1.9829657077789307, "sampling/sampling_logp_difference/mean": 0.040987398475408554, "step": 1145, "step_time": 4.0958850829993025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2988892998546362, "epoch": 0.01146, "grad_norm": 0.02584177628159523, "kl": 0.7494764924049377, "learning_rate": 9.999429798974887e-06, "loss": -0.0169, "step": 1146, "step_time": 2.478047342992795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8904569908045232, "epoch": 0.01147, "frac_reward_zero_std": 0.25, "grad_norm": 0.02344593033194542, "kl": 0.6057889126241207, "learning_rate": 9.99942877114896e-06, "loss": -0.0544, "num_tokens": 10487023.0, "reward": 0.7522451877593994, "reward_std": 0.05695899948477745, "rewards/rollout_reward_func/mean": 0.7522451877593994, "rewards/rollout_reward_func/std": 0.43571266531944275, "sampling/importance_sampling_ratio/max": 1.0062761306762695, "sampling/importance_sampling_ratio/mean": 0.8702199459075928, "sampling/importance_sampling_ratio/min": 8.838868836846814e-08, "sampling/sampling_logp_difference/max": 3.69429874420166, "sampling/sampling_logp_difference/mean": 0.20657768845558167, "step": 1147, "step_time": 5.086063057009596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8917525177821517, "epoch": 0.01148, "grad_norm": 0.02303948439657688, "kl": 0.6065366119146347, "learning_rate": 9.999427742397575e-06, "loss": -0.0544, "step": 1148, "step_time": 2.0345856600033585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 94.75, "completions/mean_terminated_length": 94.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.081275450065732, "epoch": 0.01149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007636881782673299, "kl": 0.43014809861779213, "learning_rate": 9.999426712720733e-06, "loss": 0.0011, "num_tokens": 10501039.0, "reward": 0.623846173286438, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.623846173286438, "rewards/rollout_reward_func/std": 0.18917931616306305, "sampling/importance_sampling_ratio/max": 1.0065011978149414, "sampling/importance_sampling_ratio/mean": 0.998824417591095, "sampling/importance_sampling_ratio/min": 0.9903414845466614, "sampling/sampling_logp_difference/max": 0.005764400586485863, "sampling/sampling_logp_difference/mean": 0.001208821078762412, "step": 1149, "step_time": 3.833316778000153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08200145233422518, "epoch": 0.0115, "grad_norm": 0.000806904339697212, "kl": 0.4300071820616722, "learning_rate": 9.99942568211843e-06, "loss": 0.0011, "step": 1150, "step_time": 1.9919002140013617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 166.71875, "completions/mean_terminated_length": 166.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4680342832580209, "epoch": 0.01151, "frac_reward_zero_std": 0.75, "grad_norm": 0.008144514635205269, "kl": 0.5769087858498096, "learning_rate": 9.999424650590674e-06, "loss": -0.0481, "num_tokens": 10517902.0, "reward": 0.7285596132278442, "reward_std": 0.014538917690515518, "rewards/rollout_reward_func/mean": 0.7285596132278442, "rewards/rollout_reward_func/std": 0.13995592296123505, "sampling/importance_sampling_ratio/max": 0.9987074136734009, "sampling/importance_sampling_ratio/mean": 0.9295048713684082, "sampling/importance_sampling_ratio/min": 5.524234109028194e-15, "sampling/sampling_logp_difference/max": 14.778146743774414, "sampling/sampling_logp_difference/mean": 0.14627353847026825, "step": 1151, "step_time": 4.849711424001725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46800240594893694, "epoch": 0.01152, "grad_norm": 0.008086832240223885, "kl": 0.5755694583058357, "learning_rate": 9.999423618137458e-06, "loss": -0.0481, "step": 1152, "step_time": 2.9208706879944657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 184.46875, "completions/mean_terminated_length": 184.46875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.425358546897769, "epoch": 0.01153, "frac_reward_zero_std": 0.5, "grad_norm": 0.03580410033464432, "kl": 0.5610897392034531, "learning_rate": 9.999422584758785e-06, "loss": -0.0428, "num_tokens": 10535357.0, "reward": 0.879855751991272, "reward_std": 0.0442168228328228, "rewards/rollout_reward_func/mean": 0.879855751991272, "rewards/rollout_reward_func/std": 0.18575963377952576, "sampling/importance_sampling_ratio/max": 0.999929666519165, "sampling/importance_sampling_ratio/mean": 0.9062279462814331, "sampling/importance_sampling_ratio/min": 0.02066618949174881, "sampling/sampling_logp_difference/max": 1.900329351425171, "sampling/sampling_logp_difference/mean": 0.05226391926407814, "step": 1153, "step_time": 4.319128473995079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4259127830155194, "epoch": 0.01154, "grad_norm": 0.036620333790779114, "kl": 0.5554671101272106, "learning_rate": 9.999421550454654e-06, "loss": -0.0429, "step": 1154, "step_time": 2.028237847996934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.53125, "completions/mean_terminated_length": 189.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3867281605489552, "epoch": 0.01155, "frac_reward_zero_std": 0.75, "grad_norm": 0.006574264727532864, "kl": 0.5321907550096512, "learning_rate": 9.999420515225069e-06, "loss": -0.0208, "num_tokens": 10553414.0, "reward": 0.5247788429260254, "reward_std": 0.03163175657391548, "rewards/rollout_reward_func/mean": 0.5247788429260254, "rewards/rollout_reward_func/std": 0.14829595386981964, "sampling/importance_sampling_ratio/max": 1.0030486583709717, "sampling/importance_sampling_ratio/mean": 0.9347242116928101, "sampling/importance_sampling_ratio/min": 0.00431857630610466, "sampling/sampling_logp_difference/max": 1.906764268875122, "sampling/sampling_logp_difference/mean": 0.04440898820757866, "step": 1155, "step_time": 4.553004183995654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38398524140939116, "epoch": 0.01156, "grad_norm": 0.006915367674082518, "kl": 0.5271377414464951, "learning_rate": 9.999419479070025e-06, "loss": -0.0208, "step": 1156, "step_time": 2.0449091869959375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.49006027542054653, "epoch": 0.01157, "frac_reward_zero_std": 0.5, "grad_norm": 0.011989775113761425, "kl": 0.5611362867057323, "learning_rate": 9.999418441989527e-06, "loss": -0.0457, "num_tokens": 10569762.0, "reward": 0.6160721182823181, "reward_std": 0.06441470980644226, "rewards/rollout_reward_func/mean": 0.6160721182823181, "rewards/rollout_reward_func/std": 0.13739752769470215, "sampling/importance_sampling_ratio/max": 1.0027257204055786, "sampling/importance_sampling_ratio/mean": 0.9352695941925049, "sampling/importance_sampling_ratio/min": 1.1210468073841184e-05, "sampling/sampling_logp_difference/max": 3.395169734954834, "sampling/sampling_logp_difference/mean": 0.09312786906957626, "step": 1157, "step_time": 4.584825574995193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49040862265974283, "epoch": 0.01158, "grad_norm": 0.01150534674525261, "kl": 0.5603146553039551, "learning_rate": 9.999417403983573e-06, "loss": -0.0457, "step": 1158, "step_time": 2.5165691719957977 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 190.34375, "completions/mean_terminated_length": 190.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.738808314781636, "epoch": 0.01159, "frac_reward_zero_std": 0.5, "grad_norm": 0.0306021086871624, "kl": 0.49701661244034767, "learning_rate": 9.999416365052164e-06, "loss": -0.0116, "num_tokens": 10587133.0, "reward": 0.9221571683883667, "reward_std": 0.04934161901473999, "rewards/rollout_reward_func/mean": 0.9221571683883667, "rewards/rollout_reward_func/std": 0.3152892589569092, "sampling/importance_sampling_ratio/max": 1.0064961910247803, "sampling/importance_sampling_ratio/mean": 0.9052540063858032, "sampling/importance_sampling_ratio/min": 2.0939242085233462e-16, "sampling/sampling_logp_difference/max": 4.028665065765381, "sampling/sampling_logp_difference/mean": 0.22485589981079102, "step": 1159, "step_time": 4.677520091005135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7440899885259569, "epoch": 0.0116, "grad_norm": 0.030187852680683136, "kl": 0.4923914782702923, "learning_rate": 9.999415325195299e-06, "loss": -0.0116, "step": 1160, "step_time": 2.099727205983072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 99.65625, "completions/mean_terminated_length": 99.65625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.598904512822628, "epoch": 0.01161, "frac_reward_zero_std": 0.75, "grad_norm": 0.007594710681587458, "kl": 0.6827922612428665, "learning_rate": 9.999414284412979e-06, "loss": -0.0232, "num_tokens": 10601354.0, "reward": 0.7033653855323792, "reward_std": 0.011572751216590405, "rewards/rollout_reward_func/mean": 0.7033653855323792, "rewards/rollout_reward_func/std": 0.13757556676864624, "sampling/importance_sampling_ratio/max": 1.0027050971984863, "sampling/importance_sampling_ratio/mean": 0.9310207366943359, "sampling/importance_sampling_ratio/min": 1.675507232956619e-10, "sampling/sampling_logp_difference/max": 14.473714828491211, "sampling/sampling_logp_difference/mean": 0.2277272641658783, "step": 1161, "step_time": 4.0104518689913675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6056906310841441, "epoch": 0.01162, "grad_norm": 0.008238364942371845, "kl": 0.6797590926289558, "learning_rate": 9.999413242705202e-06, "loss": -0.0232, "step": 1162, "step_time": 2.022692543017911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 27.96875, "completions/mean_terminated_length": 27.838708877563477, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9960622489452362, "epoch": 0.01163, "frac_reward_zero_std": 0.5, "grad_norm": 0.010127237997949123, "kl": 0.8935102596879005, "learning_rate": 9.999412200071973e-06, "loss": -0.027, "num_tokens": 10614089.0, "reward": 0.735576868057251, "reward_std": 0.07343031466007233, "rewards/rollout_reward_func/mean": 0.735576868057251, "rewards/rollout_reward_func/std": 0.10262352973222733, "sampling/importance_sampling_ratio/max": 0.9965295791625977, "sampling/importance_sampling_ratio/mean": 0.9269661903381348, "sampling/importance_sampling_ratio/min": 1.034382474426007e-40, "sampling/sampling_logp_difference/max": 10.49483871459961, "sampling/sampling_logp_difference/mean": 0.8042776584625244, "step": 1163, "step_time": 4.193808763004199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9931189864873886, "epoch": 0.01164, "grad_norm": 0.010648425668478012, "kl": 0.8926163464784622, "learning_rate": 9.999411156513289e-06, "loss": -0.027, "step": 1164, "step_time": 2.4833064889899106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 94.25, "completions/mean_terminated_length": 94.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0971210952848196, "epoch": 0.01165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007821476901881397, "kl": 0.6091655716300011, "learning_rate": 9.999410112029152e-06, "loss": 0.0013, "num_tokens": 10628553.0, "reward": 0.7450000047683716, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7450000047683716, "rewards/rollout_reward_func/std": 0.24759720265865326, "sampling/importance_sampling_ratio/max": 1.0021439790725708, "sampling/importance_sampling_ratio/mean": 0.9942456483840942, "sampling/importance_sampling_ratio/min": 0.9777424335479736, "sampling/sampling_logp_difference/max": 0.019527051597833633, "sampling/sampling_logp_difference/mean": 0.0023277089931070805, "step": 1165, "step_time": 4.119361231001676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09511933289468288, "epoch": 0.01166, "grad_norm": 0.0007587405270896852, "kl": 0.6096073091030121, "learning_rate": 9.99940906661956e-06, "loss": 0.0013, "step": 1166, "step_time": 2.0157062329890323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 102.96875, "completions/mean_terminated_length": 102.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42842021118849516, "epoch": 0.01167, "frac_reward_zero_std": 0.5, "grad_norm": 0.03253079950809479, "kl": 0.6418186351656914, "learning_rate": 9.999408020284516e-06, "loss": 0.0292, "num_tokens": 10643440.0, "reward": 0.59745192527771, "reward_std": 0.01563793607056141, "rewards/rollout_reward_func/mean": 0.59745192527771, "rewards/rollout_reward_func/std": 0.13681435585021973, "sampling/importance_sampling_ratio/max": 1.0001343488693237, "sampling/importance_sampling_ratio/mean": 0.9337310194969177, "sampling/importance_sampling_ratio/min": 0.016729965806007385, "sampling/sampling_logp_difference/max": 2.231172561645508, "sampling/sampling_logp_difference/mean": 0.0628974586725235, "step": 1167, "step_time": 4.053311572992243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4249040950089693, "epoch": 0.01168, "grad_norm": 0.03296053782105446, "kl": 0.6352532655000687, "learning_rate": 9.999406973024017e-06, "loss": 0.0291, "step": 1168, "step_time": 2.025340796004457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 163.9375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.43627715902402997, "epoch": 0.01169, "frac_reward_zero_std": 0.5, "grad_norm": 0.018181681632995605, "kl": 0.4386141821742058, "learning_rate": 9.999405924838066e-06, "loss": -0.046, "num_tokens": 10659990.0, "reward": 0.7616826891899109, "reward_std": 0.022437043488025665, "rewards/rollout_reward_func/mean": 0.7616826891899109, "rewards/rollout_reward_func/std": 0.22272853553295135, "sampling/importance_sampling_ratio/max": 1.0035314559936523, "sampling/importance_sampling_ratio/mean": 0.9346451759338379, "sampling/importance_sampling_ratio/min": 0.0047808485105633736, "sampling/sampling_logp_difference/max": 1.9878525733947754, "sampling/sampling_logp_difference/mean": 0.05941501259803772, "step": 1169, "step_time": 4.9065339299995685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4315730957314372, "epoch": 0.0117, "grad_norm": 0.01879037544131279, "kl": 0.43288979679346085, "learning_rate": 9.999404875726661e-06, "loss": -0.046, "step": 1170, "step_time": 2.0578140090001398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5954338302835822, "epoch": 0.01171, "frac_reward_zero_std": 0.25, "grad_norm": 0.03189029544591904, "kl": 0.6111360043287277, "learning_rate": 9.999403825689805e-06, "loss": -0.0089, "num_tokens": 10676014.0, "reward": 0.7864576578140259, "reward_std": 0.04597282409667969, "rewards/rollout_reward_func/mean": 0.7864576578140259, "rewards/rollout_reward_func/std": 0.3928534984588623, "sampling/importance_sampling_ratio/max": 1.000801920890808, "sampling/importance_sampling_ratio/mean": 0.8996604681015015, "sampling/importance_sampling_ratio/min": 0.00670575350522995, "sampling/sampling_logp_difference/max": 2.325943946838379, "sampling/sampling_logp_difference/mean": 0.08933314681053162, "step": 1171, "step_time": 4.503485412009468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.593276135623455, "epoch": 0.01172, "grad_norm": 0.0328366756439209, "kl": 0.6021773368120193, "learning_rate": 9.999402774727496e-06, "loss": -0.0089, "step": 1172, "step_time": 2.097962767998979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 104.75, "completions/mean_terminated_length": 104.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2932454058900476, "epoch": 0.01173, "frac_reward_zero_std": 0.25, "grad_norm": 0.033043570816516876, "kl": 0.7205885276198387, "learning_rate": 9.999401722839737e-06, "loss": -0.0747, "num_tokens": 10691206.0, "reward": 0.9452173113822937, "reward_std": 0.093620665371418, "rewards/rollout_reward_func/mean": 0.9452173113822937, "rewards/rollout_reward_func/std": 0.2858430743217468, "sampling/importance_sampling_ratio/max": 1.0039409399032593, "sampling/importance_sampling_ratio/mean": 0.8402829766273499, "sampling/importance_sampling_ratio/min": 2.859225681110742e-18, "sampling/sampling_logp_difference/max": 10.328907012939453, "sampling/sampling_logp_difference/mean": 0.3929595947265625, "step": 1173, "step_time": 4.5113383299831185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2859543766826391, "epoch": 0.01174, "grad_norm": 0.03655305132269859, "kl": 0.7091130837798119, "learning_rate": 9.999400670026525e-06, "loss": -0.0746, "step": 1174, "step_time": 2.0612741590011865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4418956097215414, "epoch": 0.01175, "frac_reward_zero_std": 0.75, "grad_norm": 0.01348965335637331, "kl": 0.7295278906822205, "learning_rate": 9.999399616287859e-06, "loss": -0.0077, "num_tokens": 10707430.0, "reward": 0.7803750038146973, "reward_std": 0.06051202118396759, "rewards/rollout_reward_func/mean": 0.7803750038146973, "rewards/rollout_reward_func/std": 0.13451576232910156, "sampling/importance_sampling_ratio/max": 0.9994949698448181, "sampling/importance_sampling_ratio/mean": 0.9586965441703796, "sampling/importance_sampling_ratio/min": 0.001593185355886817, "sampling/sampling_logp_difference/max": 1.8017792701721191, "sampling/sampling_logp_difference/mean": 0.09823934733867645, "step": 1175, "step_time": 5.659998449998966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4412875920534134, "epoch": 0.01176, "grad_norm": 0.01413373276591301, "kl": 0.7283594906330109, "learning_rate": 9.999398561623746e-06, "loss": -0.0077, "step": 1176, "step_time": 2.032778822016553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 164.90625, "completions/mean_terminated_length": 164.90625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5973124923184514, "epoch": 0.01177, "frac_reward_zero_std": 0.75, "grad_norm": 0.00650778179988265, "kl": 0.5073144920170307, "learning_rate": 9.999397506034179e-06, "loss": -0.0271, "num_tokens": 10724019.0, "reward": 0.778048038482666, "reward_std": 0.04718577861785889, "rewards/rollout_reward_func/mean": 0.778048038482666, "rewards/rollout_reward_func/std": 0.2060512751340866, "sampling/importance_sampling_ratio/max": 1.0032087564468384, "sampling/importance_sampling_ratio/mean": 0.9375722408294678, "sampling/importance_sampling_ratio/min": 2.3832259950229417e-13, "sampling/sampling_logp_difference/max": 4.072694778442383, "sampling/sampling_logp_difference/mean": 0.1462843418121338, "step": 1177, "step_time": 4.577931815008924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5944534768350422, "epoch": 0.01178, "grad_norm": 0.00607994245365262, "kl": 0.5020606592297554, "learning_rate": 9.999396449519164e-06, "loss": -0.0271, "step": 1178, "step_time": 2.045677746005822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 208.40625, "completions/mean_terminated_length": 208.40625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.640562363434583, "epoch": 0.01179, "frac_reward_zero_std": 0.25, "grad_norm": 0.0288253091275692, "kl": 0.5172988288104534, "learning_rate": 9.999395392078698e-06, "loss": -0.0362, "num_tokens": 10741776.0, "reward": 0.5826009511947632, "reward_std": 0.027509169653058052, "rewards/rollout_reward_func/mean": 0.5826009511947632, "rewards/rollout_reward_func/std": 0.2122213989496231, "sampling/importance_sampling_ratio/max": 1.0048248767852783, "sampling/importance_sampling_ratio/mean": 0.9060184955596924, "sampling/importance_sampling_ratio/min": 5.6854627473512664e-05, "sampling/sampling_logp_difference/max": 2.3987510204315186, "sampling/sampling_logp_difference/mean": 0.08715242147445679, "step": 1179, "step_time": 4.499855914989894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6378503148443997, "epoch": 0.0118, "grad_norm": 0.029753193259239197, "kl": 0.5153133049607277, "learning_rate": 9.999394333712782e-06, "loss": -0.0363, "step": 1180, "step_time": 2.5243470759887714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06851712241768837, "epoch": 0.01181, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006234000320546329, "kl": 0.5014307424426079, "learning_rate": 9.999393274421414e-06, "loss": 0.0014, "num_tokens": 10757984.0, "reward": 0.8947307467460632, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8947307467460632, "rewards/rollout_reward_func/std": 0.35809287428855896, "sampling/importance_sampling_ratio/max": 1.0040974617004395, "sampling/importance_sampling_ratio/mean": 0.9956871271133423, "sampling/importance_sampling_ratio/min": 0.9769218564033508, "sampling/sampling_logp_difference/max": 0.0203079991042614, "sampling/sampling_logp_difference/mean": 0.0014830760192126036, "step": 1181, "step_time": 4.223120853006549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07018917007371783, "epoch": 0.01182, "grad_norm": 0.0006334480713121593, "kl": 0.5010273158550262, "learning_rate": 9.999392214204598e-06, "loss": 0.0014, "step": 1182, "step_time": 2.021273143000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 217.0625, "completions/mean_terminated_length": 217.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.47780757676810026, "epoch": 0.01183, "frac_reward_zero_std": 0.5, "grad_norm": 0.019314829260110855, "kl": 0.5076420269906521, "learning_rate": 9.999391153062331e-06, "loss": -0.0746, "num_tokens": 10775970.0, "reward": 1.0905817747116089, "reward_std": 0.06678079068660736, "rewards/rollout_reward_func/mean": 1.0905817747116089, "rewards/rollout_reward_func/std": 0.31439095735549927, "sampling/importance_sampling_ratio/max": 1.0042003393173218, "sampling/importance_sampling_ratio/mean": 0.934515655040741, "sampling/importance_sampling_ratio/min": 0.0015675036702305079, "sampling/sampling_logp_difference/max": 1.9540066719055176, "sampling/sampling_logp_difference/mean": 0.05397281050682068, "step": 1183, "step_time": 4.551236501989479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48165874276310205, "epoch": 0.01184, "grad_norm": 0.01875365898013115, "kl": 0.5093837305903435, "learning_rate": 9.999390090994617e-06, "loss": -0.0746, "step": 1184, "step_time": 2.0395634770175093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.90625, "completions/mean_terminated_length": 148.90625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.36983491759747267, "epoch": 0.01185, "frac_reward_zero_std": 0.75, "grad_norm": 0.01451386883854866, "kl": 0.4620417580008507, "learning_rate": 9.999389028001453e-06, "loss": -0.0269, "num_tokens": 10792183.0, "reward": 0.882567286491394, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.882567286491394, "rewards/rollout_reward_func/std": 0.2774842083454132, "sampling/importance_sampling_ratio/max": 1.0032827854156494, "sampling/importance_sampling_ratio/mean": 0.9362464547157288, "sampling/importance_sampling_ratio/min": 0.018665779381990433, "sampling/sampling_logp_difference/max": 1.5990500450134277, "sampling/sampling_logp_difference/mean": 0.046050023287534714, "step": 1185, "step_time": 4.125786873002653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36952716996893287, "epoch": 0.01186, "grad_norm": 0.014103253372013569, "kl": 0.46568114310503006, "learning_rate": 9.999387964082844e-06, "loss": -0.0269, "step": 1186, "step_time": 2.910235860006651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 164.22579956054688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0867185601964593, "epoch": 0.01187, "frac_reward_zero_std": 1.0, "grad_norm": 0.003662376431748271, "kl": 0.5669864043593407, "learning_rate": 9.999386899238782e-06, "loss": 0.0018, "num_tokens": 10808455.0, "reward": 0.6881923079490662, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6881923079490662, "rewards/rollout_reward_func/std": 0.13435354828834534, "sampling/importance_sampling_ratio/max": 1.0018821954727173, "sampling/importance_sampling_ratio/mean": 0.9889390468597412, "sampling/importance_sampling_ratio/min": 0.8799756765365601, "sampling/sampling_logp_difference/max": 0.1261882781982422, "sampling/sampling_logp_difference/mean": 0.0024359067901968956, "step": 1187, "step_time": 4.563391214011062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08564967243000865, "epoch": 0.01188, "grad_norm": 0.003945288248360157, "kl": 0.5688076317310333, "learning_rate": 9.999385833469273e-06, "loss": 0.0018, "step": 1188, "step_time": 2.0310047539969673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 142.78125, "completions/mean_terminated_length": 142.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19766423851251602, "epoch": 0.01189, "frac_reward_zero_std": 0.75, "grad_norm": 0.027276746928691864, "kl": 0.5416349731385708, "learning_rate": 9.999384766774318e-06, "loss": -0.0258, "num_tokens": 10824664.0, "reward": 0.6375480890274048, "reward_std": 0.019989363849163055, "rewards/rollout_reward_func/mean": 0.6375480890274048, "rewards/rollout_reward_func/std": 0.1383914053440094, "sampling/importance_sampling_ratio/max": 1.003785490989685, "sampling/importance_sampling_ratio/mean": 0.968830943107605, "sampling/importance_sampling_ratio/min": 0.07689797878265381, "sampling/sampling_logp_difference/max": 1.0852763652801514, "sampling/sampling_logp_difference/mean": 0.01487124152481556, "step": 1189, "step_time": 4.268838671014237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1982725476846099, "epoch": 0.0119, "grad_norm": 0.0267736054956913, "kl": 0.5482886582612991, "learning_rate": 9.999383699153913e-06, "loss": -0.0258, "step": 1190, "step_time": 2.0260795350113767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1247544228099287, "epoch": 0.01191, "frac_reward_zero_std": 0.5, "grad_norm": 0.010982447303831577, "kl": 0.641926784068346, "learning_rate": 9.999382630608064e-06, "loss": 0.0007, "num_tokens": 10840040.0, "reward": 0.9418509602546692, "reward_std": 0.07466636598110199, "rewards/rollout_reward_func/mean": 0.9418509602546692, "rewards/rollout_reward_func/std": 0.23516881465911865, "sampling/importance_sampling_ratio/max": 1.0046952962875366, "sampling/importance_sampling_ratio/mean": 0.8989559412002563, "sampling/importance_sampling_ratio/min": 2.1494435450189775e-23, "sampling/sampling_logp_difference/max": 3.5039114952087402, "sampling/sampling_logp_difference/mean": 0.3725103735923767, "step": 1191, "step_time": 4.3926740099996096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1284599397331476, "epoch": 0.01192, "grad_norm": 0.0101572684943676, "kl": 0.6237366087734699, "learning_rate": 9.999381561136765e-06, "loss": 0.0007, "step": 1192, "step_time": 2.4972417620010674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 171.84375, "completions/mean_terminated_length": 171.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24374982016161084, "epoch": 0.01193, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036530906800180674, "kl": 0.4462158754467964, "learning_rate": 9.99938049074002e-06, "loss": 0.0016, "num_tokens": 10856955.0, "reward": 0.7632692456245422, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7632692456245422, "rewards/rollout_reward_func/std": 0.2629426121711731, "sampling/importance_sampling_ratio/max": 1.0026400089263916, "sampling/importance_sampling_ratio/mean": 0.9655710458755493, "sampling/importance_sampling_ratio/min": 0.01168457418680191, "sampling/sampling_logp_difference/max": 1.2985286712646484, "sampling/sampling_logp_difference/mean": 0.02980091981589794, "step": 1193, "step_time": 4.475109314997098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2454701722599566, "epoch": 0.01194, "grad_norm": 0.0038376012817025185, "kl": 0.4465531148016453, "learning_rate": 9.99937941941783e-06, "loss": 0.0016, "step": 1194, "step_time": 2.023049020994222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3276723772287369, "epoch": 0.01195, "frac_reward_zero_std": 0.75, "grad_norm": 0.009017233736813068, "kl": 0.501227181404829, "learning_rate": 9.999378347170195e-06, "loss": -0.0179, "num_tokens": 10872333.0, "reward": 0.8896153569221497, "reward_std": 0.017949635162949562, "rewards/rollout_reward_func/mean": 0.8896153569221497, "rewards/rollout_reward_func/std": 0.20032881200313568, "sampling/importance_sampling_ratio/max": 1.0085138082504272, "sampling/importance_sampling_ratio/mean": 0.9675531387329102, "sampling/importance_sampling_ratio/min": 0.0006666624685749412, "sampling/sampling_logp_difference/max": 2.1612114906311035, "sampling/sampling_logp_difference/mean": 0.05991463363170624, "step": 1195, "step_time": 3.944151416006207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32326422492042184, "epoch": 0.01196, "grad_norm": 0.009423879906535149, "kl": 0.5000810660421848, "learning_rate": 9.999377273997111e-06, "loss": -0.0179, "step": 1196, "step_time": 2.0098616060131462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.25377212883904576, "epoch": 0.01197, "frac_reward_zero_std": 0.75, "grad_norm": 0.012657172046601772, "kl": 0.5320038758218288, "learning_rate": 9.999376199898583e-06, "loss": -0.0268, "num_tokens": 10889713.0, "reward": 0.6049038767814636, "reward_std": 0.023116955533623695, "rewards/rollout_reward_func/mean": 0.6049038767814636, "rewards/rollout_reward_func/std": 0.229365274310112, "sampling/importance_sampling_ratio/max": 0.9999564290046692, "sampling/importance_sampling_ratio/mean": 0.9632288217544556, "sampling/importance_sampling_ratio/min": 0.0005985168972983956, "sampling/sampling_logp_difference/max": 1.8559300899505615, "sampling/sampling_logp_difference/mean": 0.03898981213569641, "step": 1197, "step_time": 5.223366956997779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25522466050460935, "epoch": 0.01198, "grad_norm": 0.011966047808527946, "kl": 0.5330194979906082, "learning_rate": 9.99937512487461e-06, "loss": -0.0268, "step": 1198, "step_time": 2.0620582939955057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.059020685497671366, "epoch": 0.01199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007221496780402958, "kl": 0.43837444111704826, "learning_rate": 9.99937404892519e-06, "loss": 0.0014, "num_tokens": 10906049.0, "reward": 0.6966538429260254, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6966538429260254, "rewards/rollout_reward_func/std": 0.2324121743440628, "sampling/importance_sampling_ratio/max": 1.002932071685791, "sampling/importance_sampling_ratio/mean": 0.9975934624671936, "sampling/importance_sampling_ratio/min": 0.9841580390930176, "sampling/sampling_logp_difference/max": 0.013919949531555176, "sampling/sampling_logp_difference/mean": 0.0012177706230431795, "step": 1199, "step_time": 4.071786913002143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057927549816668034, "epoch": 0.012, "grad_norm": 0.0006814642692916095, "kl": 0.438582856208086, "learning_rate": 9.999372972050326e-06, "loss": 0.0014, "step": 1200, "step_time": 2.023338214006799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05914475163444877, "epoch": 0.01201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006010346114635468, "kl": 0.4711233749985695, "learning_rate": 9.999371894250018e-06, "loss": 0.0013, "num_tokens": 10921321.0, "reward": 0.8573076725006104, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8573076725006104, "rewards/rollout_reward_func/std": 0.24873675405979156, "sampling/importance_sampling_ratio/max": 1.0034116506576538, "sampling/importance_sampling_ratio/mean": 0.9974640607833862, "sampling/importance_sampling_ratio/min": 0.9813820719718933, "sampling/sampling_logp_difference/max": 0.01453983411192894, "sampling/sampling_logp_difference/mean": 0.00118572311475873, "step": 1201, "step_time": 4.061353846998827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05863392446190119, "epoch": 0.01202, "grad_norm": 0.0005901700933463871, "kl": 0.4712343029677868, "learning_rate": 9.999370815524266e-06, "loss": 0.0013, "step": 1202, "step_time": 2.027045939998061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 129.78125, "completions/mean_terminated_length": 129.78125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6528825243003666, "epoch": 0.01203, "frac_reward_zero_std": 0.5, "grad_norm": 0.023056669160723686, "kl": 0.5280248746275902, "learning_rate": 9.999369735873068e-06, "loss": 0.0401, "num_tokens": 10936706.0, "reward": 0.7461514472961426, "reward_std": 0.03371675685048103, "rewards/rollout_reward_func/mean": 0.7461514472961426, "rewards/rollout_reward_func/std": 0.16321876645088196, "sampling/importance_sampling_ratio/max": 1.0016019344329834, "sampling/importance_sampling_ratio/mean": 0.9347168207168579, "sampling/importance_sampling_ratio/min": 1.8068212481736623e-09, "sampling/sampling_logp_difference/max": 3.4902005195617676, "sampling/sampling_logp_difference/mean": 0.1869802474975586, "step": 1203, "step_time": 5.195315887998731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6509154080413282, "epoch": 0.01204, "grad_norm": 0.020467078313231468, "kl": 0.5138396807014942, "learning_rate": 9.999368655296428e-06, "loss": 0.0401, "step": 1204, "step_time": 2.0771295790182194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 151.1875, "completions/mean_terminated_length": 151.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5796750825829804, "epoch": 0.01205, "frac_reward_zero_std": 0.75, "grad_norm": 0.03784632310271263, "kl": 0.7196575291454792, "learning_rate": 9.999367573794344e-06, "loss": -0.0353, "num_tokens": 10952664.0, "reward": 0.6976812481880188, "reward_std": 0.05387623608112335, "rewards/rollout_reward_func/mean": 0.6976812481880188, "rewards/rollout_reward_func/std": 0.34179621934890747, "sampling/importance_sampling_ratio/max": 0.9995234608650208, "sampling/importance_sampling_ratio/mean": 0.932341456413269, "sampling/importance_sampling_ratio/min": 7.505593369039332e-10, "sampling/sampling_logp_difference/max": 13.912185668945312, "sampling/sampling_logp_difference/mean": 0.19614815711975098, "step": 1205, "step_time": 4.257209166986286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5779332788661122, "epoch": 0.01206, "grad_norm": 0.03715619072318077, "kl": 0.7137230336666107, "learning_rate": 9.999366491366816e-06, "loss": -0.0353, "step": 1206, "step_time": 2.0425052140271873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2370577333495021, "epoch": 0.01207, "frac_reward_zero_std": 0.75, "grad_norm": 0.0050993990153074265, "kl": 0.5099678225815296, "learning_rate": 9.999365408013845e-06, "loss": -0.0175, "num_tokens": 10969832.0, "reward": 0.9480769634246826, "reward_std": 0.0353553369641304, "rewards/rollout_reward_func/mean": 0.9480769634246826, "rewards/rollout_reward_func/std": 0.2166612446308136, "sampling/importance_sampling_ratio/max": 1.0018874406814575, "sampling/importance_sampling_ratio/mean": 0.9644989967346191, "sampling/importance_sampling_ratio/min": 0.0032276820857077837, "sampling/sampling_logp_difference/max": 1.9403901100158691, "sampling/sampling_logp_difference/mean": 0.036247722804546356, "step": 1207, "step_time": 4.246677996008657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2378124762326479, "epoch": 0.01208, "grad_norm": 0.004805928096175194, "kl": 0.5066631063818932, "learning_rate": 9.999364323735433e-06, "loss": -0.0175, "step": 1208, "step_time": 2.0217232929935562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.03784025786444545, "epoch": 0.01209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004097824858035892, "kl": 0.43087152391672134, "learning_rate": 9.999363238531578e-06, "loss": 0.0015, "num_tokens": 10986688.0, "reward": 0.9768462181091309, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9768462181091309, "rewards/rollout_reward_func/std": 0.21539956331253052, "sampling/importance_sampling_ratio/max": 1.0045030117034912, "sampling/importance_sampling_ratio/mean": 0.9993345141410828, "sampling/importance_sampling_ratio/min": 0.9947483539581299, "sampling/sampling_logp_difference/max": 0.006040716543793678, "sampling/sampling_logp_difference/mean": 0.0007711101789027452, "step": 1209, "step_time": 5.126436431011825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03848612541332841, "epoch": 0.0121, "grad_norm": 0.00042544628377072513, "kl": 0.4307514950633049, "learning_rate": 9.99936215240228e-06, "loss": 0.0015, "step": 1210, "step_time": 2.0300723709951853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22336465353146195, "epoch": 0.01211, "frac_reward_zero_std": 0.75, "grad_norm": 0.01101657934486866, "kl": 0.5023615583777428, "learning_rate": 9.999361065347541e-06, "loss": -0.0169, "num_tokens": 11003060.0, "reward": 0.6463462114334106, "reward_std": 0.027196412906050682, "rewards/rollout_reward_func/mean": 0.6463462114334106, "rewards/rollout_reward_func/std": 0.3770715892314911, "sampling/importance_sampling_ratio/max": 1.0043138265609741, "sampling/importance_sampling_ratio/mean": 0.9448680877685547, "sampling/importance_sampling_ratio/min": 0.029689930379390717, "sampling/sampling_logp_difference/max": 1.6847888231277466, "sampling/sampling_logp_difference/mean": 0.031136931851506233, "step": 1211, "step_time": 4.080246368990629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22435889998450875, "epoch": 0.01212, "grad_norm": 0.010601844638586044, "kl": 0.49267033487558365, "learning_rate": 9.99935997736736e-06, "loss": -0.0169, "step": 1212, "step_time": 2.0338802580008633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.35246339766308665, "epoch": 0.01213, "frac_reward_zero_std": 0.75, "grad_norm": 0.015586853958666325, "kl": 0.4368526004254818, "learning_rate": 9.999358888461737e-06, "loss": -0.0269, "num_tokens": 11020928.0, "reward": 0.9230333566665649, "reward_std": 0.04918566718697548, "rewards/rollout_reward_func/mean": 0.9230333566665649, "rewards/rollout_reward_func/std": 0.48255521059036255, "sampling/importance_sampling_ratio/max": 0.9999396204948425, "sampling/importance_sampling_ratio/mean": 0.9665011167526245, "sampling/importance_sampling_ratio/min": 3.244519675149604e-08, "sampling/sampling_logp_difference/max": 2.314114570617676, "sampling/sampling_logp_difference/mean": 0.09630873054265976, "step": 1213, "step_time": 4.7952797660036595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35091276932507753, "epoch": 0.01214, "grad_norm": 0.015039956197142601, "kl": 0.4347536265850067, "learning_rate": 9.999357798630673e-06, "loss": -0.027, "step": 1214, "step_time": 2.959794583002804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25502987997606397, "epoch": 0.01215, "frac_reward_zero_std": 0.75, "grad_norm": 0.016218412667512894, "kl": 0.5680277496576309, "learning_rate": 9.99935670787417e-06, "loss": -0.0171, "num_tokens": 11037384.0, "reward": 0.7472115755081177, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.7472115755081177, "rewards/rollout_reward_func/std": 0.24430586397647858, "sampling/importance_sampling_ratio/max": 1.000018835067749, "sampling/importance_sampling_ratio/mean": 0.9601600766181946, "sampling/importance_sampling_ratio/min": 0.0016399887390434742, "sampling/sampling_logp_difference/max": 1.6907947063446045, "sampling/sampling_logp_difference/mean": 0.0399373359978199, "step": 1215, "step_time": 4.153255026998522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24898658134043217, "epoch": 0.01216, "grad_norm": 0.014758897945284843, "kl": 0.5695502236485481, "learning_rate": 9.999355616192225e-06, "loss": -0.0171, "step": 1216, "step_time": 2.031174273994111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07356284558773041, "epoch": 0.01217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009099915623664856, "kl": 0.5575951859354973, "learning_rate": 9.99935452358484e-06, "loss": 0.0017, "num_tokens": 11053400.0, "reward": 0.6715384721755981, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6715384721755981, "rewards/rollout_reward_func/std": 0.07125735282897949, "sampling/importance_sampling_ratio/max": 1.0011978149414062, "sampling/importance_sampling_ratio/mean": 0.9950479865074158, "sampling/importance_sampling_ratio/min": 0.9864206910133362, "sampling/sampling_logp_difference/max": 0.00916818156838417, "sampling/sampling_logp_difference/mean": 0.0014495104551315308, "step": 1217, "step_time": 4.236706170995603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07311192341148853, "epoch": 0.01218, "grad_norm": 0.0009216597536578774, "kl": 0.5576609075069427, "learning_rate": 9.999353430052015e-06, "loss": 0.0017, "step": 1218, "step_time": 2.0226558840149664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 62.71875, "completions/mean_terminated_length": 62.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9478963632136583, "epoch": 0.01219, "frac_reward_zero_std": 0.25, "grad_norm": 0.03062613680958748, "kl": 0.8489831760525703, "learning_rate": 9.999352335593749e-06, "loss": 0.0139, "num_tokens": 11066655.0, "reward": 0.8471298217773438, "reward_std": 0.028325071558356285, "rewards/rollout_reward_func/mean": 0.8471298217773438, "rewards/rollout_reward_func/std": 0.10571067035198212, "sampling/importance_sampling_ratio/max": 1.0020008087158203, "sampling/importance_sampling_ratio/mean": 0.9029865264892578, "sampling/importance_sampling_ratio/min": 1.2121430580969183e-16, "sampling/sampling_logp_difference/max": 3.147815465927124, "sampling/sampling_logp_difference/mean": 0.34645724296569824, "step": 1219, "step_time": 3.955272506987967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9513516873121262, "epoch": 0.0122, "grad_norm": 0.028800267726182938, "kl": 0.8710706941783428, "learning_rate": 9.999351240210043e-06, "loss": 0.0138, "step": 1220, "step_time": 2.5024442309950246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 106.125, "completions/mean_terminated_length": 106.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8847491890192032, "epoch": 0.01221, "frac_reward_zero_std": 0.5, "grad_norm": 0.019278379157185555, "kl": 0.8393009081482887, "learning_rate": 9.9993501439009e-06, "loss": -0.0581, "num_tokens": 11081755.0, "reward": 0.7477307319641113, "reward_std": 0.1186627596616745, "rewards/rollout_reward_func/mean": 0.7477307319641113, "rewards/rollout_reward_func/std": 0.34410789608955383, "sampling/importance_sampling_ratio/max": 1.0062505006790161, "sampling/importance_sampling_ratio/mean": 0.8424739837646484, "sampling/importance_sampling_ratio/min": 0.001588331419043243, "sampling/sampling_logp_difference/max": 2.366504669189453, "sampling/sampling_logp_difference/mean": 0.13386529684066772, "step": 1221, "step_time": 4.202449163996789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8805596372112632, "epoch": 0.01222, "grad_norm": 0.018846718594431877, "kl": 0.8362123146653175, "learning_rate": 9.999349046666318e-06, "loss": -0.0582, "step": 1222, "step_time": 2.025390836992301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7035752050578594, "epoch": 0.01223, "frac_reward_zero_std": 0.5, "grad_norm": 0.012458647601306438, "kl": 0.5236372835934162, "learning_rate": 9.999347948506298e-06, "loss": -0.0652, "num_tokens": 11098555.0, "reward": 0.6412500143051147, "reward_std": 0.02561761438846588, "rewards/rollout_reward_func/mean": 0.6412500143051147, "rewards/rollout_reward_func/std": 0.2562359869480133, "sampling/importance_sampling_ratio/max": 1.0172580480575562, "sampling/importance_sampling_ratio/mean": 0.8488256931304932, "sampling/importance_sampling_ratio/min": 9.794841753318906e-05, "sampling/sampling_logp_difference/max": 3.5418248176574707, "sampling/sampling_logp_difference/mean": 0.13298147916793823, "step": 1223, "step_time": 4.2778126079938374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.702574563678354, "epoch": 0.01224, "grad_norm": 0.011941468343138695, "kl": 0.5284387022256851, "learning_rate": 9.999346849420837e-06, "loss": -0.0651, "step": 1224, "step_time": 2.037825219988008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.08354502450674772, "epoch": 0.01225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009814525255933404, "kl": 0.5681874826550484, "learning_rate": 9.99934574940994e-06, "loss": 0.0017, "num_tokens": 11114291.0, "reward": 0.8412692546844482, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8412692546844482, "rewards/rollout_reward_func/std": 0.3131154477596283, "sampling/importance_sampling_ratio/max": 0.9990978240966797, "sampling/importance_sampling_ratio/mean": 0.9936178922653198, "sampling/importance_sampling_ratio/min": 0.9871581792831421, "sampling/sampling_logp_difference/max": 0.007689911872148514, "sampling/sampling_logp_difference/mean": 0.0014409133000299335, "step": 1225, "step_time": 4.3044748670072295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08307120576500893, "epoch": 0.01226, "grad_norm": 0.0009608520194888115, "kl": 0.5683108270168304, "learning_rate": 9.999344648473603e-06, "loss": 0.0017, "step": 1226, "step_time": 2.9534371999980067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 210.21875, "completions/mean_terminated_length": 210.21875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.18734991736710072, "epoch": 0.01227, "frac_reward_zero_std": 0.75, "grad_norm": 0.008217108435928822, "kl": 0.45724892988801, "learning_rate": 9.99934354661183e-06, "loss": -0.0364, "num_tokens": 11132594.0, "reward": 0.8529374599456787, "reward_std": 0.027917128056287766, "rewards/rollout_reward_func/mean": 0.8529374599456787, "rewards/rollout_reward_func/std": 0.4250761866569519, "sampling/importance_sampling_ratio/max": 1.0017403364181519, "sampling/importance_sampling_ratio/mean": 0.967853307723999, "sampling/importance_sampling_ratio/min": 0.009105589240789413, "sampling/sampling_logp_difference/max": 1.6988379955291748, "sampling/sampling_logp_difference/mean": 0.023064767941832542, "step": 1227, "step_time": 4.6350430349993985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18608665140345693, "epoch": 0.01228, "grad_norm": 0.008458340540528297, "kl": 0.4578236937522888, "learning_rate": 9.99934244382462e-06, "loss": -0.0364, "step": 1228, "step_time": 2.036095262999879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 231.5625, "completions/mean_terminated_length": 231.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24212877172976732, "epoch": 0.01229, "frac_reward_zero_std": 0.75, "grad_norm": 0.010727448388934135, "kl": 0.3976169377565384, "learning_rate": 9.999341340111972e-06, "loss": -0.0273, "num_tokens": 11151396.0, "reward": 0.7773351073265076, "reward_std": 0.047896966338157654, "rewards/rollout_reward_func/mean": 0.7773351073265076, "rewards/rollout_reward_func/std": 0.4535161852836609, "sampling/importance_sampling_ratio/max": 1.0007022619247437, "sampling/importance_sampling_ratio/mean": 0.9668436646461487, "sampling/importance_sampling_ratio/min": 5.821954982820898e-05, "sampling/sampling_logp_difference/max": 1.6711716651916504, "sampling/sampling_logp_difference/mean": 0.043820369988679886, "step": 1229, "step_time": 4.886395323002944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24236859614029527, "epoch": 0.0123, "grad_norm": 0.010696537792682648, "kl": 0.3980254493653774, "learning_rate": 9.999340235473887e-06, "loss": -0.0273, "step": 1230, "step_time": 2.1374177150064497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 123.09375, "completions/mean_terminated_length": 123.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4303672960959375, "epoch": 0.01231, "frac_reward_zero_std": 0.75, "grad_norm": 0.011414720676839352, "kl": 0.5798693709075451, "learning_rate": 9.999339129910366e-06, "loss": -0.0215, "num_tokens": 11166967.0, "reward": 0.6483653783798218, "reward_std": 0.06477372348308563, "rewards/rollout_reward_func/mean": 0.6483653783798218, "rewards/rollout_reward_func/std": 0.1707703173160553, "sampling/importance_sampling_ratio/max": 1.0031050443649292, "sampling/importance_sampling_ratio/mean": 0.9372475743293762, "sampling/importance_sampling_ratio/min": 0.004643870517611504, "sampling/sampling_logp_difference/max": 1.6685585975646973, "sampling/sampling_logp_difference/mean": 0.06300287693738937, "step": 1231, "step_time": 4.560506130008434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42873527109622955, "epoch": 0.01232, "grad_norm": 0.011932007037103176, "kl": 0.580218855291605, "learning_rate": 9.99933802342141e-06, "loss": -0.0215, "step": 1232, "step_time": 2.456010542002332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 176.21875, "completions/mean_terminated_length": 176.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6518702208995819, "epoch": 0.01233, "frac_reward_zero_std": 0.25, "grad_norm": 0.02617330104112625, "kl": 0.4672785773873329, "learning_rate": 9.999336916007016e-06, "loss": -0.0645, "num_tokens": 11183862.0, "reward": 0.8439615964889526, "reward_std": 0.07669389247894287, "rewards/rollout_reward_func/mean": 0.8439615964889526, "rewards/rollout_reward_func/std": 0.43712085485458374, "sampling/importance_sampling_ratio/max": 1.0064284801483154, "sampling/importance_sampling_ratio/mean": 0.9085307121276855, "sampling/importance_sampling_ratio/min": 7.2285920396097936e-06, "sampling/sampling_logp_difference/max": 2.0962867736816406, "sampling/sampling_logp_difference/mean": 0.10466904193162918, "step": 1233, "step_time": 4.739689908004948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6567657105624676, "epoch": 0.01234, "grad_norm": 0.025089137256145477, "kl": 0.4680766239762306, "learning_rate": 9.999335807667186e-06, "loss": -0.0645, "step": 1234, "step_time": 2.075546246000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.038711695931851864, "epoch": 0.01235, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042447459418326616, "kl": 0.44127291813492775, "learning_rate": 9.999334698401922e-06, "loss": 0.0015, "num_tokens": 11200366.0, "reward": 1.0180000066757202, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0180000066757202, "rewards/rollout_reward_func/std": 0.25660577416419983, "sampling/importance_sampling_ratio/max": 1.0067963600158691, "sampling/importance_sampling_ratio/mean": 1.001269817352295, "sampling/importance_sampling_ratio/min": 0.9958319067955017, "sampling/sampling_logp_difference/max": 0.0063263047486543655, "sampling/sampling_logp_difference/mean": 0.001073999796062708, "step": 1235, "step_time": 4.334161884013156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.037647501565515995, "epoch": 0.01236, "grad_norm": 0.0004076192562934011, "kl": 0.4414347745478153, "learning_rate": 9.999333588211223e-06, "loss": 0.0015, "step": 1236, "step_time": 2.028459948996897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 148.28125, "completions/mean_terminated_length": 146.77418518066406, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5198290231637657, "epoch": 0.01237, "frac_reward_zero_std": 0.5, "grad_norm": 0.019574493169784546, "kl": 0.5889922827482224, "learning_rate": 9.999332477095089e-06, "loss": -0.0539, "num_tokens": 11216663.0, "reward": 0.7935577034950256, "reward_std": 0.06555307656526566, "rewards/rollout_reward_func/mean": 0.7935577034950256, "rewards/rollout_reward_func/std": 0.2688158452510834, "sampling/importance_sampling_ratio/max": 1.002153754234314, "sampling/importance_sampling_ratio/mean": 0.9044978022575378, "sampling/importance_sampling_ratio/min": 0.007103620562702417, "sampling/sampling_logp_difference/max": 1.9470947980880737, "sampling/sampling_logp_difference/mean": 0.07560553401708603, "step": 1237, "step_time": 5.047069624997675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5153036308474839, "epoch": 0.01238, "grad_norm": 0.018328843638300896, "kl": 0.5930757112801075, "learning_rate": 9.99933136505352e-06, "loss": -0.0539, "step": 1238, "step_time": 2.029949575997307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.13101808028295636, "epoch": 0.01239, "frac_reward_zero_std": 0.75, "grad_norm": 0.01862741820514202, "kl": 0.8337118104100227, "learning_rate": 9.999330252086517e-06, "loss": -0.0253, "num_tokens": 11232777.0, "reward": 0.7638990879058838, "reward_std": 0.014318910427391529, "rewards/rollout_reward_func/mean": 0.7638990879058838, "rewards/rollout_reward_func/std": 0.21639476716518402, "sampling/importance_sampling_ratio/max": 1.0027743577957153, "sampling/importance_sampling_ratio/mean": 0.9693325161933899, "sampling/importance_sampling_ratio/min": 0.07050430774688721, "sampling/sampling_logp_difference/max": 2.59067440032959, "sampling/sampling_logp_difference/mean": 0.01778041385114193, "step": 1239, "step_time": 4.055991213004745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12942512752488256, "epoch": 0.0124, "grad_norm": 0.01850852183997631, "kl": 0.8385578766465187, "learning_rate": 9.99932913819408e-06, "loss": -0.0253, "step": 1240, "step_time": 2.035723787994357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 77.5, "completions/mean_terminated_length": 77.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10428497940301895, "epoch": 0.01241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010077408514916897, "kl": 0.6732380464673042, "learning_rate": 9.99932802337621e-06, "loss": 0.0013, "num_tokens": 11246881.0, "reward": 0.7496153712272644, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7496153712272644, "rewards/rollout_reward_func/std": 0.04318135604262352, "sampling/importance_sampling_ratio/max": 1.0056662559509277, "sampling/importance_sampling_ratio/mean": 0.9919947981834412, "sampling/importance_sampling_ratio/min": 0.9803394675254822, "sampling/sampling_logp_difference/max": 0.017950981855392456, "sampling/sampling_logp_difference/mean": 0.0032872601877897978, "step": 1241, "step_time": 4.094883800004027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10320215206593275, "epoch": 0.01242, "grad_norm": 0.0009940041927620769, "kl": 0.6734327003359795, "learning_rate": 9.999326907632905e-06, "loss": 0.0013, "step": 1242, "step_time": 2.023498672991991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.03478498477488756, "epoch": 0.01243, "frac_reward_zero_std": 1.0, "grad_norm": 0.00036805617855861783, "kl": 0.4003814607858658, "learning_rate": 9.999325790964166e-06, "loss": 0.0015, "num_tokens": 11264657.0, "reward": 1.0095384120941162, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0095384120941162, "rewards/rollout_reward_func/std": 0.35438552498817444, "sampling/importance_sampling_ratio/max": 1.0018583536148071, "sampling/importance_sampling_ratio/mean": 0.9978867769241333, "sampling/importance_sampling_ratio/min": 0.9959188103675842, "sampling/sampling_logp_difference/max": 0.0028289910405874252, "sampling/sampling_logp_difference/mean": 0.0006056607235223055, "step": 1243, "step_time": 5.274691362006706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.034599816892296076, "epoch": 0.01244, "grad_norm": 0.0003675843181554228, "kl": 0.4004100635647774, "learning_rate": 9.999324673369997e-06, "loss": 0.0015, "step": 1244, "step_time": 2.037800026999321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.051919064950197935, "epoch": 0.01245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005551741342060268, "kl": 0.5789388343691826, "learning_rate": 9.999323554850393e-06, "loss": 0.0018, "num_tokens": 11281433.0, "reward": 0.6642307639122009, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6642307639122009, "rewards/rollout_reward_func/std": 0.07355090975761414, "sampling/importance_sampling_ratio/max": 1.003705620765686, "sampling/importance_sampling_ratio/mean": 0.999096155166626, "sampling/importance_sampling_ratio/min": 0.9911792874336243, "sampling/sampling_logp_difference/max": 0.003996133804321289, "sampling/sampling_logp_difference/mean": 0.0010032763238996267, "step": 1245, "step_time": 4.054900247007026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.051416438072919846, "epoch": 0.01246, "grad_norm": 0.0005557690165005624, "kl": 0.5790190622210503, "learning_rate": 9.999322435405358e-06, "loss": 0.0018, "step": 1246, "step_time": 2.04219468098745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23824748629704118, "epoch": 0.01247, "frac_reward_zero_std": 0.75, "grad_norm": 0.0034826418850570917, "kl": 0.5341657921671867, "learning_rate": 9.99932131503489e-06, "loss": -0.0174, "num_tokens": 11297431.0, "reward": 0.6874038577079773, "reward_std": 0.020397311076521873, "rewards/rollout_reward_func/mean": 0.6874038577079773, "rewards/rollout_reward_func/std": 0.3235221803188324, "sampling/importance_sampling_ratio/max": 1.003733515739441, "sampling/importance_sampling_ratio/mean": 0.9693856835365295, "sampling/importance_sampling_ratio/min": 0.014147588983178139, "sampling/sampling_logp_difference/max": 1.7479619979858398, "sampling/sampling_logp_difference/mean": 0.02741965837776661, "step": 1247, "step_time": 4.051188478995755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2384536610916257, "epoch": 0.01248, "grad_norm": 0.003013358451426029, "kl": 0.543424054980278, "learning_rate": 9.99932019373899e-06, "loss": -0.0174, "step": 1248, "step_time": 2.523345595000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 96.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.052351620979607105, "epoch": 0.01249, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047318165889009833, "kl": 0.48361457884311676, "learning_rate": 9.99931907151766e-06, "loss": 0.0012, "num_tokens": 11312007.0, "reward": 0.5115385055541992, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5115385055541992, "rewards/rollout_reward_func/std": 0.1183328777551651, "sampling/importance_sampling_ratio/max": 1.002604603767395, "sampling/importance_sampling_ratio/mean": 0.9992194175720215, "sampling/importance_sampling_ratio/min": 0.9973509907722473, "sampling/sampling_logp_difference/max": 0.0026682578027248383, "sampling/sampling_logp_difference/mean": 0.00067562994081527, "step": 1249, "step_time": 3.7652820849980344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05221997480839491, "epoch": 0.0125, "grad_norm": 0.000464199751149863, "kl": 0.4836362935602665, "learning_rate": 9.999317948370898e-06, "loss": 0.0012, "step": 1250, "step_time": 1.9929681560097379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.038551717530936, "epoch": 0.01251, "frac_reward_zero_std": 1.0, "grad_norm": 0.00036765309050679207, "kl": 0.4904016964137554, "learning_rate": 9.999316824298703e-06, "loss": 0.0016, "num_tokens": 11328927.0, "reward": 0.8601154088973999, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8601154088973999, "rewards/rollout_reward_func/std": 0.3044252097606659, "sampling/importance_sampling_ratio/max": 1.0045268535614014, "sampling/importance_sampling_ratio/mean": 1.0006622076034546, "sampling/importance_sampling_ratio/min": 0.996553361415863, "sampling/sampling_logp_difference/max": 0.00658043660223484, "sampling/sampling_logp_difference/mean": 0.000994263682514429, "step": 1251, "step_time": 4.5987534830055665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.038406705018132925, "epoch": 0.01252, "grad_norm": 0.00037262114346958697, "kl": 0.49042249098420143, "learning_rate": 9.999315699301079e-06, "loss": 0.0016, "step": 1252, "step_time": 2.045840881015465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 144.90625, "completions/mean_terminated_length": 144.90625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19814115995541215, "epoch": 0.01253, "frac_reward_zero_std": 0.75, "grad_norm": 0.009008003398776054, "kl": 0.5664237812161446, "learning_rate": 9.999314573378024e-06, "loss": -0.0263, "num_tokens": 11344844.0, "reward": 0.7768750190734863, "reward_std": 0.0023116914089769125, "rewards/rollout_reward_func/mean": 0.7768750190734863, "rewards/rollout_reward_func/std": 0.06001397594809532, "sampling/importance_sampling_ratio/max": 1.0035489797592163, "sampling/importance_sampling_ratio/mean": 0.9694851636886597, "sampling/importance_sampling_ratio/min": 0.026363302022218704, "sampling/sampling_logp_difference/max": 1.2587270736694336, "sampling/sampling_logp_difference/mean": 0.02012198232114315, "step": 1253, "step_time": 4.179800078010885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19754128716886044, "epoch": 0.01254, "grad_norm": 0.008823786862194538, "kl": 0.5560305826365948, "learning_rate": 9.999313446529542e-06, "loss": -0.0264, "step": 1254, "step_time": 2.9837540709995665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 163.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6404608450829983, "epoch": 0.01255, "frac_reward_zero_std": 0.5, "grad_norm": 0.018343277275562286, "kl": 0.7723449617624283, "learning_rate": 9.999312318755627e-06, "loss": -0.0554, "num_tokens": 11361346.0, "reward": 0.8971394300460815, "reward_std": 0.0163586363196373, "rewards/rollout_reward_func/mean": 0.8971394300460815, "rewards/rollout_reward_func/std": 0.2708938717842102, "sampling/importance_sampling_ratio/max": 1.0031856298446655, "sampling/importance_sampling_ratio/mean": 0.9324294328689575, "sampling/importance_sampling_ratio/min": 3.004992782384974e-11, "sampling/sampling_logp_difference/max": 14.430838584899902, "sampling/sampling_logp_difference/mean": 0.18351978063583374, "step": 1255, "step_time": 4.391620734008029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6394181912764907, "epoch": 0.01256, "grad_norm": 0.018996044993400574, "kl": 0.7681060433387756, "learning_rate": 9.999311190056283e-06, "loss": -0.0554, "step": 1256, "step_time": 2.050531510991277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.03272239747457206, "epoch": 0.01257, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037456475547514856, "kl": 0.3921767473220825, "learning_rate": 9.999310060431508e-06, "loss": 0.0016, "num_tokens": 11379522.0, "reward": 0.6871538162231445, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6871538162231445, "rewards/rollout_reward_func/std": 0.4062528610229492, "sampling/importance_sampling_ratio/max": 1.004715085029602, "sampling/importance_sampling_ratio/mean": 1.0000927448272705, "sampling/importance_sampling_ratio/min": 0.9971713423728943, "sampling/sampling_logp_difference/max": 0.004610273987054825, "sampling/sampling_logp_difference/mean": 0.0007355811540037394, "step": 1257, "step_time": 4.7215281590033555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03289700928144157, "epoch": 0.01258, "grad_norm": 0.00037572297151200473, "kl": 0.3921364024281502, "learning_rate": 9.999308929881305e-06, "loss": 0.0016, "step": 1258, "step_time": 2.0089661050078575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.11468749912455678, "epoch": 0.01259, "frac_reward_zero_std": 0.75, "grad_norm": 0.05746515467762947, "kl": 0.7955443561077118, "learning_rate": 9.999307798405675e-06, "loss": 0.0292, "num_tokens": 11394582.0, "reward": 0.7659134864807129, "reward_std": 0.01128651387989521, "rewards/rollout_reward_func/mean": 0.7659134864807129, "rewards/rollout_reward_func/std": 0.3085345923900604, "sampling/importance_sampling_ratio/max": 1.0031055212020874, "sampling/importance_sampling_ratio/mean": 0.9711127281188965, "sampling/importance_sampling_ratio/min": 0.09328510612249374, "sampling/sampling_logp_difference/max": 2.308931827545166, "sampling/sampling_logp_difference/mean": 0.015882689505815506, "step": 1259, "step_time": 3.968607817987504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11498669674620032, "epoch": 0.0126, "grad_norm": 0.05715968459844589, "kl": 0.7694655023515224, "learning_rate": 9.999306666004616e-06, "loss": 0.029, "step": 1260, "step_time": 2.966998431002139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.04143729852512479, "epoch": 0.01261, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003968401288148016, "kl": 0.4934490695595741, "learning_rate": 9.999305532678127e-06, "loss": 0.0016, "num_tokens": 11410838.0, "reward": 0.806538462638855, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.806538462638855, "rewards/rollout_reward_func/std": 0.16548851132392883, "sampling/importance_sampling_ratio/max": 1.0029795169830322, "sampling/importance_sampling_ratio/mean": 0.9986473321914673, "sampling/importance_sampling_ratio/min": 0.9937781095504761, "sampling/sampling_logp_difference/max": 0.0024880096316337585, "sampling/sampling_logp_difference/mean": 0.0006823863368481398, "step": 1261, "step_time": 4.275184601006913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04295240854844451, "epoch": 0.01262, "grad_norm": 0.0004208639729768038, "kl": 0.4931834079325199, "learning_rate": 9.999304398426211e-06, "loss": 0.0016, "step": 1262, "step_time": 2.024389849982981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08848772943019867, "epoch": 0.01263, "frac_reward_zero_std": 1.0, "grad_norm": 0.004554188810288906, "kl": 0.4273051396012306, "learning_rate": 9.999303263248869e-06, "loss": 0.0016, "num_tokens": 11428315.0, "reward": 1.0686923265457153, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0686923265457153, "rewards/rollout_reward_func/std": 0.40644675493240356, "sampling/importance_sampling_ratio/max": 1.0017834901809692, "sampling/importance_sampling_ratio/mean": 0.9675872325897217, "sampling/importance_sampling_ratio/min": 2.1530861538110457e-09, "sampling/sampling_logp_difference/max": 17.95595359802246, "sampling/sampling_logp_difference/mean": 0.08971498906612396, "step": 1263, "step_time": 4.565782868005044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08487050933763385, "epoch": 0.01264, "grad_norm": 0.004225416108965874, "kl": 0.4274573400616646, "learning_rate": 9.999302127146098e-06, "loss": 0.0016, "step": 1264, "step_time": 2.0789622759984923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.03706286568194628, "epoch": 0.01265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003996244922745973, "kl": 0.4417739436030388, "learning_rate": 9.999300990117899e-06, "loss": 0.0015, "num_tokens": 11445379.0, "reward": 0.6178076863288879, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6178076863288879, "rewards/rollout_reward_func/std": 0.27588483691215515, "sampling/importance_sampling_ratio/max": 1.001064419746399, "sampling/importance_sampling_ratio/mean": 0.9973991513252258, "sampling/importance_sampling_ratio/min": 0.994462251663208, "sampling/sampling_logp_difference/max": 0.0026203040033578873, "sampling/sampling_logp_difference/mean": 0.0007133909384720027, "step": 1265, "step_time": 4.842973239989078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03808040404692292, "epoch": 0.01266, "grad_norm": 0.00041330239037051797, "kl": 0.4416045621037483, "learning_rate": 9.999299852164274e-06, "loss": 0.0015, "step": 1266, "step_time": 2.5508874770021066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4799551139585674, "epoch": 0.01267, "frac_reward_zero_std": 0.5, "grad_norm": 0.03323731943964958, "kl": 0.6822002455592155, "learning_rate": 9.999298713285224e-06, "loss": -0.0185, "num_tokens": 11461179.0, "reward": 0.7523124814033508, "reward_std": 0.016358640044927597, "rewards/rollout_reward_func/mean": 0.7523124814033508, "rewards/rollout_reward_func/std": 0.06354076415300369, "sampling/importance_sampling_ratio/max": 1.0034946203231812, "sampling/importance_sampling_ratio/mean": 0.938101053237915, "sampling/importance_sampling_ratio/min": 0.0023720422759652138, "sampling/sampling_logp_difference/max": 1.960850715637207, "sampling/sampling_logp_difference/mean": 0.054930806159973145, "step": 1267, "step_time": 4.452598380994459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4751684698276222, "epoch": 0.01268, "grad_norm": 0.03393355384469032, "kl": 0.6725486814975739, "learning_rate": 9.999297573480746e-06, "loss": -0.0185, "step": 1268, "step_time": 2.0632983109971974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 168.5625, "completions/mean_terminated_length": 168.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2613639631308615, "epoch": 0.01269, "frac_reward_zero_std": 0.75, "grad_norm": 0.02660522609949112, "kl": 0.5679017528891563, "learning_rate": 9.999296432750842e-06, "loss": -0.0149, "num_tokens": 11477885.0, "reward": 0.7717019319534302, "reward_std": 0.009110790677368641, "rewards/rollout_reward_func/mean": 0.7717019319534302, "rewards/rollout_reward_func/std": 0.41512367129325867, "sampling/importance_sampling_ratio/max": 1.0051696300506592, "sampling/importance_sampling_ratio/mean": 0.9466469287872314, "sampling/importance_sampling_ratio/min": 0.03525248169898987, "sampling/sampling_logp_difference/max": 1.237603783607483, "sampling/sampling_logp_difference/mean": 0.024609634652733803, "step": 1269, "step_time": 4.369471233992954 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.2604563827626407, "epoch": 0.0127, "grad_norm": 0.01567414402961731, "kl": 0.5514759533107281, "learning_rate": 9.999295291095512e-06, "loss": -0.0149, "step": 1270, "step_time": 2.011538486003701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.06472526350989938, "epoch": 0.01271, "frac_reward_zero_std": 1.0, "grad_norm": 0.000608273025136441, "kl": 0.48318323493003845, "learning_rate": 9.999294148514757e-06, "loss": 0.0013, "num_tokens": 11492869.0, "reward": 0.7450000047683716, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7450000047683716, "rewards/rollout_reward_func/std": 0.15006113052368164, "sampling/importance_sampling_ratio/max": 1.0033786296844482, "sampling/importance_sampling_ratio/mean": 0.997002124786377, "sampling/importance_sampling_ratio/min": 0.9900893568992615, "sampling/sampling_logp_difference/max": 0.006472501903772354, "sampling/sampling_logp_difference/mean": 0.001048292382620275, "step": 1271, "step_time": 4.98468138800672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06837596325203776, "epoch": 0.01272, "grad_norm": 0.0006608310504816473, "kl": 0.4825228080153465, "learning_rate": 9.999293005008579e-06, "loss": 0.0013, "step": 1272, "step_time": 2.01439106700127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 145.9375, "completions/mean_terminated_length": 145.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4476079228334129, "epoch": 0.01273, "frac_reward_zero_std": 0.75, "grad_norm": 0.005550411529839039, "kl": 0.4288676157593727, "learning_rate": 9.999291860576973e-06, "loss": -0.0177, "num_tokens": 11508843.0, "reward": 0.5054519176483154, "reward_std": 0.03603525087237358, "rewards/rollout_reward_func/mean": 0.5054519176483154, "rewards/rollout_reward_func/std": 0.2092372477054596, "sampling/importance_sampling_ratio/max": 1.0030913352966309, "sampling/importance_sampling_ratio/mean": 0.966032862663269, "sampling/importance_sampling_ratio/min": 1.458261614521132e-26, "sampling/sampling_logp_difference/max": 4.375275611877441, "sampling/sampling_logp_difference/mean": 0.24863716959953308, "step": 1273, "step_time": 4.596880240002065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44741020118817687, "epoch": 0.01274, "grad_norm": 0.0054478212259709835, "kl": 0.4269736409187317, "learning_rate": 9.999290715219945e-06, "loss": -0.0177, "step": 1274, "step_time": 2.0141013779939385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 90.5625, "completions/mean_terminated_length": 90.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9716956224292517, "epoch": 0.01275, "frac_reward_zero_std": 0.5, "grad_norm": 0.013021862134337425, "kl": 0.6956874206662178, "learning_rate": 9.999289568937491e-06, "loss": -0.0233, "num_tokens": 11522973.0, "reward": 0.7040865421295166, "reward_std": 0.08414476364850998, "rewards/rollout_reward_func/mean": 0.7040865421295166, "rewards/rollout_reward_func/std": 0.22845810651779175, "sampling/importance_sampling_ratio/max": 1.0039337873458862, "sampling/importance_sampling_ratio/mean": 0.8705209493637085, "sampling/importance_sampling_ratio/min": 0.00196604966185987, "sampling/sampling_logp_difference/max": 2.2308812141418457, "sampling/sampling_logp_difference/mean": 0.16772174835205078, "step": 1275, "step_time": 4.02600424800039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9861187199130654, "epoch": 0.01276, "grad_norm": 0.011719112284481525, "kl": 0.6763712465763092, "learning_rate": 9.999288421729613e-06, "loss": -0.0234, "step": 1276, "step_time": 2.4950275519877323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18048338498920202, "epoch": 0.01277, "frac_reward_zero_std": 0.75, "grad_norm": 0.008356532081961632, "kl": 0.46980273351073265, "learning_rate": 9.999287273596313e-06, "loss": -0.0267, "num_tokens": 11540609.0, "reward": 1.0925673246383667, "reward_std": 0.03222775459289551, "rewards/rollout_reward_func/mean": 1.0925673246383667, "rewards/rollout_reward_func/std": 0.24839816987514496, "sampling/importance_sampling_ratio/max": 1.0012141466140747, "sampling/importance_sampling_ratio/mean": 0.9662699699401855, "sampling/importance_sampling_ratio/min": 0.02329881116747856, "sampling/sampling_logp_difference/max": 1.9261858463287354, "sampling/sampling_logp_difference/mean": 0.018195826560258865, "step": 1277, "step_time": 5.06327397601126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1842493717558682, "epoch": 0.01278, "grad_norm": 0.008992123417556286, "kl": 0.4661937728524208, "learning_rate": 9.999286124537588e-06, "loss": -0.0267, "step": 1278, "step_time": 2.02969202199165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3128381152637303, "epoch": 0.01279, "frac_reward_zero_std": 0.75, "grad_norm": 0.012113077566027641, "kl": 0.46890221163630486, "learning_rate": 9.999284974553441e-06, "loss": 0.0206, "num_tokens": 11556909.0, "reward": 0.6330962181091309, "reward_std": 0.020397312939167023, "rewards/rollout_reward_func/mean": 0.6330962181091309, "rewards/rollout_reward_func/std": 0.17011658847332, "sampling/importance_sampling_ratio/max": 0.9993257522583008, "sampling/importance_sampling_ratio/mean": 0.964759111404419, "sampling/importance_sampling_ratio/min": 0.0030877061653882265, "sampling/sampling_logp_difference/max": 1.718753695487976, "sampling/sampling_logp_difference/mean": 0.030519433319568634, "step": 1279, "step_time": 4.400817585999903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31188040133565664, "epoch": 0.0128, "grad_norm": 0.011623096652328968, "kl": 0.4643840044736862, "learning_rate": 9.99928382364387e-06, "loss": 0.0206, "step": 1280, "step_time": 2.0065381590029574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21457201894372702, "epoch": 0.01281, "frac_reward_zero_std": 0.75, "grad_norm": 0.0699269101023674, "kl": 0.8113092444837093, "learning_rate": 9.999282671808878e-06, "loss": -0.0262, "num_tokens": 11572817.0, "reward": 0.8859134912490845, "reward_std": 0.0431063212454319, "rewards/rollout_reward_func/mean": 0.8859134912490845, "rewards/rollout_reward_func/std": 0.24299873411655426, "sampling/importance_sampling_ratio/max": 1.0060498714447021, "sampling/importance_sampling_ratio/mean": 0.9690951704978943, "sampling/importance_sampling_ratio/min": 0.002561582252383232, "sampling/sampling_logp_difference/max": 2.2307639122009277, "sampling/sampling_logp_difference/mean": 0.03230401128530502, "step": 1281, "step_time": 4.247797586016532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21598965488374233, "epoch": 0.01282, "grad_norm": 0.06936129182577133, "kl": 0.8107188828289509, "learning_rate": 9.999281519048462e-06, "loss": -0.0262, "step": 1282, "step_time": 2.454946564990678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 164.03125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.2034504553303123, "epoch": 0.01283, "frac_reward_zero_std": 0.0, "grad_norm": 0.062414225190877914, "kl": 0.6023984141647816, "learning_rate": 9.999280365362623e-06, "loss": -0.0582, "num_tokens": 11589266.0, "reward": 0.7935658693313599, "reward_std": 0.12812639772891998, "rewards/rollout_reward_func/mean": 0.7935658693313599, "rewards/rollout_reward_func/std": 0.31235283613204956, "sampling/importance_sampling_ratio/max": 0.9992522597312927, "sampling/importance_sampling_ratio/mean": 0.8741808533668518, "sampling/importance_sampling_ratio/min": 4.8578126893776607e-32, "sampling/sampling_logp_difference/max": 3.6665773391723633, "sampling/sampling_logp_difference/mean": 0.4251552224159241, "step": 1283, "step_time": 5.323260452991235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1905957283452153, "epoch": 0.01284, "grad_norm": 0.06557829678058624, "kl": 0.5934944674372673, "learning_rate": 9.999279210751366e-06, "loss": -0.0584, "step": 1284, "step_time": 2.0686206919926917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 227.0625, "completions/mean_terminated_length": 227.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7987079760059714, "epoch": 0.01285, "frac_reward_zero_std": 0.5, "grad_norm": 0.03505060821771622, "kl": 0.4348960556089878, "learning_rate": 9.999278055214684e-06, "loss": -0.0694, "num_tokens": 11607620.0, "reward": 0.8940730690956116, "reward_std": 0.02867359295487404, "rewards/rollout_reward_func/mean": 0.8940730690956116, "rewards/rollout_reward_func/std": 0.3795839548110962, "sampling/importance_sampling_ratio/max": 1.002026081085205, "sampling/importance_sampling_ratio/mean": 0.8752467036247253, "sampling/importance_sampling_ratio/min": 4.523766397630028e-17, "sampling/sampling_logp_difference/max": 4.333621978759766, "sampling/sampling_logp_difference/mean": 0.22078928351402283, "step": 1285, "step_time": 4.754138121003052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7917940854094923, "epoch": 0.01286, "grad_norm": 0.037475988268852234, "kl": 0.42974161729216576, "learning_rate": 9.999276898752583e-06, "loss": -0.0693, "step": 1286, "step_time": 2.108399917007773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.7231459356844425, "epoch": 0.01287, "frac_reward_zero_std": 0.25, "grad_norm": 0.031143054366111755, "kl": 0.570008672773838, "learning_rate": 9.99927574136506e-06, "loss": -0.0546, "num_tokens": 11623264.0, "reward": 0.7625672817230225, "reward_std": 0.09641128033399582, "rewards/rollout_reward_func/mean": 0.7625672817230225, "rewards/rollout_reward_func/std": 0.12007791548967361, "sampling/importance_sampling_ratio/max": 1.0040040016174316, "sampling/importance_sampling_ratio/mean": 0.9082813858985901, "sampling/importance_sampling_ratio/min": 0.0006620291969738901, "sampling/sampling_logp_difference/max": 2.669934034347534, "sampling/sampling_logp_difference/mean": 0.10966402292251587, "step": 1287, "step_time": 4.062634117006382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7216344494372606, "epoch": 0.01288, "grad_norm": 0.03163834288716316, "kl": 0.570501122623682, "learning_rate": 9.999274583052117e-06, "loss": -0.0546, "step": 1288, "step_time": 2.47368278898648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 146.1875, "completions/mean_terminated_length": 146.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27651628851890564, "epoch": 0.01289, "frac_reward_zero_std": 0.75, "grad_norm": 0.018666399642825127, "kl": 0.41108809411525726, "learning_rate": 9.999273423813754e-06, "loss": -0.017, "num_tokens": 11638814.0, "reward": 0.572211503982544, "reward_std": 0.0013598214136436582, "rewards/rollout_reward_func/mean": 0.572211503982544, "rewards/rollout_reward_func/std": 0.3078639507293701, "sampling/importance_sampling_ratio/max": 0.9992583990097046, "sampling/importance_sampling_ratio/mean": 0.9652523994445801, "sampling/importance_sampling_ratio/min": 0.021951591596007347, "sampling/sampling_logp_difference/max": 1.2562401294708252, "sampling/sampling_logp_difference/mean": 0.02993243932723999, "step": 1289, "step_time": 4.615533914999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2610416179522872, "epoch": 0.0129, "grad_norm": 0.018947433680295944, "kl": 0.4109705500304699, "learning_rate": 9.99927226364997e-06, "loss": -0.017, "step": 1290, "step_time": 2.0010559519942035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 155.90625, "completions/mean_terminated_length": 155.90625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7506033242680132, "epoch": 0.01291, "frac_reward_zero_std": 0.25, "grad_norm": 0.047324810177087784, "kl": 0.5619952045381069, "learning_rate": 9.999271102560767e-06, "loss": -0.011, "num_tokens": 11654851.0, "reward": 0.7593461275100708, "reward_std": 0.044874079525470734, "rewards/rollout_reward_func/mean": 0.7593461275100708, "rewards/rollout_reward_func/std": 0.4363458454608917, "sampling/importance_sampling_ratio/max": 1.006218671798706, "sampling/importance_sampling_ratio/mean": 0.9108190536499023, "sampling/importance_sampling_ratio/min": 2.1188668597460492e-07, "sampling/sampling_logp_difference/max": 3.9241809844970703, "sampling/sampling_logp_difference/mean": 0.1824510544538498, "step": 1291, "step_time": 4.221031810004206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7525098789483309, "epoch": 0.01292, "grad_norm": 0.04704195261001587, "kl": 0.5610715001821518, "learning_rate": 9.999269940546145e-06, "loss": -0.011, "step": 1292, "step_time": 2.028258356011065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7220657579600811, "epoch": 0.01293, "frac_reward_zero_std": 0.25, "grad_norm": 0.05894298478960991, "kl": 0.7305177710950375, "learning_rate": 9.999268777606102e-06, "loss": -0.0652, "num_tokens": 11670747.0, "reward": 0.5154807567596436, "reward_std": 0.21034827828407288, "rewards/rollout_reward_func/mean": 0.5154807567596436, "rewards/rollout_reward_func/std": 0.3429405689239502, "sampling/importance_sampling_ratio/max": 0.9994614124298096, "sampling/importance_sampling_ratio/mean": 0.8798926472663879, "sampling/importance_sampling_ratio/min": 0.004689632914960384, "sampling/sampling_logp_difference/max": 1.8231682777404785, "sampling/sampling_logp_difference/mean": 0.09295858442783356, "step": 1293, "step_time": 4.676635049996548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.723910640925169, "epoch": 0.01294, "grad_norm": 0.05369097366929054, "kl": 0.7453597784042358, "learning_rate": 9.999267613740642e-06, "loss": -0.0653, "step": 1294, "step_time": 2.009367974977067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 139.9375, "completions/mean_terminated_length": 139.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.727079389616847, "epoch": 0.01295, "frac_reward_zero_std": 0.5, "grad_norm": 0.04598722234368324, "kl": 0.45561081171035767, "learning_rate": 9.999266448949762e-06, "loss": -0.0521, "num_tokens": 11686129.0, "reward": 0.7117307186126709, "reward_std": 0.03628195449709892, "rewards/rollout_reward_func/mean": 0.7117307186126709, "rewards/rollout_reward_func/std": 0.24720579385757446, "sampling/importance_sampling_ratio/max": 1.0020992755889893, "sampling/importance_sampling_ratio/mean": 0.8775624632835388, "sampling/importance_sampling_ratio/min": 0.007174880709499121, "sampling/sampling_logp_difference/max": 1.8772859573364258, "sampling/sampling_logp_difference/mean": 0.09484173357486725, "step": 1295, "step_time": 4.556713084006333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7162381438538432, "epoch": 0.01296, "grad_norm": 0.04470881447196007, "kl": 0.46169329434633255, "learning_rate": 9.999265283233466e-06, "loss": -0.0522, "step": 1296, "step_time": 2.0021107730135554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 70.75, "completions/mean_terminated_length": 70.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3763555195182562, "epoch": 0.01297, "frac_reward_zero_std": 0.25, "grad_norm": 0.03733229637145996, "kl": 0.7244555354118347, "learning_rate": 9.99926411659175e-06, "loss": -0.0239, "num_tokens": 11699793.0, "reward": 0.7370432615280151, "reward_std": 0.09267513453960419, "rewards/rollout_reward_func/mean": 0.7370432615280151, "rewards/rollout_reward_func/std": 0.1291072517633438, "sampling/importance_sampling_ratio/max": 0.9985619783401489, "sampling/importance_sampling_ratio/mean": 0.8358238935470581, "sampling/importance_sampling_ratio/min": 0.006413630675524473, "sampling/sampling_logp_difference/max": 2.1153347492218018, "sampling/sampling_logp_difference/mean": 0.2365667223930359, "step": 1297, "step_time": 3.8630477219994646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3991561103612185, "epoch": 0.01298, "grad_norm": 0.034402038902044296, "kl": 0.7417536191642284, "learning_rate": 9.999262949024617e-06, "loss": -0.024, "step": 1298, "step_time": 2.0065536470065126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.29422425385564566, "epoch": 0.01299, "frac_reward_zero_std": 0.75, "grad_norm": 0.013984274119138718, "kl": 0.5495308488607407, "learning_rate": 9.999261780532066e-06, "loss": -0.0174, "num_tokens": 11715072.0, "reward": 0.9103077054023743, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.9103077054023743, "rewards/rollout_reward_func/std": 0.34617045521736145, "sampling/importance_sampling_ratio/max": 1.0032687187194824, "sampling/importance_sampling_ratio/mean": 0.9638819694519043, "sampling/importance_sampling_ratio/min": 0.01736447960138321, "sampling/sampling_logp_difference/max": 1.080897331237793, "sampling/sampling_logp_difference/mean": 0.032573431730270386, "step": 1299, "step_time": 4.58648933499353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29671358317136765, "epoch": 0.013, "grad_norm": 0.012687320820987225, "kl": 0.5512314066290855, "learning_rate": 9.999260611114098e-06, "loss": -0.0174, "step": 1300, "step_time": 2.467985619994579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.051078492775559425, "epoch": 0.01301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005929034086875618, "kl": 0.42518704384565353, "learning_rate": 9.999259440770715e-06, "loss": 0.0016, "num_tokens": 11732408.0, "reward": 0.7478076815605164, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7478076815605164, "rewards/rollout_reward_func/std": 0.317525714635849, "sampling/importance_sampling_ratio/max": 1.0050745010375977, "sampling/importance_sampling_ratio/mean": 0.9978852272033691, "sampling/importance_sampling_ratio/min": 0.9927322268486023, "sampling/sampling_logp_difference/max": 0.0044958144426345825, "sampling/sampling_logp_difference/mean": 0.000996569637209177, "step": 1301, "step_time": 4.314879794008448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04953324096277356, "epoch": 0.01302, "grad_norm": 0.0005727360839955509, "kl": 0.4254228062927723, "learning_rate": 9.999258269501912e-06, "loss": 0.0016, "step": 1302, "step_time": 2.039455184007238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 215.21875, "completions/mean_terminated_length": 215.21875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6909585651010275, "epoch": 0.01303, "frac_reward_zero_std": 0.5, "grad_norm": 0.009494687430560589, "kl": 0.45928776636719704, "learning_rate": 9.999257097307696e-06, "loss": -0.0646, "num_tokens": 11750519.0, "reward": 0.9116394519805908, "reward_std": 0.03974756598472595, "rewards/rollout_reward_func/mean": 0.9116394519805908, "rewards/rollout_reward_func/std": 0.36208227276802063, "sampling/importance_sampling_ratio/max": 1.0032148361206055, "sampling/importance_sampling_ratio/mean": 0.9035483002662659, "sampling/importance_sampling_ratio/min": 1.5542214867533821e-12, "sampling/sampling_logp_difference/max": 3.4693245887756348, "sampling/sampling_logp_difference/mean": 0.1564265340566635, "step": 1303, "step_time": 4.618917405008688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6950715845450759, "epoch": 0.01304, "grad_norm": 0.008440546691417694, "kl": 0.47050773724913597, "learning_rate": 9.999255924188063e-06, "loss": -0.0646, "step": 1304, "step_time": 2.013287432011566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 166.6875, "completions/mean_terminated_length": 166.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2356631592847407, "epoch": 0.01305, "frac_reward_zero_std": 0.75, "grad_norm": 0.004198180977255106, "kl": 0.5429203137755394, "learning_rate": 9.999254750143012e-06, "loss": -0.0172, "num_tokens": 11766533.0, "reward": 0.4482884705066681, "reward_std": 0.020397311076521873, "rewards/rollout_reward_func/mean": 0.4482884705066681, "rewards/rollout_reward_func/std": 0.1981276422739029, "sampling/importance_sampling_ratio/max": 1.0011836290359497, "sampling/importance_sampling_ratio/mean": 0.9673216342926025, "sampling/importance_sampling_ratio/min": 0.01738257333636284, "sampling/sampling_logp_difference/max": 2.036444664001465, "sampling/sampling_logp_difference/mean": 0.026019785553216934, "step": 1305, "step_time": 4.5339935190058895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2361466269940138, "epoch": 0.01306, "grad_norm": 0.003493096213787794, "kl": 0.557588554918766, "learning_rate": 9.99925357517255e-06, "loss": -0.0172, "step": 1306, "step_time": 2.468658403995505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 154.15625, "completions/mean_terminated_length": 154.15625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.6461389726027846, "epoch": 0.01307, "frac_reward_zero_std": 0.75, "grad_norm": 0.0535661019384861, "kl": 0.6469090729951859, "learning_rate": 9.999252399276669e-06, "loss": 0.0141, "num_tokens": 11782426.0, "reward": 0.7689135074615479, "reward_std": 0.024839669466018677, "rewards/rollout_reward_func/mean": 0.7689135074615479, "rewards/rollout_reward_func/std": 0.298496812582016, "sampling/importance_sampling_ratio/max": 1.0011019706726074, "sampling/importance_sampling_ratio/mean": 0.908641517162323, "sampling/importance_sampling_ratio/min": 2.0689545635832474e-05, "sampling/sampling_logp_difference/max": 2.3695168495178223, "sampling/sampling_logp_difference/mean": 0.10037582367658615, "step": 1307, "step_time": 4.156307684010244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.652153268456459, "epoch": 0.01308, "grad_norm": 0.05777965113520622, "kl": 0.6511204950511456, "learning_rate": 9.999251222455376e-06, "loss": 0.0141, "step": 1308, "step_time": 2.0253215320044546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 245.6875, "completions/mean_terminated_length": 245.6875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6509340247139335, "epoch": 0.01309, "frac_reward_zero_std": 0.5, "grad_norm": 0.05387648567557335, "kl": 0.7005035392940044, "learning_rate": 9.999250044708666e-06, "loss": -0.0565, "num_tokens": 11801240.0, "reward": 0.9965283870697021, "reward_std": 0.040655918419361115, "rewards/rollout_reward_func/mean": 0.9965283870697021, "rewards/rollout_reward_func/std": 0.4395962059497833, "sampling/importance_sampling_ratio/max": 1.000552773475647, "sampling/importance_sampling_ratio/mean": 0.9037533402442932, "sampling/importance_sampling_ratio/min": 3.349004629171759e-09, "sampling/sampling_logp_difference/max": 10.544958114624023, "sampling/sampling_logp_difference/mean": 0.17927321791648865, "step": 1309, "step_time": 4.760324481008865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.649867620319128, "epoch": 0.0131, "grad_norm": 0.045154470950365067, "kl": 0.6647818461060524, "learning_rate": 9.999248866036543e-06, "loss": -0.0567, "step": 1310, "step_time": 2.052830881009868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 186.84375, "completions/mean_terminated_length": 186.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6549588688649237, "epoch": 0.01311, "frac_reward_zero_std": 0.5, "grad_norm": 0.017483720555901527, "kl": 0.487897589802742, "learning_rate": 9.999247686439005e-06, "loss": -0.0672, "num_tokens": 11818387.0, "reward": 0.5681154131889343, "reward_std": 0.08361306041479111, "rewards/rollout_reward_func/mean": 0.5681154131889343, "rewards/rollout_reward_func/std": 0.4468555450439453, "sampling/importance_sampling_ratio/max": 1.0045764446258545, "sampling/importance_sampling_ratio/mean": 0.905582070350647, "sampling/importance_sampling_ratio/min": 0.001776637858711183, "sampling/sampling_logp_difference/max": 2.325226068496704, "sampling/sampling_logp_difference/mean": 0.07978974282741547, "step": 1311, "step_time": 4.980236396986584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6630033547990024, "epoch": 0.01312, "grad_norm": 0.017017453908920288, "kl": 0.4907609708607197, "learning_rate": 9.999246505916055e-06, "loss": -0.0672, "step": 1312, "step_time": 2.4769956920063123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05234120925888419, "epoch": 0.01313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005347445257939398, "kl": 0.44472306221723557, "learning_rate": 9.99924532446769e-06, "loss": 0.0014, "num_tokens": 11834211.0, "reward": 0.6688461303710938, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6688461303710938, "rewards/rollout_reward_func/std": 0.2575201392173767, "sampling/importance_sampling_ratio/max": 1.0041334629058838, "sampling/importance_sampling_ratio/mean": 0.9984411001205444, "sampling/importance_sampling_ratio/min": 0.9907696843147278, "sampling/sampling_logp_difference/max": 0.0044445740059018135, "sampling/sampling_logp_difference/mean": 0.0011071200715377927, "step": 1313, "step_time": 4.051188111989177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05306385178118944, "epoch": 0.01314, "grad_norm": 0.0005458735395222902, "kl": 0.44460633397102356, "learning_rate": 9.999244142093913e-06, "loss": 0.0014, "step": 1314, "step_time": 2.013289378010086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 208.84375, "completions/mean_terminated_length": 208.84375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.8066281089559197, "epoch": 0.01315, "frac_reward_zero_std": 0.0, "grad_norm": 0.08321329951286316, "kl": 0.6464288011193275, "learning_rate": 9.999242958794724e-06, "loss": 0.0068, "num_tokens": 11851766.0, "reward": 0.499860554933548, "reward_std": 0.05503194406628609, "rewards/rollout_reward_func/mean": 0.499860554933548, "rewards/rollout_reward_func/std": 0.11962682008743286, "sampling/importance_sampling_ratio/max": 1.0027421712875366, "sampling/importance_sampling_ratio/mean": 0.8778905868530273, "sampling/importance_sampling_ratio/min": 2.389971276439695e-10, "sampling/sampling_logp_difference/max": 2.750720500946045, "sampling/sampling_logp_difference/mean": 0.18934404850006104, "step": 1315, "step_time": 4.5151926089965855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8078696732409298, "epoch": 0.01316, "grad_norm": 0.0843530148267746, "kl": 0.6284646615386009, "learning_rate": 9.999241774570122e-06, "loss": 0.0066, "step": 1316, "step_time": 2.5308803230072954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06623086798936129, "epoch": 0.01317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007329689105972648, "kl": 0.44935471937060356, "learning_rate": 9.999240589420108e-06, "loss": 0.0015, "num_tokens": 11868094.0, "reward": 0.450884610414505, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.450884610414505, "rewards/rollout_reward_func/std": 0.06583263725042343, "sampling/importance_sampling_ratio/max": 1.0057001113891602, "sampling/importance_sampling_ratio/mean": 0.9984897375106812, "sampling/importance_sampling_ratio/min": 0.992713212966919, "sampling/sampling_logp_difference/max": 0.004808738827705383, "sampling/sampling_logp_difference/mean": 0.0009190706769004464, "step": 1317, "step_time": 4.762564324984851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07097805151715875, "epoch": 0.01318, "grad_norm": 0.000802101450972259, "kl": 0.44853324070572853, "learning_rate": 9.999239403344681e-06, "loss": 0.0015, "step": 1318, "step_time": 2.0348554799857084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06524069141596556, "epoch": 0.01319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007747364579699934, "kl": 0.3943054899573326, "learning_rate": 9.999238216343842e-06, "loss": 0.0014, "num_tokens": 11885606.0, "reward": 0.6229230761528015, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6229230761528015, "rewards/rollout_reward_func/std": 0.2924079895019531, "sampling/importance_sampling_ratio/max": 1.0018413066864014, "sampling/importance_sampling_ratio/mean": 0.9955152869224548, "sampling/importance_sampling_ratio/min": 0.9905089735984802, "sampling/sampling_logp_difference/max": 0.00717267207801342, "sampling/sampling_logp_difference/mean": 0.0009615521994419396, "step": 1319, "step_time": 4.757057088987494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06855549849569798, "epoch": 0.0132, "grad_norm": 0.0008187766652554274, "kl": 0.3937304690480232, "learning_rate": 9.999237028417591e-06, "loss": 0.0014, "step": 1320, "step_time": 2.0037564060039585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9317051833495498, "epoch": 0.01321, "frac_reward_zero_std": 0.25, "grad_norm": 0.07586918771266937, "kl": 0.5527590401470661, "learning_rate": 9.999235839565933e-06, "loss": 0.0085, "num_tokens": 11903194.0, "reward": 0.6712067127227783, "reward_std": 0.07380010187625885, "rewards/rollout_reward_func/mean": 0.6712067127227783, "rewards/rollout_reward_func/std": 0.3106011748313904, "sampling/importance_sampling_ratio/max": 1.0035250186920166, "sampling/importance_sampling_ratio/mean": 0.8849591016769409, "sampling/importance_sampling_ratio/min": 3.2811954542300015e-11, "sampling/sampling_logp_difference/max": 11.458961486816406, "sampling/sampling_logp_difference/mean": 0.260027676820755, "step": 1321, "step_time": 4.609124626003904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.933549664914608, "epoch": 0.01322, "grad_norm": 0.06856736540794373, "kl": 0.5528660491108894, "learning_rate": 9.999234649788861e-06, "loss": 0.0082, "step": 1322, "step_time": 2.5238265579973813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1031043529510498, "epoch": 0.01323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009799805702641606, "kl": 0.5687473751604557, "learning_rate": 9.99923345908638e-06, "loss": 0.0015, "num_tokens": 11918370.0, "reward": 0.7939615249633789, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7939615249633789, "rewards/rollout_reward_func/std": 0.2937357425689697, "sampling/importance_sampling_ratio/max": 1.0020906925201416, "sampling/importance_sampling_ratio/mean": 0.9931386709213257, "sampling/importance_sampling_ratio/min": 0.9725257754325867, "sampling/sampling_logp_difference/max": 0.025823406875133514, "sampling/sampling_logp_difference/mean": 0.0025376668199896812, "step": 1323, "step_time": 4.717920845003391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10332404635846615, "epoch": 0.01324, "grad_norm": 0.0009492363897152245, "kl": 0.5687924884259701, "learning_rate": 9.999232267458488e-06, "loss": 0.0015, "step": 1324, "step_time": 2.115498603998276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 124.59375, "completions/mean_terminated_length": 124.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6363370297476649, "epoch": 0.01325, "frac_reward_zero_std": 0.25, "grad_norm": 0.09376170486211777, "kl": 0.6984387934207916, "learning_rate": 9.999231074905187e-06, "loss": -0.0591, "num_tokens": 11933669.0, "reward": 0.7180769443511963, "reward_std": 0.06910813599824905, "rewards/rollout_reward_func/mean": 0.7180769443511963, "rewards/rollout_reward_func/std": 0.2270122617483139, "sampling/importance_sampling_ratio/max": 0.9998301863670349, "sampling/importance_sampling_ratio/mean": 0.8829416036605835, "sampling/importance_sampling_ratio/min": 0.0009730057790875435, "sampling/sampling_logp_difference/max": 2.2971601486206055, "sampling/sampling_logp_difference/mean": 0.12101665139198303, "step": 1325, "step_time": 4.485896392005088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6362133184447885, "epoch": 0.01326, "grad_norm": 0.08544206619262695, "kl": 0.7157261520624161, "learning_rate": 9.999229881426476e-06, "loss": -0.0594, "step": 1326, "step_time": 2.033899279005709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 145.3125, "completions/mean_terminated_length": 145.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.13537881057709455, "epoch": 0.01327, "frac_reward_zero_std": 0.75, "grad_norm": 0.031385209411382675, "kl": 0.4489494003355503, "learning_rate": 9.999228687022356e-06, "loss": -0.0246, "num_tokens": 11949359.0, "reward": 0.8773605823516846, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.8773605823516846, "rewards/rollout_reward_func/std": 0.3331027626991272, "sampling/importance_sampling_ratio/max": 1.0021686553955078, "sampling/importance_sampling_ratio/mean": 0.969268798828125, "sampling/importance_sampling_ratio/min": 0.15182296931743622, "sampling/sampling_logp_difference/max": 1.764430046081543, "sampling/sampling_logp_difference/mean": 0.01148536428809166, "step": 1327, "step_time": 4.293529957998544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1313513796776533, "epoch": 0.01328, "grad_norm": 0.03064010851085186, "kl": 0.4557613804936409, "learning_rate": 9.999227491692825e-06, "loss": -0.0247, "step": 1328, "step_time": 2.500959741009865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 151.84375, "completions/mean_terminated_length": 151.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6260937759652734, "epoch": 0.01329, "frac_reward_zero_std": 0.75, "grad_norm": 0.013812687247991562, "kl": 0.6496467404067516, "learning_rate": 9.999226295437887e-06, "loss": -0.03, "num_tokens": 11965314.0, "reward": 0.8168302774429321, "reward_std": 0.021456705406308174, "rewards/rollout_reward_func/mean": 0.8168302774429321, "rewards/rollout_reward_func/std": 0.25876280665397644, "sampling/importance_sampling_ratio/max": 1.005192518234253, "sampling/importance_sampling_ratio/mean": 0.9023069143295288, "sampling/importance_sampling_ratio/min": 0.0005354344611987472, "sampling/sampling_logp_difference/max": 2.083009958267212, "sampling/sampling_logp_difference/mean": 0.13044987618923187, "step": 1329, "step_time": 4.923586313008855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6146269598975778, "epoch": 0.0133, "grad_norm": 0.015620307065546513, "kl": 0.6614680960774422, "learning_rate": 9.99922509825754e-06, "loss": -0.0299, "step": 1330, "step_time": 2.014601914997911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 170.21875, "completions/mean_terminated_length": 170.21875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.40788113651797175, "epoch": 0.01331, "frac_reward_zero_std": 0.75, "grad_norm": 0.010469703935086727, "kl": 0.607055801898241, "learning_rate": 9.999223900151786e-06, "loss": -0.0261, "num_tokens": 11981665.0, "reward": 0.861432671546936, "reward_std": 0.21959979832172394, "rewards/rollout_reward_func/mean": 0.861432671546936, "rewards/rollout_reward_func/std": 0.45803648233413696, "sampling/importance_sampling_ratio/max": 1.0023905038833618, "sampling/importance_sampling_ratio/mean": 0.9415366649627686, "sampling/importance_sampling_ratio/min": 5.319370757206343e-06, "sampling/sampling_logp_difference/max": 2.370485782623291, "sampling/sampling_logp_difference/mean": 0.08569689095020294, "step": 1331, "step_time": 4.2483374250005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40635170927271247, "epoch": 0.01332, "grad_norm": 0.010403476655483246, "kl": 0.6087285690009594, "learning_rate": 9.999222701120623e-06, "loss": -0.0261, "step": 1332, "step_time": 2.014900211994245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4311720421537757, "epoch": 0.01333, "frac_reward_zero_std": 0.75, "grad_norm": 0.020575227215886116, "kl": 0.5933473408222198, "learning_rate": 9.999221501164055e-06, "loss": -0.0097, "num_tokens": 11998461.0, "reward": 0.7923461198806763, "reward_std": 0.03026719205081463, "rewards/rollout_reward_func/mean": 0.7923461198806763, "rewards/rollout_reward_func/std": 0.1469733715057373, "sampling/importance_sampling_ratio/max": 1.0052353143692017, "sampling/importance_sampling_ratio/mean": 0.9373966455459595, "sampling/importance_sampling_ratio/min": 0.009200683794915676, "sampling/sampling_logp_difference/max": 2.1956820487976074, "sampling/sampling_logp_difference/mean": 0.05060049518942833, "step": 1333, "step_time": 4.94970794099936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4381307829171419, "epoch": 0.01334, "grad_norm": 0.018543794751167297, "kl": 0.6067352667450905, "learning_rate": 9.999220300282077e-06, "loss": -0.0097, "step": 1334, "step_time": 2.032575192009972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.04670982202515006, "epoch": 0.01335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005387832061387599, "kl": 0.4197981096804142, "learning_rate": 9.999219098474695e-06, "loss": 0.0015, "num_tokens": 12016349.0, "reward": 1.2118844985961914, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.2118844985961914, "rewards/rollout_reward_func/std": 0.25157904624938965, "sampling/importance_sampling_ratio/max": 1.0055474042892456, "sampling/importance_sampling_ratio/mean": 1.0012726783752441, "sampling/importance_sampling_ratio/min": 0.99580317735672, "sampling/sampling_logp_difference/max": 0.004027398303151131, "sampling/sampling_logp_difference/mean": 0.0012514678528532386, "step": 1335, "step_time": 4.971051824999449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.045299858786165714, "epoch": 0.01336, "grad_norm": 0.0005169525393284857, "kl": 0.4200350306928158, "learning_rate": 9.999217895741903e-06, "loss": 0.0015, "step": 1336, "step_time": 2.030689242994413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05266926297917962, "epoch": 0.01337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004329493676777929, "kl": 0.4936642348766327, "learning_rate": 9.999216692083707e-06, "loss": 0.0014, "num_tokens": 12031573.0, "reward": 0.5584615468978882, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5584615468978882, "rewards/rollout_reward_func/std": 0.21005530655384064, "sampling/importance_sampling_ratio/max": 1.0027903318405151, "sampling/importance_sampling_ratio/mean": 0.9980959892272949, "sampling/importance_sampling_ratio/min": 0.9951150417327881, "sampling/sampling_logp_difference/max": 0.005144350230693817, "sampling/sampling_logp_difference/mean": 0.000965076731517911, "step": 1337, "step_time": 4.2024823789906804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.051986947655677795, "epoch": 0.01338, "grad_norm": 0.0004310199583414942, "kl": 0.49377651885151863, "learning_rate": 9.999215487500103e-06, "loss": 0.0014, "step": 1338, "step_time": 1.9948313670101925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 176.78125, "completions/mean_terminated_length": 176.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5759980478323996, "epoch": 0.01339, "frac_reward_zero_std": 0.5, "grad_norm": 0.010307326912879944, "kl": 0.49363891780376434, "learning_rate": 9.999214281991096e-06, "loss": 0.0115, "num_tokens": 12048950.0, "reward": 0.899599552154541, "reward_std": 0.012683052569627762, "rewards/rollout_reward_func/mean": 0.899599552154541, "rewards/rollout_reward_func/std": 0.32372522354125977, "sampling/importance_sampling_ratio/max": 1.0006123781204224, "sampling/importance_sampling_ratio/mean": 0.9351847171783447, "sampling/importance_sampling_ratio/min": 6.795831309665398e-15, "sampling/sampling_logp_difference/max": 15.459518432617188, "sampling/sampling_logp_difference/mean": 0.1939167082309723, "step": 1339, "step_time": 5.160620510003355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5801120600663126, "epoch": 0.0134, "grad_norm": 0.010390887968242168, "kl": 0.4956454858183861, "learning_rate": 9.999213075556682e-06, "loss": 0.0115, "step": 1340, "step_time": 2.457390096009476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4818174382671714, "epoch": 0.01341, "frac_reward_zero_std": 0.75, "grad_norm": 0.007630728650838137, "kl": 0.5256568677723408, "learning_rate": 9.999211868196863e-06, "loss": -0.0274, "num_tokens": 12064826.0, "reward": 0.7719904184341431, "reward_std": 0.05126524344086647, "rewards/rollout_reward_func/mean": 0.7719904184341431, "rewards/rollout_reward_func/std": 0.2779315114021301, "sampling/importance_sampling_ratio/max": 1.003145456314087, "sampling/importance_sampling_ratio/mean": 0.9666610956192017, "sampling/importance_sampling_ratio/min": 4.2287351033753925e-21, "sampling/sampling_logp_difference/max": 3.0140089988708496, "sampling/sampling_logp_difference/mean": 0.18453861773014069, "step": 1341, "step_time": 4.7391264540056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.47904964396730065, "epoch": 0.01342, "grad_norm": 0.006716574542224407, "kl": 0.522053599357605, "learning_rate": 9.999210659911638e-06, "loss": -0.0274, "step": 1342, "step_time": 2.0281360980006866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 95.75, "completions/mean_terminated_length": 95.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.059315482154488564, "epoch": 0.01343, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043207823182456195, "kl": 0.5117799453437328, "learning_rate": 9.999209450701009e-06, "loss": 0.0012, "num_tokens": 12078826.0, "reward": 0.8149999380111694, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8149999380111694, "rewards/rollout_reward_func/std": 0.2264343798160553, "sampling/importance_sampling_ratio/max": 1.001037836074829, "sampling/importance_sampling_ratio/mean": 0.9968430995941162, "sampling/importance_sampling_ratio/min": 0.9904137849807739, "sampling/sampling_logp_difference/max": 0.006702210754156113, "sampling/sampling_logp_difference/mean": 0.0013135680928826332, "step": 1343, "step_time": 3.7892751410036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05951621849089861, "epoch": 0.01344, "grad_norm": 0.00043140113120898604, "kl": 0.5117335394024849, "learning_rate": 9.999208240564978e-06, "loss": 0.0012, "step": 1344, "step_time": 1.9903047639891156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4367507817223668, "epoch": 0.01345, "frac_reward_zero_std": 0.75, "grad_norm": 0.011523577384650707, "kl": 0.4689914733171463, "learning_rate": 9.999207029503541e-06, "loss": -0.0363, "num_tokens": 12097200.0, "reward": 0.7209999561309814, "reward_std": 0.12782315909862518, "rewards/rollout_reward_func/mean": 0.7209999561309814, "rewards/rollout_reward_func/std": 0.49433648586273193, "sampling/importance_sampling_ratio/max": 1.002821683883667, "sampling/importance_sampling_ratio/mean": 0.9676686525344849, "sampling/importance_sampling_ratio/min": 2.3823370328198327e-19, "sampling/sampling_logp_difference/max": 4.23630428314209, "sampling/sampling_logp_difference/mean": 0.1683693826198578, "step": 1345, "step_time": 5.287238096003421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4357219501398504, "epoch": 0.01346, "grad_norm": 0.011536425910890102, "kl": 0.4684981629252434, "learning_rate": 9.999205817516701e-06, "loss": -0.0363, "step": 1346, "step_time": 2.523295689992665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.31406279979273677, "epoch": 0.01347, "frac_reward_zero_std": 0.75, "grad_norm": 0.08000204712152481, "kl": 0.6492717899382114, "learning_rate": 9.999204604604457e-06, "loss": -0.0335, "num_tokens": 12115804.0, "reward": 0.7500095963478088, "reward_std": 0.016951072961091995, "rewards/rollout_reward_func/mean": 0.7500095963478088, "rewards/rollout_reward_func/std": 0.35336947441101074, "sampling/importance_sampling_ratio/max": 1.0428881645202637, "sampling/importance_sampling_ratio/mean": 0.9433289170265198, "sampling/importance_sampling_ratio/min": 1.6710144336684607e-05, "sampling/sampling_logp_difference/max": 2.5947837829589844, "sampling/sampling_logp_difference/mean": 0.08336191624403, "step": 1347, "step_time": 4.703565016010543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31333306478336453, "epoch": 0.01348, "grad_norm": 0.0690365880727768, "kl": 0.606798741966486, "learning_rate": 9.999203390766811e-06, "loss": -0.0336, "step": 1348, "step_time": 2.024493816992617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.15625, "completions/mean_terminated_length": 121.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.48909490276128054, "epoch": 0.01349, "frac_reward_zero_std": 0.5, "grad_norm": 0.03005053661763668, "kl": 0.9482407234609127, "learning_rate": 9.999202176003763e-06, "loss": -0.0356, "num_tokens": 12130449.0, "reward": 0.7396153807640076, "reward_std": 0.06962281465530396, "rewards/rollout_reward_func/mean": 0.7396153807640076, "rewards/rollout_reward_func/std": 0.2650400698184967, "sampling/importance_sampling_ratio/max": 1.001360535621643, "sampling/importance_sampling_ratio/mean": 0.936799168586731, "sampling/importance_sampling_ratio/min": 0.012797392904758453, "sampling/sampling_logp_difference/max": 2.045071601867676, "sampling/sampling_logp_difference/mean": 0.0641692727804184, "step": 1349, "step_time": 3.9826614699850325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48642501467838883, "epoch": 0.0135, "grad_norm": 0.02991998940706253, "kl": 0.947018314152956, "learning_rate": 9.999200960315312e-06, "loss": -0.0356, "step": 1350, "step_time": 1.9728293610023684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2197945723310113, "epoch": 0.01351, "frac_reward_zero_std": 0.75, "grad_norm": 0.017488369718194008, "kl": 0.5944109931588173, "learning_rate": 9.999199743701456e-06, "loss": -0.0175, "num_tokens": 12145419.0, "reward": 0.6192788481712341, "reward_std": 0.022709006443619728, "rewards/rollout_reward_func/mean": 0.6192788481712341, "rewards/rollout_reward_func/std": 0.22627586126327515, "sampling/importance_sampling_ratio/max": 1.003373146057129, "sampling/importance_sampling_ratio/mean": 0.9688379168510437, "sampling/importance_sampling_ratio/min": 0.007155242841690779, "sampling/sampling_logp_difference/max": 2.0277023315429688, "sampling/sampling_logp_difference/mean": 0.03189213201403618, "step": 1351, "step_time": 4.4114407199958805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22033963399007916, "epoch": 0.01352, "grad_norm": 0.014723519794642925, "kl": 0.5745290294289589, "learning_rate": 9.999198526162202e-06, "loss": -0.0176, "step": 1352, "step_time": 2.40465373700863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 146.0625, "completions/mean_terminated_length": 146.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40688054729253054, "epoch": 0.01353, "frac_reward_zero_std": 0.75, "grad_norm": 0.00904904305934906, "kl": 0.662972554564476, "learning_rate": 9.999197307697545e-06, "loss": -0.0174, "num_tokens": 12161133.0, "reward": 0.8201634883880615, "reward_std": 0.05099327862262726, "rewards/rollout_reward_func/mean": 0.8201634883880615, "rewards/rollout_reward_func/std": 0.336887001991272, "sampling/importance_sampling_ratio/max": 1.001434087753296, "sampling/importance_sampling_ratio/mean": 0.9654762744903564, "sampling/importance_sampling_ratio/min": 1.2906067468065885e-06, "sampling/sampling_logp_difference/max": 2.5970852375030518, "sampling/sampling_logp_difference/mean": 0.13160300254821777, "step": 1353, "step_time": 4.4981224719958846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4058140846900642, "epoch": 0.01354, "grad_norm": 0.007583166006952524, "kl": 0.6496817991137505, "learning_rate": 9.999196088307487e-06, "loss": -0.0175, "step": 1354, "step_time": 2.0104349340035697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.12931731436401606, "epoch": 0.01355, "frac_reward_zero_std": 0.75, "grad_norm": 0.016071567311882973, "kl": 0.5564239881932735, "learning_rate": 9.999194867992026e-06, "loss": -0.0157, "num_tokens": 12176813.0, "reward": 0.8249038457870483, "reward_std": 0.01767767034471035, "rewards/rollout_reward_func/mean": 0.8249038457870483, "rewards/rollout_reward_func/std": 0.18787260353565216, "sampling/importance_sampling_ratio/max": 1.0071794986724854, "sampling/importance_sampling_ratio/mean": 0.9721825122833252, "sampling/importance_sampling_ratio/min": 0.10037659853696823, "sampling/sampling_logp_difference/max": 2.2357051372528076, "sampling/sampling_logp_difference/mean": 0.019186442717909813, "step": 1355, "step_time": 4.111026488011703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13023911649361253, "epoch": 0.01356, "grad_norm": 0.015237793326377869, "kl": 0.5636190734803677, "learning_rate": 9.999193646751167e-06, "loss": -0.0157, "step": 1356, "step_time": 2.462414745023125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 194.53125, "completions/mean_terminated_length": 194.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3437130209058523, "epoch": 0.01357, "frac_reward_zero_std": 0.75, "grad_norm": 0.00733186025172472, "kl": 0.48427412286400795, "learning_rate": 9.999192424584906e-06, "loss": -0.0347, "num_tokens": 12193774.0, "reward": 0.9333798289299011, "reward_std": 0.017734022811055183, "rewards/rollout_reward_func/mean": 0.9333798289299011, "rewards/rollout_reward_func/std": 0.2987435758113861, "sampling/importance_sampling_ratio/max": 1.003184199333191, "sampling/importance_sampling_ratio/mean": 0.9370660185813904, "sampling/importance_sampling_ratio/min": 0.0037602116353809834, "sampling/sampling_logp_difference/max": 2.4102180004119873, "sampling/sampling_logp_difference/mean": 0.051414910703897476, "step": 1357, "step_time": 4.148638689999643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34025434171780944, "epoch": 0.01358, "grad_norm": 0.007552587892860174, "kl": 0.484032217413187, "learning_rate": 9.999191201493247e-06, "loss": -0.0347, "step": 1358, "step_time": 2.458330389999901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3357530948705971, "epoch": 0.01359, "frac_reward_zero_std": 0.75, "grad_norm": 0.01257350668311119, "kl": 0.5063042640686035, "learning_rate": 9.999189977476188e-06, "loss": 0.021, "num_tokens": 12210838.0, "reward": 0.8524903655052185, "reward_std": 0.0020397312473505735, "rewards/rollout_reward_func/mean": 0.8524903655052185, "rewards/rollout_reward_func/std": 0.3207476735115051, "sampling/importance_sampling_ratio/max": 1.0045397281646729, "sampling/importance_sampling_ratio/mean": 0.9670442342758179, "sampling/importance_sampling_ratio/min": 1.701995643088594e-05, "sampling/sampling_logp_difference/max": 3.387744903564453, "sampling/sampling_logp_difference/mean": 0.08165252208709717, "step": 1359, "step_time": 4.509426980999706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3350064572878182, "epoch": 0.0136, "grad_norm": 0.012852661311626434, "kl": 0.5060388520359993, "learning_rate": 9.999188752533728e-06, "loss": 0.0209, "step": 1360, "step_time": 2.001818197008106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 177.71875, "completions/mean_terminated_length": 177.71875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5397427496500313, "epoch": 0.01361, "frac_reward_zero_std": 0.5, "grad_norm": 0.1271923929452896, "kl": 1.5583664141595364, "learning_rate": 9.99918752666587e-06, "loss": -0.0045, "num_tokens": 12227397.0, "reward": 0.4965769350528717, "reward_std": 0.18940982222557068, "rewards/rollout_reward_func/mean": 0.4965769350528717, "rewards/rollout_reward_func/std": 0.35632216930389404, "sampling/importance_sampling_ratio/max": 1.0063540935516357, "sampling/importance_sampling_ratio/mean": 0.9096061587333679, "sampling/importance_sampling_ratio/min": 0.002385109430179, "sampling/sampling_logp_difference/max": 2.384986639022827, "sampling/sampling_logp_difference/mean": 0.07930295169353485, "step": 1361, "step_time": 4.479400538017217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5426637511700392, "epoch": 0.01362, "grad_norm": 0.08106484264135361, "kl": 1.3040331341326237, "learning_rate": 9.999186299872614e-06, "loss": -0.005, "step": 1362, "step_time": 2.4715453279932262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07102681137621403, "epoch": 0.01363, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007038083858788013, "kl": 0.465444128960371, "learning_rate": 9.99918507215396e-06, "loss": 0.0014, "num_tokens": 12243181.0, "reward": 0.6335768699645996, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6335768699645996, "rewards/rollout_reward_func/std": 0.22288638353347778, "sampling/importance_sampling_ratio/max": 1.0047892332077026, "sampling/importance_sampling_ratio/mean": 1.0010035037994385, "sampling/importance_sampling_ratio/min": 0.9961838722229004, "sampling/sampling_logp_difference/max": 0.006094768643379211, "sampling/sampling_logp_difference/mean": 0.0012398490216583014, "step": 1363, "step_time": 4.186871309007984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07690385449677706, "epoch": 0.01364, "grad_norm": 0.0007794251432642341, "kl": 0.4645272344350815, "learning_rate": 9.999183843509903e-06, "loss": 0.0014, "step": 1364, "step_time": 2.4637214859976666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.08200452104210854, "epoch": 0.01365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007606302388012409, "kl": 0.5146479830145836, "learning_rate": 9.999182613940452e-06, "loss": 0.0014, "num_tokens": 12257741.0, "reward": 0.7238461971282959, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7238461971282959, "rewards/rollout_reward_func/std": 0.28801921010017395, "sampling/importance_sampling_ratio/max": 1.0031704902648926, "sampling/importance_sampling_ratio/mean": 0.9989506006240845, "sampling/importance_sampling_ratio/min": 0.9946816563606262, "sampling/sampling_logp_difference/max": 0.004712164402008057, "sampling/sampling_logp_difference/mean": 0.0010021296329796314, "step": 1365, "step_time": 3.907494380007847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08693439699709415, "epoch": 0.01366, "grad_norm": 0.0008106130408123136, "kl": 0.5139194242656231, "learning_rate": 9.999181383445602e-06, "loss": 0.0014, "step": 1366, "step_time": 1.9836224729879177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 120.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2811178732663393, "epoch": 0.01367, "frac_reward_zero_std": 0.75, "grad_norm": 0.02918010950088501, "kl": 0.5318299680948257, "learning_rate": 9.999180152025356e-06, "loss": 0.0216, "num_tokens": 12272339.0, "reward": 0.5185576677322388, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.5185576677322388, "rewards/rollout_reward_func/std": 0.19621457159519196, "sampling/importance_sampling_ratio/max": 1.0775545835494995, "sampling/importance_sampling_ratio/mean": 0.9862741827964783, "sampling/importance_sampling_ratio/min": 0.024978695437312126, "sampling/sampling_logp_difference/max": 1.2847152948379517, "sampling/sampling_logp_difference/mean": 0.03394435718655586, "step": 1367, "step_time": 3.811352714015811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28934193402528763, "epoch": 0.01368, "grad_norm": 0.02940436452627182, "kl": 0.5202482491731644, "learning_rate": 9.999178919679713e-06, "loss": 0.0216, "step": 1368, "step_time": 2.447779760012054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4850674392655492, "epoch": 0.01369, "frac_reward_zero_std": 0.5, "grad_norm": 0.016773715615272522, "kl": 0.6367255449295044, "learning_rate": 9.999177686408673e-06, "loss": -0.0257, "num_tokens": 12287455.0, "reward": 0.9923558235168457, "reward_std": 0.07547005265951157, "rewards/rollout_reward_func/mean": 0.9923558235168457, "rewards/rollout_reward_func/std": 0.2065395712852478, "sampling/importance_sampling_ratio/max": 1.0046789646148682, "sampling/importance_sampling_ratio/mean": 0.9374600648880005, "sampling/importance_sampling_ratio/min": 0.002231293823570013, "sampling/sampling_logp_difference/max": 2.677088737487793, "sampling/sampling_logp_difference/mean": 0.06440258771181107, "step": 1369, "step_time": 4.139232818000892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4865878652781248, "epoch": 0.0137, "grad_norm": 0.019528552889823914, "kl": 0.6219322942197323, "learning_rate": 9.999176452212236e-06, "loss": -0.0256, "step": 1370, "step_time": 2.0205036960032885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 102.53125, "completions/mean_terminated_length": 102.53125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8695158874616027, "epoch": 0.01371, "frac_reward_zero_std": 0.25, "grad_norm": 0.027154557406902313, "kl": 0.5973448343575001, "learning_rate": 9.999175217090401e-06, "loss": -0.056, "num_tokens": 12301744.0, "reward": 0.6783653497695923, "reward_std": 0.06119193509221077, "rewards/rollout_reward_func/mean": 0.6783653497695923, "rewards/rollout_reward_func/std": 0.16600164771080017, "sampling/importance_sampling_ratio/max": 1.001449465751648, "sampling/importance_sampling_ratio/mean": 0.9045583605766296, "sampling/importance_sampling_ratio/min": 0.0012114016572013497, "sampling/sampling_logp_difference/max": 2.1083812713623047, "sampling/sampling_logp_difference/mean": 0.14358443021774292, "step": 1371, "step_time": 4.408951572004298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8669895268976688, "epoch": 0.01372, "grad_norm": 0.02725890651345253, "kl": 0.595854002982378, "learning_rate": 9.999173981043175e-06, "loss": -0.056, "step": 1372, "step_time": 1.9960361549965455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 129.22579956054688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5036677531898022, "epoch": 0.01373, "frac_reward_zero_std": 0.0, "grad_norm": 0.029646476730704308, "kl": 0.5232557915151119, "learning_rate": 9.99917274407055e-06, "loss": -0.0403, "num_tokens": 12316882.0, "reward": 0.7312148809432983, "reward_std": 0.13171762228012085, "rewards/rollout_reward_func/mean": 0.7312148809432983, "rewards/rollout_reward_func/std": 0.18164603412151337, "sampling/importance_sampling_ratio/max": 1.0023701190948486, "sampling/importance_sampling_ratio/mean": 0.8101273775100708, "sampling/importance_sampling_ratio/min": 1.820904115276257e-18, "sampling/sampling_logp_difference/max": 3.710282802581787, "sampling/sampling_logp_difference/mean": 0.5068193078041077, "step": 1373, "step_time": 4.720518965987139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4926120871677995, "epoch": 0.01374, "grad_norm": 0.02887430414557457, "kl": 0.5161009766161442, "learning_rate": 9.999171506172532e-06, "loss": -0.0404, "step": 1374, "step_time": 2.0371625629923074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 105.71875, "completions/mean_terminated_length": 105.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4679281320422888, "epoch": 0.01375, "frac_reward_zero_std": 0.75, "grad_norm": 0.015795093029737473, "kl": 0.5835131667554379, "learning_rate": 9.999170267349118e-06, "loss": -0.0173, "num_tokens": 12331193.0, "reward": 0.7834134697914124, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.7834134697914124, "rewards/rollout_reward_func/std": 0.20654596388339996, "sampling/importance_sampling_ratio/max": 1.008737564086914, "sampling/importance_sampling_ratio/mean": 0.9389569759368896, "sampling/importance_sampling_ratio/min": 0.006452065426856279, "sampling/sampling_logp_difference/max": 1.9014438390731812, "sampling/sampling_logp_difference/mean": 0.07128790020942688, "step": 1375, "step_time": 3.7985843169954023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4633519984781742, "epoch": 0.01376, "grad_norm": 0.01617044396698475, "kl": 0.5834544450044632, "learning_rate": 9.999169027600309e-06, "loss": -0.0173, "step": 1376, "step_time": 2.41631865499221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.21875, "completions/mean_terminated_length": 165.21875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3220081115141511, "epoch": 0.01377, "frac_reward_zero_std": 0.75, "grad_norm": 0.01665676012635231, "kl": 0.4954490661621094, "learning_rate": 9.999167786926106e-06, "loss": -0.0368, "num_tokens": 12347248.0, "reward": 0.866783618927002, "reward_std": 0.02791711315512657, "rewards/rollout_reward_func/mean": 0.866783618927002, "rewards/rollout_reward_func/std": 0.23665395379066467, "sampling/importance_sampling_ratio/max": 1.010995864868164, "sampling/importance_sampling_ratio/mean": 0.9673831462860107, "sampling/importance_sampling_ratio/min": 0.009758726693689823, "sampling/sampling_logp_difference/max": 1.4458205699920654, "sampling/sampling_logp_difference/mean": 0.02658650279045105, "step": 1377, "step_time": 4.14362886500021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3161628348752856, "epoch": 0.01378, "grad_norm": 0.016196710988879204, "kl": 0.4958449527621269, "learning_rate": 9.99916654532651e-06, "loss": -0.0369, "step": 1378, "step_time": 2.000872555996466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 172.46875, "completions/mean_terminated_length": 172.46875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5128108076751232, "epoch": 0.01379, "frac_reward_zero_std": 0.5, "grad_norm": 0.07364558428525925, "kl": 0.5060891583561897, "learning_rate": 9.999165302801519e-06, "loss": -0.0109, "num_tokens": 12363391.0, "reward": 0.662788450717926, "reward_std": 0.2406882792711258, "rewards/rollout_reward_func/mean": 0.662788450717926, "rewards/rollout_reward_func/std": 0.5276803970336914, "sampling/importance_sampling_ratio/max": 1.0047335624694824, "sampling/importance_sampling_ratio/mean": 0.9126632809638977, "sampling/importance_sampling_ratio/min": 0.0010039698099717498, "sampling/sampling_logp_difference/max": 2.103825569152832, "sampling/sampling_logp_difference/mean": 0.06570138782262802, "step": 1379, "step_time": 4.568309344998852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5001291716471314, "epoch": 0.0138, "grad_norm": 0.07614477723836899, "kl": 0.5024303607642651, "learning_rate": 9.999164059351137e-06, "loss": -0.0111, "step": 1380, "step_time": 1.9971879940057988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.817263531498611, "epoch": 0.01381, "frac_reward_zero_std": 0.75, "grad_norm": 0.014259659685194492, "kl": 0.664495475590229, "learning_rate": 9.99916281497536e-06, "loss": -0.0171, "num_tokens": 12378597.0, "reward": 0.5683653354644775, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.5683653354644775, "rewards/rollout_reward_func/std": 0.12390966713428497, "sampling/importance_sampling_ratio/max": 1.0104612112045288, "sampling/importance_sampling_ratio/mean": 0.8773683309555054, "sampling/importance_sampling_ratio/min": 0.006618546787649393, "sampling/sampling_logp_difference/max": 1.9717295169830322, "sampling/sampling_logp_difference/mean": 0.11384529620409012, "step": 1381, "step_time": 4.002089371002512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.783108034171164, "epoch": 0.01382, "grad_norm": 0.013485092669725418, "kl": 0.6446544080972672, "learning_rate": 9.999161569674191e-06, "loss": -0.0172, "step": 1382, "step_time": 2.4208132389903767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4434260446578264, "epoch": 0.01383, "frac_reward_zero_std": 0.25, "grad_norm": 0.031273212283849716, "kl": 0.8943923190236092, "learning_rate": 9.999160323447628e-06, "loss": -0.0639, "num_tokens": 12394967.0, "reward": 0.6832884550094604, "reward_std": 0.26176825165748596, "rewards/rollout_reward_func/mean": 0.6832884550094604, "rewards/rollout_reward_func/std": 0.4574299454689026, "sampling/importance_sampling_ratio/max": 1.0068681240081787, "sampling/importance_sampling_ratio/mean": 0.8859388828277588, "sampling/importance_sampling_ratio/min": 0.008725062012672424, "sampling/sampling_logp_difference/max": 2.1010191440582275, "sampling/sampling_logp_difference/mean": 0.06570400297641754, "step": 1383, "step_time": 4.133009851990209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4419189472682774, "epoch": 0.01384, "grad_norm": 0.030903087928891182, "kl": 0.8995630256831646, "learning_rate": 9.999159076295675e-06, "loss": -0.0639, "step": 1384, "step_time": 2.468983096005104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08219679724425077, "epoch": 0.01385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007676847744733095, "kl": 0.47618338093161583, "learning_rate": 9.99915782821833e-06, "loss": 0.0015, "num_tokens": 12410823.0, "reward": 0.6754999756813049, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6754999756813049, "rewards/rollout_reward_func/std": 0.26546791195869446, "sampling/importance_sampling_ratio/max": 1.0133010149002075, "sampling/importance_sampling_ratio/mean": 1.0044779777526855, "sampling/importance_sampling_ratio/min": 0.9950301647186279, "sampling/sampling_logp_difference/max": 0.01211346685886383, "sampling/sampling_logp_difference/mean": 0.002053346950560808, "step": 1385, "step_time": 4.110177884998848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0809654202312231, "epoch": 0.01386, "grad_norm": 0.000766462879255414, "kl": 0.4763578549027443, "learning_rate": 9.999156579215592e-06, "loss": 0.0015, "step": 1386, "step_time": 1.9920555899807368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3825209988281131, "epoch": 0.01387, "frac_reward_zero_std": 0.75, "grad_norm": 0.03558950498700142, "kl": 0.5165588743984699, "learning_rate": 9.999155329287464e-06, "loss": -0.0346, "num_tokens": 12426295.0, "reward": 0.947615385055542, "reward_std": 0.008902122266590595, "rewards/rollout_reward_func/mean": 0.947615385055542, "rewards/rollout_reward_func/std": 0.34202390909194946, "sampling/importance_sampling_ratio/max": 1.007815957069397, "sampling/importance_sampling_ratio/mean": 0.9430568814277649, "sampling/importance_sampling_ratio/min": 0.04334427788853645, "sampling/sampling_logp_difference/max": 1.296081781387329, "sampling/sampling_logp_difference/mean": 0.04008103162050247, "step": 1387, "step_time": 3.823725080997974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.375577749684453, "epoch": 0.01388, "grad_norm": 0.03632884472608566, "kl": 0.5201511159539223, "learning_rate": 9.999154078433945e-06, "loss": -0.0346, "step": 1388, "step_time": 2.401931988009892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34646408818662167, "epoch": 0.01389, "frac_reward_zero_std": 0.5, "grad_norm": 0.022010965272784233, "kl": 0.5605318434536457, "learning_rate": 9.999152826655035e-06, "loss": -0.0167, "num_tokens": 12442703.0, "reward": 1.0217211246490479, "reward_std": 0.03603524714708328, "rewards/rollout_reward_func/mean": 1.0217211246490479, "rewards/rollout_reward_func/std": 0.32598522305488586, "sampling/importance_sampling_ratio/max": 1.0057344436645508, "sampling/importance_sampling_ratio/mean": 0.9376459121704102, "sampling/importance_sampling_ratio/min": 0.027822403237223625, "sampling/sampling_logp_difference/max": 1.9291226863861084, "sampling/sampling_logp_difference/mean": 0.05165933445096016, "step": 1389, "step_time": 4.5972454989969265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34652118664234877, "epoch": 0.0139, "grad_norm": 0.021853569895029068, "kl": 0.5612817965447903, "learning_rate": 9.999151573950735e-06, "loss": -0.0167, "step": 1390, "step_time": 2.4512472179922042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4262543460354209, "epoch": 0.01391, "frac_reward_zero_std": 0.5, "grad_norm": 0.061944350600242615, "kl": 0.6534812226891518, "learning_rate": 9.999150320321046e-06, "loss": 0.0114, "num_tokens": 12456843.0, "reward": 0.7219231128692627, "reward_std": 0.047593727707862854, "rewards/rollout_reward_func/mean": 0.7219231128692627, "rewards/rollout_reward_func/std": 0.287533700466156, "sampling/importance_sampling_ratio/max": 1.0153250694274902, "sampling/importance_sampling_ratio/mean": 0.9445497393608093, "sampling/importance_sampling_ratio/min": 0.09764223545789719, "sampling/sampling_logp_difference/max": 1.204838514328003, "sampling/sampling_logp_difference/mean": 0.046651579439640045, "step": 1391, "step_time": 3.9105458550111507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44008579570800066, "epoch": 0.01392, "grad_norm": 0.05756082385778427, "kl": 0.6423111855983734, "learning_rate": 9.999149065765967e-06, "loss": 0.0113, "step": 1392, "step_time": 1.9683717180014355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 152.90625, "completions/mean_terminated_length": 152.90625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6128126690164208, "epoch": 0.01393, "frac_reward_zero_std": 0.5, "grad_norm": 0.019532080739736557, "kl": 0.4102897010743618, "learning_rate": 9.999147810285496e-06, "loss": -0.0536, "num_tokens": 12472800.0, "reward": 0.6294360756874084, "reward_std": 0.05859224870800972, "rewards/rollout_reward_func/mean": 0.6294360756874084, "rewards/rollout_reward_func/std": 0.2606509029865265, "sampling/importance_sampling_ratio/max": 1.0158395767211914, "sampling/importance_sampling_ratio/mean": 0.9099836349487305, "sampling/importance_sampling_ratio/min": 2.422014055980526e-08, "sampling/sampling_logp_difference/max": 3.306229591369629, "sampling/sampling_logp_difference/mean": 0.18350675702095032, "step": 1393, "step_time": 4.23938585799624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6255619451403618, "epoch": 0.01394, "grad_norm": 0.02013981342315674, "kl": 0.41484858840703964, "learning_rate": 9.99914655387964e-06, "loss": -0.0536, "step": 1394, "step_time": 2.4562614999958896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.6875, "completions/mean_terminated_length": 170.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4281402686610818, "epoch": 0.01395, "frac_reward_zero_std": 0.0, "grad_norm": 0.17364832758903503, "kl": 0.5942321866750717, "learning_rate": 9.999145296548393e-06, "loss": -0.0695, "num_tokens": 12489222.0, "reward": 0.724504828453064, "reward_std": 0.26634809374809265, "rewards/rollout_reward_func/mean": 0.724504828453064, "rewards/rollout_reward_func/std": 0.542176365852356, "sampling/importance_sampling_ratio/max": 1.0104645490646362, "sampling/importance_sampling_ratio/mean": 0.8977660536766052, "sampling/importance_sampling_ratio/min": 0.017537930980324745, "sampling/sampling_logp_difference/max": 2.164198875427246, "sampling/sampling_logp_difference/mean": 0.05807113274931908, "step": 1395, "step_time": 4.299430949991802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4185531842522323, "epoch": 0.01396, "grad_norm": 0.19384856522083282, "kl": 0.602851677685976, "learning_rate": 9.999144038291758e-06, "loss": -0.0701, "step": 1396, "step_time": 2.4211460249935044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.10242708027362823, "epoch": 0.01397, "frac_reward_zero_std": 1.0, "grad_norm": 0.002022932516410947, "kl": 0.545298732817173, "learning_rate": 9.999142779109736e-06, "loss": 0.0019, "num_tokens": 12505326.0, "reward": 0.6180769205093384, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6180769205093384, "rewards/rollout_reward_func/std": 0.0764821246266365, "sampling/importance_sampling_ratio/max": 1.0299955606460571, "sampling/importance_sampling_ratio/mean": 1.013293743133545, "sampling/importance_sampling_ratio/min": 1.0006120204925537, "sampling/sampling_logp_difference/max": 0.030982717871665955, "sampling/sampling_logp_difference/mean": 0.0037532455753535032, "step": 1397, "step_time": 4.064939799995045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11467370856553316, "epoch": 0.01398, "grad_norm": 0.0032825747039169073, "kl": 0.5412747636437416, "learning_rate": 9.999141519002326e-06, "loss": 0.0019, "step": 1398, "step_time": 1.9604502670044894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2171258190646768, "epoch": 0.01399, "frac_reward_zero_std": 0.25, "grad_norm": 0.3138609826564789, "kl": 0.5557936318218708, "learning_rate": 9.999140257969527e-06, "loss": -0.0508, "num_tokens": 12520332.0, "reward": 0.5559327006340027, "reward_std": 0.10510919243097305, "rewards/rollout_reward_func/mean": 0.5559327006340027, "rewards/rollout_reward_func/std": 0.17087993025779724, "sampling/importance_sampling_ratio/max": 1.048771619796753, "sampling/importance_sampling_ratio/mean": 0.8391426801681519, "sampling/importance_sampling_ratio/min": 2.3114859974541525e-15, "sampling/sampling_logp_difference/max": 2.781562328338623, "sampling/sampling_logp_difference/mean": 0.3150489330291748, "step": 1399, "step_time": 4.945356877993618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.212652787566185, "epoch": 0.014, "grad_norm": 0.24079477787017822, "kl": 0.5626588761806488, "learning_rate": 9.999138996011341e-06, "loss": -0.0515, "step": 1400, "step_time": 2.0846111059945542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 101.25, "completions/mean_terminated_length": 101.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.08788973558694124, "epoch": 0.01401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008953181677497923, "kl": 0.5401466637849808, "learning_rate": 9.99913773312777e-06, "loss": 0.0014, "num_tokens": 12534428.0, "reward": 0.6919230818748474, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6919230818748474, "rewards/rollout_reward_func/std": 0.2704879641532898, "sampling/importance_sampling_ratio/max": 1.0161412954330444, "sampling/importance_sampling_ratio/mean": 1.003697156906128, "sampling/importance_sampling_ratio/min": 0.9964624643325806, "sampling/sampling_logp_difference/max": 0.016075514256954193, "sampling/sampling_logp_difference/mean": 0.0018808288732543588, "step": 1401, "step_time": 3.5939135419903323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08278811071068048, "epoch": 0.01402, "grad_norm": 0.0007648376049473882, "kl": 0.5412676259875298, "learning_rate": 9.99913646931881e-06, "loss": 0.0014, "step": 1402, "step_time": 2.378084897012741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17042669374495745, "epoch": 0.01403, "frac_reward_zero_std": 0.75, "grad_norm": 0.2506423890590668, "kl": 0.4807254709303379, "learning_rate": 9.999135204584467e-06, "loss": 0.0166, "num_tokens": 12549792.0, "reward": 0.6651154160499573, "reward_std": 0.014243390411138535, "rewards/rollout_reward_func/mean": 0.6651154160499573, "rewards/rollout_reward_func/std": 0.14669570326805115, "sampling/importance_sampling_ratio/max": 1.0282379388809204, "sampling/importance_sampling_ratio/mean": 0.9668968915939331, "sampling/importance_sampling_ratio/min": 0.39118725061416626, "sampling/sampling_logp_difference/max": 0.46290087699890137, "sampling/sampling_logp_difference/mean": 0.015843363478779793, "step": 1403, "step_time": 4.040457344992319 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.15723952651023865, "epoch": 0.01404, "grad_norm": 0.17210860550403595, "kl": 0.47960854321718216, "learning_rate": 9.999133938924735e-06, "loss": 0.0162, "step": 1404, "step_time": 1.9872441150073428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 79.3125, "completions/mean_terminated_length": 79.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40403382293879986, "epoch": 0.01405, "frac_reward_zero_std": 0.5, "grad_norm": 0.18607208132743835, "kl": 0.6579828299582005, "learning_rate": 9.99913267233962e-06, "loss": -0.0044, "num_tokens": 12563178.0, "reward": 0.595432698726654, "reward_std": 0.04310794174671173, "rewards/rollout_reward_func/mean": 0.595432698726654, "rewards/rollout_reward_func/std": 0.16740338504314423, "sampling/importance_sampling_ratio/max": 1.0109922885894775, "sampling/importance_sampling_ratio/mean": 0.9451563358306885, "sampling/importance_sampling_ratio/min": 0.06237398833036423, "sampling/sampling_logp_difference/max": 1.344830870628357, "sampling/sampling_logp_difference/mean": 0.037773288786411285, "step": 1405, "step_time": 4.2722404659944 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.3786008432507515, "epoch": 0.01406, "grad_norm": 0.10005654394626617, "kl": 0.6435197591781616, "learning_rate": 9.99913140482912e-06, "loss": -0.0062, "step": 1406, "step_time": 1.968940343009308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 97.03125, "completions/mean_terminated_length": 97.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4017299357801676, "epoch": 0.01407, "frac_reward_zero_std": 0.0, "grad_norm": 0.43391120433807373, "kl": 0.47016992047429085, "learning_rate": 9.999130136393232e-06, "loss": -0.0172, "num_tokens": 12577051.0, "reward": 0.6165384650230408, "reward_std": 0.08396421372890472, "rewards/rollout_reward_func/mean": 0.6165384650230408, "rewards/rollout_reward_func/std": 0.26689019799232483, "sampling/importance_sampling_ratio/max": 1.057274341583252, "sampling/importance_sampling_ratio/mean": 0.9521291255950928, "sampling/importance_sampling_ratio/min": 0.024416327476501465, "sampling/sampling_logp_difference/max": 1.9022215604782104, "sampling/sampling_logp_difference/mean": 0.04323121905326843, "step": 1407, "step_time": 4.028080007985409 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3908461332321167, "epoch": 0.01408, "grad_norm": 0.6826483607292175, "kl": 0.42929987981915474, "learning_rate": 9.999128867031961e-06, "loss": -0.0154, "step": 1408, "step_time": 1.9285568440027419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 235.34375, "completions/mean_terminated_length": 235.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.17250777501612902, "epoch": 0.01409, "frac_reward_zero_std": 0.75, "grad_norm": 0.022469153627753258, "kl": 0.42586223036050797, "learning_rate": 9.999127596745305e-06, "loss": -0.0356, "num_tokens": 12595374.0, "reward": 0.8483894467353821, "reward_std": 0.030228814110159874, "rewards/rollout_reward_func/mean": 0.8483894467353821, "rewards/rollout_reward_func/std": 0.3515201210975647, "sampling/importance_sampling_ratio/max": 1.0135880708694458, "sampling/importance_sampling_ratio/mean": 0.970579981803894, "sampling/importance_sampling_ratio/min": 0.057268381118774414, "sampling/sampling_logp_difference/max": 1.3871753215789795, "sampling/sampling_logp_difference/mean": 0.013560864143073559, "step": 1409, "step_time": 4.393600413000968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1681963987648487, "epoch": 0.0141, "grad_norm": 0.02345975488424301, "kl": 0.42830026149749756, "learning_rate": 9.999126325533266e-06, "loss": -0.0355, "step": 1410, "step_time": 1.9600173780199839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 203.59375, "completions/mean_terminated_length": 203.59375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2610134603455663, "epoch": 0.01411, "frac_reward_zero_std": 0.5, "grad_norm": 0.14038598537445068, "kl": 0.4952058084309101, "learning_rate": 9.999125053395843e-06, "loss": -0.061, "num_tokens": 12612513.0, "reward": 0.7769278287887573, "reward_std": 0.08815427124500275, "rewards/rollout_reward_func/mean": 0.7769278287887573, "rewards/rollout_reward_func/std": 0.2906878888607025, "sampling/importance_sampling_ratio/max": 1.0142043828964233, "sampling/importance_sampling_ratio/mean": 0.9319032430648804, "sampling/importance_sampling_ratio/min": 0.036773212254047394, "sampling/sampling_logp_difference/max": 1.6795568466186523, "sampling/sampling_logp_difference/mean": 0.028493084013462067, "step": 1411, "step_time": 4.821669699005724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2661608047783375, "epoch": 0.01412, "grad_norm": 0.1467636376619339, "kl": 0.48914260417222977, "learning_rate": 9.999123780333036e-06, "loss": -0.0613, "step": 1412, "step_time": 2.003930696999305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.39743785932660103, "epoch": 0.01413, "frac_reward_zero_std": 0.25, "grad_norm": 0.7841520309448242, "kl": 0.4671340398490429, "learning_rate": 9.999122506344846e-06, "loss": 0.0014, "num_tokens": 12628023.0, "reward": 0.5898221135139465, "reward_std": 0.1791234165430069, "rewards/rollout_reward_func/mean": 0.5898221135139465, "rewards/rollout_reward_func/std": 0.3742804527282715, "sampling/importance_sampling_ratio/max": 1.144099235534668, "sampling/importance_sampling_ratio/mean": 0.9162577390670776, "sampling/importance_sampling_ratio/min": 0.04557894915342331, "sampling/sampling_logp_difference/max": 1.4728350639343262, "sampling/sampling_logp_difference/mean": 0.04650668054819107, "step": 1413, "step_time": 4.570228523007245 }, { "clip_ratio/high_max": 0.06875000055879354, "clip_ratio/high_mean": 0.03437500027939677, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04843750037252903, "entropy": 0.39052229933440685, "epoch": 0.01414, "grad_norm": 0.13542474806308746, "kl": 0.4678841270506382, "learning_rate": 9.999121231431275e-06, "loss": -0.0016, "step": 1414, "step_time": 2.0076890439959243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 167.53125, "completions/mean_terminated_length": 167.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4569274988025427, "epoch": 0.01415, "frac_reward_zero_std": 0.25, "grad_norm": 1.1788002252578735, "kl": 0.5056628957390785, "learning_rate": 9.999119955592321e-06, "loss": -0.0467, "num_tokens": 12644120.0, "reward": 0.6573605537414551, "reward_std": 0.04078223928809166, "rewards/rollout_reward_func/mean": 0.6573605537414551, "rewards/rollout_reward_func/std": 0.12847143411636353, "sampling/importance_sampling_ratio/max": 1.1322274208068848, "sampling/importance_sampling_ratio/mean": 0.9118623733520508, "sampling/importance_sampling_ratio/min": 0.0754656046628952, "sampling/sampling_logp_difference/max": 1.4700922966003418, "sampling/sampling_logp_difference/mean": 0.04649168625473976, "step": 1415, "step_time": 4.303515728002822 }, { "clip_ratio/high_max": 0.09062500018626451, "clip_ratio/high_mean": 0.06197916762903333, "clip_ratio/low_mean": 0.020870535634458065, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.08284970326349139, "entropy": 0.515049341134727, "epoch": 0.01416, "grad_norm": 0.2646479904651642, "kl": 0.550940278917551, "learning_rate": 9.999118678827984e-06, "loss": -0.0493, "step": 1416, "step_time": 2.000247757008765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.050446705892682076, "epoch": 0.01417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006638117483817041, "kl": 0.5054936036467552, "learning_rate": 9.999117401138263e-06, "loss": 0.0019, "num_tokens": 12661192.0, "reward": 0.902999997138977, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.902999997138977, "rewards/rollout_reward_func/std": 0.23621517419815063, "sampling/importance_sampling_ratio/max": 1.0245715379714966, "sampling/importance_sampling_ratio/mean": 1.0046714544296265, "sampling/importance_sampling_ratio/min": 0.9957301020622253, "sampling/sampling_logp_difference/max": 0.013996034860610962, "sampling/sampling_logp_difference/mean": 0.0015385826118290424, "step": 1417, "step_time": 4.664041645010002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05048547685146332, "epoch": 0.01418, "grad_norm": 0.0007022495847195387, "kl": 0.5053940378129482, "learning_rate": 9.999116122523164e-06, "loss": 0.0019, "step": 1418, "step_time": 1.9694531560016912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 180.78125, "completions/mean_terminated_length": 180.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6285794484429061, "epoch": 0.01419, "frac_reward_zero_std": 0.5, "grad_norm": 0.06710200756788254, "kl": 0.5395893156528473, "learning_rate": 9.999114842982682e-06, "loss": 0.0094, "num_tokens": 12677713.0, "reward": 0.6151922941207886, "reward_std": 0.0370444729924202, "rewards/rollout_reward_func/mean": 0.6151922941207886, "rewards/rollout_reward_func/std": 0.380024790763855, "sampling/importance_sampling_ratio/max": 1.0028351545333862, "sampling/importance_sampling_ratio/mean": 0.9092344045639038, "sampling/importance_sampling_ratio/min": 1.878988457881814e-11, "sampling/sampling_logp_difference/max": 10.737974166870117, "sampling/sampling_logp_difference/mean": 0.19770576059818268, "step": 1419, "step_time": 4.491896846993768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.648246746044606, "epoch": 0.0142, "grad_norm": 0.0719970166683197, "kl": 0.559385534375906, "learning_rate": 9.999113562516821e-06, "loss": 0.0095, "step": 1420, "step_time": 1.9784318539896049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3205236466601491, "epoch": 0.01421, "frac_reward_zero_std": 0.5, "grad_norm": 1.3903602361679077, "kl": 0.42100105062127113, "learning_rate": 9.999112281125578e-06, "loss": -0.0311, "num_tokens": 12693761.0, "reward": 0.8218510150909424, "reward_std": 0.08777642250061035, "rewards/rollout_reward_func/mean": 0.8218510150909424, "rewards/rollout_reward_func/std": 0.3979637324810028, "sampling/importance_sampling_ratio/max": 1.1215614080429077, "sampling/importance_sampling_ratio/mean": 0.9398998022079468, "sampling/importance_sampling_ratio/min": 3.098668543977112e-10, "sampling/sampling_logp_difference/max": 14.429254531860352, "sampling/sampling_logp_difference/mean": 0.18155835568904877, "step": 1421, "step_time": 4.2351937479907065 }, { "clip_ratio/high_max": 0.061755954287946224, "clip_ratio/high_mean": 0.030877977143973112, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04129464365541935, "entropy": 0.32300792913883924, "epoch": 0.01422, "grad_norm": 0.019963808357715607, "kl": 0.47818805277347565, "learning_rate": 9.999110998808955e-06, "loss": -0.0318, "step": 1422, "step_time": 2.4361729709853535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.235102666541934, "epoch": 0.01423, "frac_reward_zero_std": 0.75, "grad_norm": 0.0472429133951664, "kl": 0.7316914051771164, "learning_rate": 9.999109715566952e-06, "loss": -0.0268, "num_tokens": 12709999.0, "reward": 0.9127788543701172, "reward_std": 0.04112023115158081, "rewards/rollout_reward_func/mean": 0.9127788543701172, "rewards/rollout_reward_func/std": 0.3946215510368347, "sampling/importance_sampling_ratio/max": 1.012774109840393, "sampling/importance_sampling_ratio/mean": 0.9411905407905579, "sampling/importance_sampling_ratio/min": 0.033784955739974976, "sampling/sampling_logp_difference/max": 3.2002177238464355, "sampling/sampling_logp_difference/mean": 0.03351806476712227, "step": 1423, "step_time": 4.05454024500068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25080566201359034, "epoch": 0.01424, "grad_norm": 0.05428235977888107, "kl": 0.7448385991156101, "learning_rate": 9.99910843139957e-06, "loss": -0.0267, "step": 1424, "step_time": 2.462955752009293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.31617829017341137, "epoch": 0.01425, "frac_reward_zero_std": 0.75, "grad_norm": 0.020277414470911026, "kl": 0.6717238053679466, "learning_rate": 9.999107146306808e-06, "loss": -0.0267, "num_tokens": 12725325.0, "reward": 0.6643750071525574, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.6643750071525574, "rewards/rollout_reward_func/std": 0.16062584519386292, "sampling/importance_sampling_ratio/max": 1.0323373079299927, "sampling/importance_sampling_ratio/mean": 0.9749892950057983, "sampling/importance_sampling_ratio/min": 2.471409970894456e-06, "sampling/sampling_logp_difference/max": 3.0306835174560547, "sampling/sampling_logp_difference/mean": 0.06913141906261444, "step": 1425, "step_time": 3.9348290619891486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33060287358239293, "epoch": 0.01426, "grad_norm": 0.019756583496928215, "kl": 0.652090273797512, "learning_rate": 9.999105860288666e-06, "loss": -0.0268, "step": 1426, "step_time": 1.9450083779956913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 144.46875, "completions/mean_terminated_length": 144.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4674448221921921, "epoch": 0.01427, "frac_reward_zero_std": 1.0, "grad_norm": 0.002708090003579855, "kl": 0.3789512477815151, "learning_rate": 9.999104573345148e-06, "loss": 0.0013, "num_tokens": 12740572.0, "reward": 0.648192286491394, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.648192286491394, "rewards/rollout_reward_func/std": 0.11492631584405899, "sampling/importance_sampling_ratio/max": 1.206260323524475, "sampling/importance_sampling_ratio/mean": 0.9147454500198364, "sampling/importance_sampling_ratio/min": 8.564687050238717e-06, "sampling/sampling_logp_difference/max": 3.852311134338379, "sampling/sampling_logp_difference/mean": 0.09402333945035934, "step": 1427, "step_time": 4.095367683999939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45690221013501287, "epoch": 0.01428, "grad_norm": 0.0031503040809184313, "kl": 0.38048670068383217, "learning_rate": 9.99910328547625e-06, "loss": 0.0013, "step": 1428, "step_time": 1.9623442129959585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 188.59375, "completions/mean_terminated_length": 188.59375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5329195726662874, "epoch": 0.01429, "frac_reward_zero_std": 0.25, "grad_norm": 1.530047059059143, "kl": 0.509904719889164, "learning_rate": 9.999101996681975e-06, "loss": -0.0785, "num_tokens": 12757231.0, "reward": 0.46703362464904785, "reward_std": 0.04834998399019241, "rewards/rollout_reward_func/mean": 0.46703362464904785, "rewards/rollout_reward_func/std": 0.1761235147714615, "sampling/importance_sampling_ratio/max": 1.0983325242996216, "sampling/importance_sampling_ratio/mean": 0.9411941170692444, "sampling/importance_sampling_ratio/min": 2.7931062504649162e-05, "sampling/sampling_logp_difference/max": 2.483325242996216, "sampling/sampling_logp_difference/mean": 0.08912897855043411, "step": 1429, "step_time": 4.169915946004039 }, { "clip_ratio/high_max": 0.0610119067132473, "clip_ratio/high_mean": 0.03050595335662365, "clip_ratio/low_mean": 0.05357143096625805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0840773843228817, "entropy": 0.6837100302800536, "epoch": 0.0143, "grad_norm": 0.031311627477407455, "kl": 0.5618726946413517, "learning_rate": 9.99910070696232e-06, "loss": -0.0826, "step": 1430, "step_time": 2.4824683729893877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020833333488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 1.1925504207611084, "epoch": 0.01431, "frac_reward_zero_std": 0.0, "grad_norm": 0.5035630464553833, "kl": 0.8684304878115654, "learning_rate": 9.999099416317291e-06, "loss": -0.0851, "num_tokens": 12774375.0, "reward": 0.5988302826881409, "reward_std": 0.19612818956375122, "rewards/rollout_reward_func/mean": 0.5988302826881409, "rewards/rollout_reward_func/std": 0.4225568473339081, "sampling/importance_sampling_ratio/max": 1.2320013046264648, "sampling/importance_sampling_ratio/mean": 0.6790039539337158, "sampling/importance_sampling_ratio/min": 0.00012901479203719646, "sampling/sampling_logp_difference/max": 3.284075975418091, "sampling/sampling_logp_difference/mean": 0.2219659686088562, "step": 1431, "step_time": 4.24316518199339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0825892873108387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0825892873108387, "entropy": 1.3500014580786228, "epoch": 0.01432, "grad_norm": 0.46658560633659363, "kl": 0.878785714507103, "learning_rate": 9.999098124746882e-06, "loss": -0.0866, "step": 1432, "step_time": 2.0198185529952752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 176.1290283203125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5793253798037767, "epoch": 0.01433, "frac_reward_zero_std": 0.5, "grad_norm": 0.6682931780815125, "kl": 0.3823203556239605, "learning_rate": 9.999096832251097e-06, "loss": 0.0008, "num_tokens": 12790779.0, "reward": 0.6720528602600098, "reward_std": 0.042385611683130264, "rewards/rollout_reward_func/mean": 0.6720528602600098, "rewards/rollout_reward_func/std": 0.32263171672821045, "sampling/importance_sampling_ratio/max": 1.2859340906143188, "sampling/importance_sampling_ratio/mean": 1.0100922584533691, "sampling/importance_sampling_ratio/min": 3.2450742107986054e-17, "sampling/sampling_logp_difference/max": 3.1354382038116455, "sampling/sampling_logp_difference/mean": 0.2722651958465576, "step": 1433, "step_time": 4.27167452100548 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.11221590917557478, "entropy": 0.8109822468832135, "epoch": 0.01434, "grad_norm": 0.32750383019447327, "kl": 0.353719849139452, "learning_rate": 9.999095538829937e-06, "loss": -0.0029, "step": 1434, "step_time": 2.4686313190031797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 126.03125, "completions/mean_terminated_length": 126.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7487560380250216, "epoch": 0.01435, "frac_reward_zero_std": 0.5, "grad_norm": 0.04601634293794632, "kl": 0.8583520315587521, "learning_rate": 9.999094244483401e-06, "loss": -0.0416, "num_tokens": 12805660.0, "reward": 0.3427884876728058, "reward_std": 0.0431075282394886, "rewards/rollout_reward_func/mean": 0.3427884876728058, "rewards/rollout_reward_func/std": 0.08704908937215805, "sampling/importance_sampling_ratio/max": 1.0332168340682983, "sampling/importance_sampling_ratio/mean": 0.9089992642402649, "sampling/importance_sampling_ratio/min": 1.4895150890481546e-08, "sampling/sampling_logp_difference/max": 3.371687889099121, "sampling/sampling_logp_difference/mean": 0.19355851411819458, "step": 1435, "step_time": 4.03239951901196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.7519718860276043, "epoch": 0.01436, "grad_norm": 0.04840616136789322, "kl": 0.8841870874166489, "learning_rate": 9.999092949211486e-06, "loss": -0.0416, "step": 1436, "step_time": 2.4719685350210057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.034884640481323004, "epoch": 0.01437, "frac_reward_zero_std": 1.0, "grad_norm": 0.00027636540471576154, "kl": 0.423201322555542, "learning_rate": 9.999091653014199e-06, "loss": 0.0013, "num_tokens": 12821148.0, "reward": 0.7765384912490845, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7765384912490845, "rewards/rollout_reward_func/std": 0.37002626061439514, "sampling/importance_sampling_ratio/max": 0.9989114999771118, "sampling/importance_sampling_ratio/mean": 0.9961363673210144, "sampling/importance_sampling_ratio/min": 0.9932559728622437, "sampling/sampling_logp_difference/max": 0.005699235014617443, "sampling/sampling_logp_difference/mean": 0.0008020015666261315, "step": 1437, "step_time": 3.868452583992621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.033692465629428625, "epoch": 0.01438, "grad_norm": 0.00028127917903475463, "kl": 0.4231500066816807, "learning_rate": 9.999090355891535e-06, "loss": 0.0013, "step": 1438, "step_time": 1.95594019699638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2670126724988222, "epoch": 0.01439, "frac_reward_zero_std": 0.75, "grad_norm": 0.01974465139210224, "kl": 0.49196160584688187, "learning_rate": 9.999089057843496e-06, "loss": -0.0175, "num_tokens": 12837288.0, "reward": 0.5408365726470947, "reward_std": 0.03603525087237358, "rewards/rollout_reward_func/mean": 0.5408365726470947, "rewards/rollout_reward_func/std": 0.2412474900484085, "sampling/importance_sampling_ratio/max": 1.000999927520752, "sampling/importance_sampling_ratio/mean": 0.9665580987930298, "sampling/importance_sampling_ratio/min": 1.5268467905116268e-05, "sampling/sampling_logp_difference/max": 3.219585418701172, "sampling/sampling_logp_difference/mean": 0.06886475533246994, "step": 1439, "step_time": 4.185992890998023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2665444356389344, "epoch": 0.0144, "grad_norm": 0.01864847168326378, "kl": 0.48992592841386795, "learning_rate": 9.999087758870084e-06, "loss": -0.0175, "step": 1440, "step_time": 2.4778001710001263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 127.34375, "completions/mean_terminated_length": 127.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.7493433654308319, "epoch": 0.01441, "frac_reward_zero_std": 0.25, "grad_norm": 0.02362591214478016, "kl": 0.7101966738700867, "learning_rate": 9.999086458971296e-06, "loss": -0.0651, "num_tokens": 12852075.0, "reward": 0.902596116065979, "reward_std": 0.060104068368673325, "rewards/rollout_reward_func/mean": 0.902596116065979, "rewards/rollout_reward_func/std": 0.13694700598716736, "sampling/importance_sampling_ratio/max": 1.00190269947052, "sampling/importance_sampling_ratio/mean": 0.9042240381240845, "sampling/importance_sampling_ratio/min": 2.744205858107307e-06, "sampling/sampling_logp_difference/max": 2.71690034866333, "sampling/sampling_logp_difference/mean": 0.19495533406734467, "step": 1441, "step_time": 4.334574527994846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7453758222982287, "epoch": 0.01442, "grad_norm": 0.0225579384714365, "kl": 0.6949146613478661, "learning_rate": 9.999085158147136e-06, "loss": -0.0652, "step": 1442, "step_time": 1.9512729749985738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.03284505009651184, "epoch": 0.01443, "frac_reward_zero_std": 1.0, "grad_norm": 0.00036935339448973536, "kl": 0.4064505062997341, "learning_rate": 9.999083856397602e-06, "loss": 0.0018, "num_tokens": 12870451.0, "reward": 0.7577307820320129, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7577307820320129, "rewards/rollout_reward_func/std": 0.35848817229270935, "sampling/importance_sampling_ratio/max": 0.9987547397613525, "sampling/importance_sampling_ratio/mean": 0.9962253570556641, "sampling/importance_sampling_ratio/min": 0.9935523867607117, "sampling/sampling_logp_difference/max": 0.002621948719024658, "sampling/sampling_logp_difference/mean": 0.0006876270053908229, "step": 1443, "step_time": 4.389099287989666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03271764283999801, "epoch": 0.01444, "grad_norm": 0.0003780661791097373, "kl": 0.4064372517168522, "learning_rate": 9.999082553722695e-06, "loss": 0.0018, "step": 1444, "step_time": 1.9953408160072286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.46875, "completions/mean_terminated_length": 188.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.31995785888284445, "epoch": 0.01445, "frac_reward_zero_std": 0.75, "grad_norm": 0.007219242863357067, "kl": 0.4404812827706337, "learning_rate": 9.999081250122412e-06, "loss": -0.0368, "num_tokens": 12887162.0, "reward": 0.613081693649292, "reward_std": 0.03675596043467522, "rewards/rollout_reward_func/mean": 0.613081693649292, "rewards/rollout_reward_func/std": 0.45632296800613403, "sampling/importance_sampling_ratio/max": 0.9985660314559937, "sampling/importance_sampling_ratio/mean": 0.964999794960022, "sampling/importance_sampling_ratio/min": 6.643686447205255e-06, "sampling/sampling_logp_difference/max": 2.5744850635528564, "sampling/sampling_logp_difference/mean": 0.05501359701156616, "step": 1445, "step_time": 4.318930267989344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3137852228246629, "epoch": 0.01446, "grad_norm": 0.005887627601623535, "kl": 0.4305531606078148, "learning_rate": 9.99907994559676e-06, "loss": -0.0368, "step": 1446, "step_time": 2.4351066400049604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.47427322529256344, "epoch": 0.01447, "frac_reward_zero_std": 0.5, "grad_norm": 0.02528318390250206, "kl": 0.663519125431776, "learning_rate": 9.999078640145735e-06, "loss": -0.0336, "num_tokens": 12901866.0, "reward": 0.8158653974533081, "reward_std": 0.04215443506836891, "rewards/rollout_reward_func/mean": 0.8158653974533081, "rewards/rollout_reward_func/std": 0.19935892522335052, "sampling/importance_sampling_ratio/max": 1.0003657341003418, "sampling/importance_sampling_ratio/mean": 0.9388740062713623, "sampling/importance_sampling_ratio/min": 1.1269632977928268e-06, "sampling/sampling_logp_difference/max": 2.585141181945801, "sampling/sampling_logp_difference/mean": 0.11532698571681976, "step": 1447, "step_time": 4.317144710003049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4710465036332607, "epoch": 0.01448, "grad_norm": 0.025281410664319992, "kl": 0.6606289260089397, "learning_rate": 9.999077333769338e-06, "loss": -0.0336, "step": 1448, "step_time": 1.9559556140084169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.03632424492388964, "epoch": 0.01449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003424327587708831, "kl": 0.4723494425415993, "learning_rate": 9.999076026467569e-06, "loss": 0.0016, "num_tokens": 12918058.0, "reward": 0.6593461632728577, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6593461632728577, "rewards/rollout_reward_func/std": 0.31954821944236755, "sampling/importance_sampling_ratio/max": 1.0011544227600098, "sampling/importance_sampling_ratio/mean": 0.9975689649581909, "sampling/importance_sampling_ratio/min": 0.9938071966171265, "sampling/sampling_logp_difference/max": 0.003589954227209091, "sampling/sampling_logp_difference/mean": 0.0009425103198736906, "step": 1449, "step_time": 4.138698850991204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.036009625531733036, "epoch": 0.0145, "grad_norm": 0.00034140696516260505, "kl": 0.4723789133131504, "learning_rate": 9.999074718240428e-06, "loss": 0.0016, "step": 1450, "step_time": 1.992060869008128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.10566215962171555, "epoch": 0.01451, "frac_reward_zero_std": 0.75, "grad_norm": 0.028716186061501503, "kl": 0.525360494852066, "learning_rate": 9.999073409087918e-06, "loss": -0.015, "num_tokens": 12933618.0, "reward": 0.6125961542129517, "reward_std": 0.01767767407000065, "rewards/rollout_reward_func/mean": 0.6125961542129517, "rewards/rollout_reward_func/std": 0.3383215069770813, "sampling/importance_sampling_ratio/max": 1.0024398565292358, "sampling/importance_sampling_ratio/mean": 0.9712411165237427, "sampling/importance_sampling_ratio/min": 0.1355515569448471, "sampling/sampling_logp_difference/max": 2.060122013092041, "sampling/sampling_logp_difference/mean": 0.014120872132480145, "step": 1451, "step_time": 4.318941186000302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10537441447377205, "epoch": 0.01452, "grad_norm": 0.026154693216085434, "kl": 0.5330046601593494, "learning_rate": 9.999072099010036e-06, "loss": -0.015, "step": 1452, "step_time": 1.9663500289971125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 96.25, "completions/mean_terminated_length": 96.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04663557140156627, "epoch": 0.01453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003816848329734057, "kl": 0.4240148663520813, "learning_rate": 9.999070788006784e-06, "loss": 0.0011, "num_tokens": 12947378.0, "reward": 0.46346157789230347, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.46346157789230347, "rewards/rollout_reward_func/std": 0.08568019419908524, "sampling/importance_sampling_ratio/max": 1.000162959098816, "sampling/importance_sampling_ratio/mean": 0.9970162510871887, "sampling/importance_sampling_ratio/min": 0.9951099157333374, "sampling/sampling_logp_difference/max": 0.0035475492477416992, "sampling/sampling_logp_difference/mean": 0.0009144346695393324, "step": 1453, "step_time": 3.9558988200005842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04637565230950713, "epoch": 0.01454, "grad_norm": 0.0003793835930991918, "kl": 0.42404066026210785, "learning_rate": 9.999069476078162e-06, "loss": 0.0011, "step": 1454, "step_time": 1.883817722016829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 256.71875, "completions/mean_terminated_length": 256.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.17722271638922393, "epoch": 0.01455, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036826664581894875, "kl": 0.47467363253235817, "learning_rate": 9.99906816322417e-06, "loss": -0.0348, "num_tokens": 12966217.0, "reward": 0.8054519295692444, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.8054519295692444, "rewards/rollout_reward_func/std": 0.39624494314193726, "sampling/importance_sampling_ratio/max": 0.9976837038993835, "sampling/importance_sampling_ratio/mean": 0.9637269973754883, "sampling/importance_sampling_ratio/min": 0.006162143312394619, "sampling/sampling_logp_difference/max": 2.431065559387207, "sampling/sampling_logp_difference/mean": 0.021767262369394302, "step": 1455, "step_time": 4.457642037996266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1753232809714973, "epoch": 0.01456, "grad_norm": 0.003744867630302906, "kl": 0.47330526635050774, "learning_rate": 9.99906684944481e-06, "loss": -0.0348, "step": 1456, "step_time": 1.991238156981126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.27239932026714087, "epoch": 0.01457, "frac_reward_zero_std": 0.75, "grad_norm": 0.007662222720682621, "kl": 0.564899705350399, "learning_rate": 9.999065534740078e-06, "loss": -0.0175, "num_tokens": 12981589.0, "reward": 0.550000011920929, "reward_std": 0.010878566652536392, "rewards/rollout_reward_func/mean": 0.550000011920929, "rewards/rollout_reward_func/std": 0.1852346956729889, "sampling/importance_sampling_ratio/max": 1.0013134479522705, "sampling/importance_sampling_ratio/mean": 0.9654769897460938, "sampling/importance_sampling_ratio/min": 0.0007528717978857458, "sampling/sampling_logp_difference/max": 3.3707833290100098, "sampling/sampling_logp_difference/mean": 0.04580879583954811, "step": 1457, "step_time": 3.8556851130051655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2722147195599973, "epoch": 0.01458, "grad_norm": 0.007711052894592285, "kl": 0.5645252764225006, "learning_rate": 9.99906421910998e-06, "loss": -0.0175, "step": 1458, "step_time": 2.4451230259946897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.03560945438221097, "epoch": 0.01459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003692864556796849, "kl": 0.4006831906735897, "learning_rate": 9.999062902554514e-06, "loss": 0.0015, "num_tokens": 12998237.0, "reward": 0.6335769295692444, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6335769295692444, "rewards/rollout_reward_func/std": 0.4213308095932007, "sampling/importance_sampling_ratio/max": 0.9980968236923218, "sampling/importance_sampling_ratio/mean": 0.9961892366409302, "sampling/importance_sampling_ratio/min": 0.9937880635261536, "sampling/sampling_logp_difference/max": 0.004111761227250099, "sampling/sampling_logp_difference/mean": 0.0007770906086079776, "step": 1459, "step_time": 4.493865900993114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.035471666138619184, "epoch": 0.0146, "grad_norm": 0.0003664418763946742, "kl": 0.4007030986249447, "learning_rate": 9.999061585073679e-06, "loss": 0.0015, "step": 1460, "step_time": 1.9364553570048884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.03426953312009573, "epoch": 0.01461, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003244871913921088, "kl": 0.4210783578455448, "learning_rate": 9.999060266667476e-06, "loss": 0.0014, "num_tokens": 13014069.0, "reward": 0.49076923727989197, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.49076923727989197, "rewards/rollout_reward_func/std": 0.13377340137958527, "sampling/importance_sampling_ratio/max": 1.0009602308273315, "sampling/importance_sampling_ratio/mean": 0.9967628717422485, "sampling/importance_sampling_ratio/min": 0.9928758144378662, "sampling/sampling_logp_difference/max": 0.0045323725789785385, "sampling/sampling_logp_difference/mean": 0.0007687989855185151, "step": 1461, "step_time": 3.9531549609964713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03417007578536868, "epoch": 0.01462, "grad_norm": 0.0003203912347089499, "kl": 0.4210946708917618, "learning_rate": 9.999058947335904e-06, "loss": 0.0014, "step": 1462, "step_time": 1.952014077010972 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2754759890958667, "epoch": 0.01463, "frac_reward_zero_std": 0.5, "grad_norm": 1.2490063905715942, "kl": 0.5407709740102291, "learning_rate": 9.999057627078967e-06, "loss": 0.0434, "num_tokens": 13030850.0, "reward": 0.5432692766189575, "reward_std": 0.029916059225797653, "rewards/rollout_reward_func/mean": 0.5432692766189575, "rewards/rollout_reward_func/std": 0.06300213187932968, "sampling/importance_sampling_ratio/max": 0.9982713460922241, "sampling/importance_sampling_ratio/mean": 0.8951413631439209, "sampling/importance_sampling_ratio/min": 0.00935123860836029, "sampling/sampling_logp_difference/max": 1.963290810585022, "sampling/sampling_logp_difference/mean": 0.05746270716190338, "step": 1463, "step_time": 4.779009792000579 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.07916666846722364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09166666865348816, "entropy": 0.49757379014045, "epoch": 0.01464, "grad_norm": 0.6122258305549622, "kl": 1.0483201742172241, "learning_rate": 9.999056305896664e-06, "loss": 0.0438, "step": 1464, "step_time": 1.9992169040051522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 214.09375, "completions/mean_terminated_length": 214.09375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.23009330872446299, "epoch": 0.01465, "frac_reward_zero_std": 1.0, "grad_norm": 0.010449625551700592, "kl": 0.43829382210969925, "learning_rate": 9.999054983788993e-06, "loss": 0.0018, "num_tokens": 13048437.0, "reward": 0.7799230813980103, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7799230813980103, "rewards/rollout_reward_func/std": 0.4342886507511139, "sampling/importance_sampling_ratio/max": 0.9975734353065491, "sampling/importance_sampling_ratio/mean": 0.9423726201057434, "sampling/importance_sampling_ratio/min": 0.0008399540674872696, "sampling/sampling_logp_difference/max": 2.54217529296875, "sampling/sampling_logp_difference/mean": 0.05855460464954376, "step": 1465, "step_time": 4.741516041001887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2356398142874241, "epoch": 0.01466, "grad_norm": 0.020443230867385864, "kl": 0.4395764321088791, "learning_rate": 9.999053660755955e-06, "loss": 0.0018, "step": 1466, "step_time": 1.995338124987029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.03171296534128487, "epoch": 0.01467, "frac_reward_zero_std": 1.0, "grad_norm": 0.00031328675686381757, "kl": 0.4014049544930458, "learning_rate": 9.999052336797553e-06, "loss": 0.0015, "num_tokens": 13065445.0, "reward": 1.000692367553711, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.000692367553711, "rewards/rollout_reward_func/std": 0.15039245784282684, "sampling/importance_sampling_ratio/max": 1.003902792930603, "sampling/importance_sampling_ratio/mean": 0.9988095164299011, "sampling/importance_sampling_ratio/min": 0.9933537244796753, "sampling/sampling_logp_difference/max": 0.006436830386519432, "sampling/sampling_logp_difference/mean": 0.0007785989437252283, "step": 1467, "step_time": 4.31740261799132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03178056003525853, "epoch": 0.01468, "grad_norm": 0.0003078704176004976, "kl": 0.401408351957798, "learning_rate": 9.999051011913786e-06, "loss": 0.0015, "step": 1468, "step_time": 1.9868192370122415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 213.53125, "completions/mean_terminated_length": 213.53125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21400441485457122, "epoch": 0.01469, "frac_reward_zero_std": 0.75, "grad_norm": 0.004790784791111946, "kl": 0.49265389889478683, "learning_rate": 9.999049686104654e-06, "loss": -0.0365, "num_tokens": 13083046.0, "reward": 1.0613750219345093, "reward_std": 0.2183600217103958, "rewards/rollout_reward_func/mean": 1.0613750219345093, "rewards/rollout_reward_func/std": 0.4745481014251709, "sampling/importance_sampling_ratio/max": 1.0007610321044922, "sampling/importance_sampling_ratio/mean": 0.9647814631462097, "sampling/importance_sampling_ratio/min": 0.00492184329777956, "sampling/sampling_logp_difference/max": 2.058300733566284, "sampling/sampling_logp_difference/mean": 0.026510007679462433, "step": 1469, "step_time": 4.894998710005893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21355187986046076, "epoch": 0.0147, "grad_norm": 0.0047267465852200985, "kl": 0.49017753079533577, "learning_rate": 9.999048359370155e-06, "loss": -0.0365, "step": 1470, "step_time": 2.4809978079938446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.19873628625646234, "epoch": 0.01471, "frac_reward_zero_std": 0.75, "grad_norm": 0.005009328480809927, "kl": 0.4637329615652561, "learning_rate": 9.999047031710293e-06, "loss": -0.0269, "num_tokens": 13099118.0, "reward": 0.5534615516662598, "reward_std": 0.010878566652536392, "rewards/rollout_reward_func/mean": 0.5534615516662598, "rewards/rollout_reward_func/std": 0.207678884267807, "sampling/importance_sampling_ratio/max": 0.9984996318817139, "sampling/importance_sampling_ratio/mean": 0.9653869867324829, "sampling/importance_sampling_ratio/min": 0.018242623656988144, "sampling/sampling_logp_difference/max": 1.8443045616149902, "sampling/sampling_logp_difference/mean": 0.028209330514073372, "step": 1471, "step_time": 4.020297748007579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19913105573505163, "epoch": 0.01472, "grad_norm": 0.004952732007950544, "kl": 0.4640561118721962, "learning_rate": 9.999045703125068e-06, "loss": -0.0269, "step": 1472, "step_time": 1.9082639930129517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.030956288799643517, "epoch": 0.01473, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035211691283620894, "kl": 0.3894665949046612, "learning_rate": 9.999044373614477e-06, "loss": 0.0016, "num_tokens": 13116926.0, "reward": 1.1181154251098633, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1181154251098633, "rewards/rollout_reward_func/std": 0.1870335042476654, "sampling/importance_sampling_ratio/max": 1.0006734132766724, "sampling/importance_sampling_ratio/mean": 0.996059775352478, "sampling/importance_sampling_ratio/min": 0.9925655126571655, "sampling/sampling_logp_difference/max": 0.0051439255475997925, "sampling/sampling_logp_difference/mean": 0.0008684266358613968, "step": 1473, "step_time": 4.472783734003315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.031022011302411556, "epoch": 0.01474, "grad_norm": 0.0003525132779031992, "kl": 0.3894467242062092, "learning_rate": 9.999043043178524e-06, "loss": 0.0016, "step": 1474, "step_time": 2.03258436001488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.40625, "completions/mean_terminated_length": 170.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4844174147583544, "epoch": 0.01475, "frac_reward_zero_std": 0.5, "grad_norm": 0.023725025355815887, "kl": 0.6383856385946274, "learning_rate": 9.999041711817206e-06, "loss": -0.0555, "num_tokens": 13133091.0, "reward": 0.8873125314712524, "reward_std": 0.09550020098686218, "rewards/rollout_reward_func/mean": 0.8873125314712524, "rewards/rollout_reward_func/std": 0.4179803431034088, "sampling/importance_sampling_ratio/max": 1.0011016130447388, "sampling/importance_sampling_ratio/mean": 0.9350712895393372, "sampling/importance_sampling_ratio/min": 0.0012326466385275126, "sampling/sampling_logp_difference/max": 2.030029535293579, "sampling/sampling_logp_difference/mean": 0.07690788060426712, "step": 1475, "step_time": 4.836851991000003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.47839117981493473, "epoch": 0.01476, "grad_norm": 0.022543976083397865, "kl": 0.632412564009428, "learning_rate": 9.999040379530528e-06, "loss": -0.0555, "step": 1476, "step_time": 2.4851487960186205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.042322677094489336, "epoch": 0.01477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008407555287703872, "kl": 0.4414873793721199, "learning_rate": 9.999039046318486e-06, "loss": 0.0015, "num_tokens": 13149323.0, "reward": 0.697692334651947, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.697692334651947, "rewards/rollout_reward_func/std": 0.2616102695465088, "sampling/importance_sampling_ratio/max": 0.9991531372070312, "sampling/importance_sampling_ratio/mean": 0.9955735802650452, "sampling/importance_sampling_ratio/min": 0.9927474856376648, "sampling/sampling_logp_difference/max": 0.005779219791293144, "sampling/sampling_logp_difference/mean": 0.0010225970763713121, "step": 1477, "step_time": 4.044970089991693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04323956277221441, "epoch": 0.01478, "grad_norm": 0.0009750516037456691, "kl": 0.4412701725959778, "learning_rate": 9.99903771218108e-06, "loss": 0.0015, "step": 1478, "step_time": 1.9766245570135652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.052392981480807066, "epoch": 0.01479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005932127824053168, "kl": 0.5245187357068062, "learning_rate": 9.999036377118315e-06, "loss": 0.0016, "num_tokens": 13164923.0, "reward": 0.4411538541316986, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.4411538541316986, "rewards/rollout_reward_func/std": 0.057890646159648895, "sampling/importance_sampling_ratio/max": 1.0086663961410522, "sampling/importance_sampling_ratio/mean": 1.0016720294952393, "sampling/importance_sampling_ratio/min": 0.9912366271018982, "sampling/sampling_logp_difference/max": 0.009177852421998978, "sampling/sampling_logp_difference/mean": 0.0019039309117943048, "step": 1479, "step_time": 3.922854203003226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05332421092316508, "epoch": 0.0148, "grad_norm": 0.0005939750699326396, "kl": 0.52434291690588, "learning_rate": 9.999035041130188e-06, "loss": 0.0016, "step": 1480, "step_time": 1.9089614619879285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.048807332292199135, "epoch": 0.01481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004479810013435781, "kl": 0.4339512325823307, "learning_rate": 9.9990337042167e-06, "loss": 0.0013, "num_tokens": 13180395.0, "reward": 0.7084615230560303, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7084615230560303, "rewards/rollout_reward_func/std": 0.36914223432540894, "sampling/importance_sampling_ratio/max": 1.009404182434082, "sampling/importance_sampling_ratio/mean": 0.9984573125839233, "sampling/importance_sampling_ratio/min": 0.9933847784996033, "sampling/sampling_logp_difference/max": 0.009406723082065582, "sampling/sampling_logp_difference/mean": 0.0011374971363693476, "step": 1481, "step_time": 4.450332270003855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05004089651629329, "epoch": 0.01482, "grad_norm": 0.00046534463763237, "kl": 0.4337315559387207, "learning_rate": 9.99903236637785e-06, "loss": 0.0013, "step": 1482, "step_time": 2.406914049999614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 217.53125, "completions/mean_terminated_length": 217.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.225834378041327, "epoch": 0.01483, "frac_reward_zero_std": 0.75, "grad_norm": 0.0023222004529088736, "kl": 0.4080544449388981, "learning_rate": 9.99903102761364e-06, "loss": -0.0272, "num_tokens": 13198180.0, "reward": 0.9060961604118347, "reward_std": 0.02474873699247837, "rewards/rollout_reward_func/mean": 0.9060961604118347, "rewards/rollout_reward_func/std": 0.270303875207901, "sampling/importance_sampling_ratio/max": 1.0026822090148926, "sampling/importance_sampling_ratio/mean": 0.9651307463645935, "sampling/importance_sampling_ratio/min": 0.0006796069792471826, "sampling/sampling_logp_difference/max": 1.9717960357666016, "sampling/sampling_logp_difference/mean": 0.03396822139620781, "step": 1483, "step_time": 4.466339580008935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2269827639684081, "epoch": 0.01484, "grad_norm": 0.002870851429179311, "kl": 0.41339169442653656, "learning_rate": 9.99902968792407e-06, "loss": -0.0271, "step": 1484, "step_time": 2.01453140299418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 164.78125, "completions/mean_terminated_length": 164.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.19524396816268563, "epoch": 0.01485, "frac_reward_zero_std": 0.75, "grad_norm": 0.006825774442404509, "kl": 0.4628205820918083, "learning_rate": 9.99902834730914e-06, "loss": -0.0267, "num_tokens": 13214277.0, "reward": 0.6140384674072266, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.6140384674072266, "rewards/rollout_reward_func/std": 0.2842106223106384, "sampling/importance_sampling_ratio/max": 1.0031324625015259, "sampling/importance_sampling_ratio/mean": 0.9660657048225403, "sampling/importance_sampling_ratio/min": 0.027261583134531975, "sampling/sampling_logp_difference/max": 1.9631059169769287, "sampling/sampling_logp_difference/mean": 0.021575896069407463, "step": 1485, "step_time": 4.03127634300472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1943874298594892, "epoch": 0.01486, "grad_norm": 0.007110233884304762, "kl": 0.46227898076176643, "learning_rate": 9.999027005768851e-06, "loss": -0.0267, "step": 1486, "step_time": 1.9914748700030032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.04406223492696881, "epoch": 0.01487, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042531933286227286, "kl": 0.4109755642712116, "learning_rate": 9.9990256633032e-06, "loss": 0.0014, "num_tokens": 13230597.0, "reward": 0.8376153707504272, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8376153707504272, "rewards/rollout_reward_func/std": 0.4052523970603943, "sampling/importance_sampling_ratio/max": 1.002737045288086, "sampling/importance_sampling_ratio/mean": 0.9957503080368042, "sampling/importance_sampling_ratio/min": 0.9924904704093933, "sampling/sampling_logp_difference/max": 0.005506234243512154, "sampling/sampling_logp_difference/mean": 0.0010833158157765865, "step": 1487, "step_time": 4.804987227995298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04431256791576743, "epoch": 0.01488, "grad_norm": 0.00042616340215317905, "kl": 0.41091468930244446, "learning_rate": 9.999024319912194e-06, "loss": 0.0014, "step": 1488, "step_time": 2.4854632369970204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 195.0625, "completions/mean_terminated_length": 195.0625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.126220581587404, "epoch": 0.01489, "frac_reward_zero_std": 0.5, "grad_norm": 1.6927675008773804, "kl": 0.6044497936964035, "learning_rate": 9.999022975595829e-06, "loss": -0.0366, "num_tokens": 13247663.0, "reward": 0.742682695388794, "reward_std": 0.2198830097913742, "rewards/rollout_reward_func/mean": 0.742682695388794, "rewards/rollout_reward_func/std": 0.4860338270664215, "sampling/importance_sampling_ratio/max": 1.1336982250213623, "sampling/importance_sampling_ratio/mean": 0.9754631519317627, "sampling/importance_sampling_ratio/min": 0.19499725103378296, "sampling/sampling_logp_difference/max": 1.56856369972229, "sampling/sampling_logp_difference/mean": 0.012124236673116684, "step": 1489, "step_time": 4.556207993009593 }, { "clip_ratio/high_max": 0.042410715483129025, "clip_ratio/high_mean": 0.021205357741564512, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029017857741564512, "entropy": 0.13430478749796748, "epoch": 0.0149, "grad_norm": 0.03142426535487175, "kl": 0.645995881408453, "learning_rate": 9.999021630354104e-06, "loss": -0.0398, "step": 1490, "step_time": 2.033581099996809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 208.96875, "completions/mean_terminated_length": 208.96875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2496461640112102, "epoch": 0.01491, "frac_reward_zero_std": 0.75, "grad_norm": 0.3262941241264343, "kl": 0.4130001589655876, "learning_rate": 9.999020284187021e-06, "loss": -0.0432, "num_tokens": 13265086.0, "reward": 0.8878461718559265, "reward_std": 0.035082265734672546, "rewards/rollout_reward_func/mean": 0.8878461718559265, "rewards/rollout_reward_func/std": 0.3950344920158386, "sampling/importance_sampling_ratio/max": 0.9995907545089722, "sampling/importance_sampling_ratio/mean": 0.9374605417251587, "sampling/importance_sampling_ratio/min": 0.005054513458162546, "sampling/sampling_logp_difference/max": 1.5934034585952759, "sampling/sampling_logp_difference/mean": 0.03282732516527176, "step": 1491, "step_time": 4.406990522002161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.25199539540335536, "epoch": 0.01492, "grad_norm": 0.034533482044935226, "kl": 0.4777846373617649, "learning_rate": 9.999018937094582e-06, "loss": -0.0435, "step": 1492, "step_time": 2.037831431996892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 132.59375, "completions/mean_terminated_length": 132.59375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5056877820752561, "epoch": 0.01493, "frac_reward_zero_std": 0.5, "grad_norm": 0.011225725524127483, "kl": 0.5117140971124172, "learning_rate": 9.999017589076787e-06, "loss": -0.0069, "num_tokens": 13280097.0, "reward": 0.4712499976158142, "reward_std": 0.05223339423537254, "rewards/rollout_reward_func/mean": 0.4712499976158142, "rewards/rollout_reward_func/std": 0.20332396030426025, "sampling/importance_sampling_ratio/max": 1.0048257112503052, "sampling/importance_sampling_ratio/mean": 0.907925009727478, "sampling/importance_sampling_ratio/min": 0.02337471954524517, "sampling/sampling_logp_difference/max": 1.741882085800171, "sampling/sampling_logp_difference/mean": 0.0614749938249588, "step": 1493, "step_time": 4.468627106994973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5077706254087389, "epoch": 0.01494, "grad_norm": 0.013162810355424881, "kl": 0.5313075743615627, "learning_rate": 9.999016240133633e-06, "loss": -0.0068, "step": 1494, "step_time": 2.3972374229924753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 118.25, "completions/mean_terminated_length": 118.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.045604308135807514, "epoch": 0.01495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004447145911399275, "kl": 0.43596065044403076, "learning_rate": 9.999014890265121e-06, "loss": 0.0012, "num_tokens": 13294681.0, "reward": 0.6657692193984985, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6657692193984985, "rewards/rollout_reward_func/std": 0.1987888514995575, "sampling/importance_sampling_ratio/max": 1.0024850368499756, "sampling/importance_sampling_ratio/mean": 0.9987800717353821, "sampling/importance_sampling_ratio/min": 0.9946309328079224, "sampling/sampling_logp_difference/max": 0.00444460567086935, "sampling/sampling_logp_difference/mean": 0.0011352329747751355, "step": 1495, "step_time": 3.8680636120116105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0470225652679801, "epoch": 0.01496, "grad_norm": 0.0005545375752262771, "kl": 0.4356926195323467, "learning_rate": 9.999013539471255e-06, "loss": 0.0012, "step": 1496, "step_time": 1.9904036770021776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 144.5625, "completions/mean_terminated_length": 144.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5123354918323457, "epoch": 0.01497, "frac_reward_zero_std": 0.5, "grad_norm": 0.1408618539571762, "kl": 0.46732454374432564, "learning_rate": 9.999012187752034e-06, "loss": 0.0005, "num_tokens": 13309987.0, "reward": 0.615336537361145, "reward_std": 0.04011470824480057, "rewards/rollout_reward_func/mean": 0.615336537361145, "rewards/rollout_reward_func/std": 0.39002907276153564, "sampling/importance_sampling_ratio/max": 1.0115329027175903, "sampling/importance_sampling_ratio/mean": 0.9337700605392456, "sampling/importance_sampling_ratio/min": 0.0002596183039713651, "sampling/sampling_logp_difference/max": 2.6809260845184326, "sampling/sampling_logp_difference/mean": 0.08099885284900665, "step": 1497, "step_time": 4.1394601939900895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5061758598312736, "epoch": 0.01498, "grad_norm": 0.1515267789363861, "kl": 0.47348450869321823, "learning_rate": 9.999010835107456e-06, "loss": 0.0005, "step": 1498, "step_time": 1.9767362860002322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21431544376537204, "epoch": 0.01499, "frac_reward_zero_std": 0.75, "grad_norm": 0.005391112994402647, "kl": 0.525151789188385, "learning_rate": 9.999009481537522e-06, "loss": -0.0263, "num_tokens": 13326212.0, "reward": 0.7857692241668701, "reward_std": 0.0016317844856530428, "rewards/rollout_reward_func/mean": 0.7857692241668701, "rewards/rollout_reward_func/std": 0.24240699410438538, "sampling/importance_sampling_ratio/max": 1.0054904222488403, "sampling/importance_sampling_ratio/mean": 0.9665868878364563, "sampling/importance_sampling_ratio/min": 0.013050539419054985, "sampling/sampling_logp_difference/max": 1.6853224039077759, "sampling/sampling_logp_difference/mean": 0.025247104465961456, "step": 1499, "step_time": 4.627122689002135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2099783062003553, "epoch": 0.015, "grad_norm": 0.005126310046762228, "kl": 0.5249020829796791, "learning_rate": 9.999008127042235e-06, "loss": -0.0263, "step": 1500, "step_time": 2.450365590004367 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.256867122836411, "epoch": 0.01501, "frac_reward_zero_std": 0.75, "grad_norm": 0.4230978786945343, "kl": 0.4415591433644295, "learning_rate": 9.999006771621594e-06, "loss": -0.0321, "num_tokens": 13343408.0, "reward": 0.5937981009483337, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.5937981009483337, "rewards/rollout_reward_func/std": 0.2921047806739807, "sampling/importance_sampling_ratio/max": 1.482192039489746, "sampling/importance_sampling_ratio/mean": 0.9616769552230835, "sampling/importance_sampling_ratio/min": 0.008892826735973358, "sampling/sampling_logp_difference/max": 1.8289463520050049, "sampling/sampling_logp_difference/mean": 0.033489882946014404, "step": 1501, "step_time": 4.184967622990371 }, { "clip_ratio/high_max": 0.020292208530008793, "clip_ratio/high_mean": 0.010146104265004396, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010146104265004396, "entropy": 0.2705323905684054, "epoch": 0.01502, "grad_norm": 0.6206046938896179, "kl": 0.4874321334064007, "learning_rate": 9.999005415275596e-06, "loss": -0.0297, "step": 1502, "step_time": 1.9877605159999803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 103.8125, "completions/mean_terminated_length": 103.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.44677721429616213, "epoch": 0.01503, "frac_reward_zero_std": 0.5, "grad_norm": 0.027207467705011368, "kl": 0.5345182232558727, "learning_rate": 9.999004058004245e-06, "loss": -0.0359, "num_tokens": 13357586.0, "reward": 0.8336538672447205, "reward_std": 0.017949635162949562, "rewards/rollout_reward_func/mean": 0.8336538672447205, "rewards/rollout_reward_func/std": 0.1373646855354309, "sampling/importance_sampling_ratio/max": 1.0021559000015259, "sampling/importance_sampling_ratio/mean": 0.9364806413650513, "sampling/importance_sampling_ratio/min": 0.0023885113187134266, "sampling/sampling_logp_difference/max": 2.213890314102173, "sampling/sampling_logp_difference/mean": 0.07215800136327744, "step": 1503, "step_time": 3.768236372983665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44515493605285883, "epoch": 0.01504, "grad_norm": 0.03385313227772713, "kl": 0.5184664875268936, "learning_rate": 9.999002699807543e-06, "loss": -0.0359, "step": 1504, "step_time": 1.9975259279963211 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7000049515627325, "epoch": 0.01505, "frac_reward_zero_std": 0.5, "grad_norm": 0.15449441969394684, "kl": 0.45541827753186226, "learning_rate": 9.999001340685484e-06, "loss": 0.0091, "num_tokens": 13373962.0, "reward": 0.2567307651042938, "reward_std": 0.06877352297306061, "rewards/rollout_reward_func/mean": 0.2567307651042938, "rewards/rollout_reward_func/std": 0.16280080378055573, "sampling/importance_sampling_ratio/max": 1.045630693435669, "sampling/importance_sampling_ratio/mean": 0.9066246747970581, "sampling/importance_sampling_ratio/min": 9.07182027388249e-16, "sampling/sampling_logp_difference/max": 3.7803356647491455, "sampling/sampling_logp_difference/mean": 0.19027522206306458, "step": 1505, "step_time": 5.1230534670030465 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.7065695999190211, "epoch": 0.01506, "grad_norm": 0.2766295075416565, "kl": 0.4574240483343601, "learning_rate": 9.998999980638074e-06, "loss": 0.0093, "step": 1506, "step_time": 1.9942663539986825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 166.90625, "completions/mean_terminated_length": 166.90625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.29278680635616183, "epoch": 0.01507, "frac_reward_zero_std": 0.5, "grad_norm": 0.508242130279541, "kl": 0.6639661155641079, "learning_rate": 9.998998619665313e-06, "loss": -0.0458, "num_tokens": 13390127.0, "reward": 0.7680768966674805, "reward_std": 0.03698712959885597, "rewards/rollout_reward_func/mean": 0.7680768966674805, "rewards/rollout_reward_func/std": 0.13064627349376678, "sampling/importance_sampling_ratio/max": 1.0719799995422363, "sampling/importance_sampling_ratio/mean": 0.9302133917808533, "sampling/importance_sampling_ratio/min": 0.03636951372027397, "sampling/sampling_logp_difference/max": 1.9094483852386475, "sampling/sampling_logp_difference/mean": 0.042632848024368286, "step": 1507, "step_time": 4.160276203001558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.29736759793013334, "epoch": 0.01508, "grad_norm": 0.03204701095819473, "kl": 0.6927240267395973, "learning_rate": 9.998997257767197e-06, "loss": -0.047, "step": 1508, "step_time": 2.010605267008941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.11465800460428, "epoch": 0.01509, "frac_reward_zero_std": 0.75, "grad_norm": 0.9752508997917175, "kl": 0.43663957342505455, "learning_rate": 9.998995894943731e-06, "loss": 0.0755, "num_tokens": 13408115.0, "reward": 0.7059711813926697, "reward_std": 0.0020830959547311068, "rewards/rollout_reward_func/mean": 0.7059711813926697, "rewards/rollout_reward_func/std": 0.3948350250720978, "sampling/importance_sampling_ratio/max": 1.6042916774749756, "sampling/importance_sampling_ratio/mean": 1.0459253787994385, "sampling/importance_sampling_ratio/min": 0.11044753342866898, "sampling/sampling_logp_difference/max": 1.4517300128936768, "sampling/sampling_logp_difference/mean": 0.02868260256946087, "step": 1509, "step_time": 4.258067927999946 }, { "clip_ratio/high_max": 0.0357142873108387, "clip_ratio/high_mean": 0.01785714365541935, "clip_ratio/low_mean": 0.07366071455180645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0915178582072258, "entropy": 0.35542831756174564, "epoch": 0.0151, "grad_norm": 0.6608728170394897, "kl": 1.3890470750629902, "learning_rate": 9.998994531194913e-06, "loss": 0.0724, "step": 1510, "step_time": 2.3469988019933226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.49916960624977946, "epoch": 0.01511, "frac_reward_zero_std": 0.5, "grad_norm": 0.011370044201612473, "kl": 0.4746651127934456, "learning_rate": 9.998993166520743e-06, "loss": -0.018, "num_tokens": 13424039.0, "reward": 0.5816875100135803, "reward_std": 0.021118009462952614, "rewards/rollout_reward_func/mean": 0.5816875100135803, "rewards/rollout_reward_func/std": 0.24260446429252625, "sampling/importance_sampling_ratio/max": 1.0000866651535034, "sampling/importance_sampling_ratio/mean": 0.9328384399414062, "sampling/importance_sampling_ratio/min": 0.00017680673045106232, "sampling/sampling_logp_difference/max": 2.669811964035034, "sampling/sampling_logp_difference/mean": 0.07569260150194168, "step": 1511, "step_time": 4.917757174000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.503879617433995, "epoch": 0.01512, "grad_norm": 0.011036157608032227, "kl": 0.47028476372361183, "learning_rate": 9.998991800921223e-06, "loss": -0.018, "step": 1512, "step_time": 1.9743449150046217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 1.0393538121134043, "epoch": 0.01513, "frac_reward_zero_std": 0.5, "grad_norm": 0.0421389602124691, "kl": 0.5995061583817005, "learning_rate": 9.998990434396354e-06, "loss": -0.0767, "num_tokens": 13441639.0, "reward": 0.5757644176483154, "reward_std": 0.07855186611413956, "rewards/rollout_reward_func/mean": 0.5757644176483154, "rewards/rollout_reward_func/std": 0.24481061100959778, "sampling/importance_sampling_ratio/max": 0.9976806044578552, "sampling/importance_sampling_ratio/mean": 0.8660061955451965, "sampling/importance_sampling_ratio/min": 7.427223287250358e-18, "sampling/sampling_logp_difference/max": 4.2212748527526855, "sampling/sampling_logp_difference/mean": 0.3909975588321686, "step": 1513, "step_time": 4.866144440988137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0428576106205583, "epoch": 0.01514, "grad_norm": 0.03325515612959862, "kl": 0.5619751773774624, "learning_rate": 9.998989066946134e-06, "loss": -0.0768, "step": 1514, "step_time": 2.0199960509926314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.6375819677487016, "epoch": 0.01515, "frac_reward_zero_std": 0.25, "grad_norm": 0.06968457251787186, "kl": 0.49934323877096176, "learning_rate": 9.998987698570562e-06, "loss": -0.048, "num_tokens": 13458739.0, "reward": 0.7826442718505859, "reward_std": 0.10688190907239914, "rewards/rollout_reward_func/mean": 0.7826442718505859, "rewards/rollout_reward_func/std": 0.3335241377353668, "sampling/importance_sampling_ratio/max": 1.0041738748550415, "sampling/importance_sampling_ratio/mean": 0.8995534181594849, "sampling/importance_sampling_ratio/min": 4.008954590517533e-08, "sampling/sampling_logp_difference/max": 3.3636600971221924, "sampling/sampling_logp_difference/mean": 0.12328639626502991, "step": 1515, "step_time": 4.725564847998612 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.6256183041259646, "epoch": 0.01516, "grad_norm": 0.04611312970519066, "kl": 0.49363189935684204, "learning_rate": 9.998986329269642e-06, "loss": -0.0482, "step": 1516, "step_time": 2.482707215989649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 155.40625, "completions/mean_terminated_length": 155.40625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9347151834517717, "epoch": 0.01517, "frac_reward_zero_std": 0.25, "grad_norm": 0.013339231722056866, "kl": 0.5261540785431862, "learning_rate": 9.998984959043373e-06, "loss": -0.0269, "num_tokens": 13474536.0, "reward": 0.5115865468978882, "reward_std": 0.061055950820446014, "rewards/rollout_reward_func/mean": 0.5115865468978882, "rewards/rollout_reward_func/std": 0.22478799521923065, "sampling/importance_sampling_ratio/max": 1.0359861850738525, "sampling/importance_sampling_ratio/mean": 0.8829471468925476, "sampling/importance_sampling_ratio/min": 3.7755205031562445e-20, "sampling/sampling_logp_difference/max": 3.668381690979004, "sampling/sampling_logp_difference/mean": 0.3282282054424286, "step": 1517, "step_time": 4.629708728018159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9262435380369425, "epoch": 0.01518, "grad_norm": 0.013433991931378841, "kl": 0.5230069383978844, "learning_rate": 9.998983587891756e-06, "loss": -0.0269, "step": 1518, "step_time": 2.0084633470032713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 170.15625, "completions/mean_terminated_length": 170.15625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7441830448806286, "epoch": 0.01519, "frac_reward_zero_std": 0.25, "grad_norm": 0.11156785488128662, "kl": 0.5589053109288216, "learning_rate": 9.998982215814789e-06, "loss": -0.0031, "num_tokens": 13490925.0, "reward": 0.5372692346572876, "reward_std": 0.07219768315553665, "rewards/rollout_reward_func/mean": 0.5372692346572876, "rewards/rollout_reward_func/std": 0.23586328327655792, "sampling/importance_sampling_ratio/max": 1.0030605792999268, "sampling/importance_sampling_ratio/mean": 0.8775935173034668, "sampling/importance_sampling_ratio/min": 0.0018164085922762752, "sampling/sampling_logp_difference/max": 2.0202529430389404, "sampling/sampling_logp_difference/mean": 0.107724130153656, "step": 1519, "step_time": 4.331386677011324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7373926993459463, "epoch": 0.0152, "grad_norm": 0.15171539783477783, "kl": 0.5309457741677761, "learning_rate": 9.998980842812474e-06, "loss": -0.0035, "step": 1520, "step_time": 2.012425629000063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 188.71875, "completions/mean_terminated_length": 188.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4653576882556081, "epoch": 0.01521, "frac_reward_zero_std": 0.5, "grad_norm": 0.038596250116825104, "kl": 0.4449247121810913, "learning_rate": 9.998979468884813e-06, "loss": -0.0637, "num_tokens": 13507700.0, "reward": 0.802139401435852, "reward_std": 0.19792191684246063, "rewards/rollout_reward_func/mean": 0.802139401435852, "rewards/rollout_reward_func/std": 0.4726480543613434, "sampling/importance_sampling_ratio/max": 1.0326370000839233, "sampling/importance_sampling_ratio/mean": 0.9192080497741699, "sampling/importance_sampling_ratio/min": 0.0008596357074566185, "sampling/sampling_logp_difference/max": 1.9127476215362549, "sampling/sampling_logp_difference/mean": 0.05716026574373245, "step": 1521, "step_time": 4.237887527997373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4630948919802904, "epoch": 0.01522, "grad_norm": 0.03085189498960972, "kl": 0.4419981874525547, "learning_rate": 9.998978094031804e-06, "loss": -0.0636, "step": 1522, "step_time": 2.4842183430009754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.08980499673634768, "epoch": 0.01523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010816229041665792, "kl": 0.39797256514430046, "learning_rate": 9.998976718253446e-06, "loss": 0.0015, "num_tokens": 13524636.0, "reward": 0.8018461465835571, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8018461465835571, "rewards/rollout_reward_func/std": 0.3462260961532593, "sampling/importance_sampling_ratio/max": 0.9922364354133606, "sampling/importance_sampling_ratio/mean": 0.9872236251831055, "sampling/importance_sampling_ratio/min": 0.9817575812339783, "sampling/sampling_logp_difference/max": 0.013884402811527252, "sampling/sampling_logp_difference/mean": 0.0023566442541778088, "step": 1523, "step_time": 4.901693329004047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09141491446644068, "epoch": 0.01524, "grad_norm": 0.001085333526134491, "kl": 0.3977266512811184, "learning_rate": 9.998975341549743e-06, "loss": 0.0015, "step": 1524, "step_time": 2.0224536570021883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.30972985457628965, "epoch": 0.01525, "frac_reward_zero_std": 0.5, "grad_norm": 0.09520737081766129, "kl": 0.4103514850139618, "learning_rate": 9.998973963920692e-06, "loss": -0.0051, "num_tokens": 13540152.0, "reward": 0.7499518990516663, "reward_std": 0.022437043488025665, "rewards/rollout_reward_func/mean": 0.7499518990516663, "rewards/rollout_reward_func/std": 0.33562448620796204, "sampling/importance_sampling_ratio/max": 1.0153731107711792, "sampling/importance_sampling_ratio/mean": 0.9344441294670105, "sampling/importance_sampling_ratio/min": 0.03188497945666313, "sampling/sampling_logp_difference/max": 1.7369965314865112, "sampling/sampling_logp_difference/mean": 0.03847809508442879, "step": 1525, "step_time": 4.067332041995542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3092050999403, "epoch": 0.01526, "grad_norm": 0.08933979272842407, "kl": 0.4092203192412853, "learning_rate": 9.998972585366296e-06, "loss": -0.0052, "step": 1526, "step_time": 2.0241977619880345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5140768261626363, "epoch": 0.01527, "frac_reward_zero_std": 0.25, "grad_norm": 0.033792126923799515, "kl": 0.42023735865950584, "learning_rate": 9.998971205886555e-06, "loss": 0.0206, "num_tokens": 13556051.0, "reward": 0.5516105890274048, "reward_std": 0.03134387731552124, "rewards/rollout_reward_func/mean": 0.5516105890274048, "rewards/rollout_reward_func/std": 0.37036004662513733, "sampling/importance_sampling_ratio/max": 0.9889293909072876, "sampling/importance_sampling_ratio/mean": 0.8961123824119568, "sampling/importance_sampling_ratio/min": 0.03851544111967087, "sampling/sampling_logp_difference/max": 1.6564348936080933, "sampling/sampling_logp_difference/mean": 0.061264533549547195, "step": 1527, "step_time": 4.0152600620058365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5124249598011374, "epoch": 0.01528, "grad_norm": 0.03322732821106911, "kl": 0.41207166016101837, "learning_rate": 9.998969825481467e-06, "loss": 0.0205, "step": 1528, "step_time": 2.3908597150075366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6110865408554673, "epoch": 0.01529, "frac_reward_zero_std": 0.5, "grad_norm": 0.04551335796713829, "kl": 0.38983626663684845, "learning_rate": 9.998968444151034e-06, "loss": -0.0596, "num_tokens": 13572215.0, "reward": 0.7030096054077148, "reward_std": 0.06609482318162918, "rewards/rollout_reward_func/mean": 0.7030096054077148, "rewards/rollout_reward_func/std": 0.427450954914093, "sampling/importance_sampling_ratio/max": 0.9968085289001465, "sampling/importance_sampling_ratio/mean": 0.8990833163261414, "sampling/importance_sampling_ratio/min": 0.0015432885847985744, "sampling/sampling_logp_difference/max": 1.8700300455093384, "sampling/sampling_logp_difference/mean": 0.07071879506111145, "step": 1529, "step_time": 4.684506790996238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6148277213796973, "epoch": 0.0153, "grad_norm": 0.04398846626281738, "kl": 0.39099542796611786, "learning_rate": 9.998967061895257e-06, "loss": -0.0597, "step": 1530, "step_time": 1.9985703849888523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 258.75, "completions/mean_terminated_length": 258.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.8327331393957138, "epoch": 0.01531, "frac_reward_zero_std": 0.25, "grad_norm": 0.03275461494922638, "kl": 0.36866395175457, "learning_rate": 9.998965678714136e-06, "loss": -0.1011, "num_tokens": 13591119.0, "reward": 0.876028835773468, "reward_std": 0.2156403809785843, "rewards/rollout_reward_func/mean": 0.876028835773468, "rewards/rollout_reward_func/std": 0.5375837683677673, "sampling/importance_sampling_ratio/max": 0.9967144131660461, "sampling/importance_sampling_ratio/mean": 0.8935505151748657, "sampling/importance_sampling_ratio/min": 1.6271237184983327e-20, "sampling/sampling_logp_difference/max": 5.396095275878906, "sampling/sampling_logp_difference/mean": 0.22538138926029205, "step": 1531, "step_time": 4.690616343992588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.833609202876687, "epoch": 0.01532, "grad_norm": 0.030908925458788872, "kl": 0.3726438097655773, "learning_rate": 9.99896429460767e-06, "loss": -0.1011, "step": 1532, "step_time": 2.0084815230002278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 173.3125, "completions/mean_terminated_length": 173.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4281484866514802, "epoch": 0.01533, "frac_reward_zero_std": 0.5, "grad_norm": 0.056644804775714874, "kl": 0.47418802231550217, "learning_rate": 9.99896290957586e-06, "loss": -0.0539, "num_tokens": 13607521.0, "reward": 0.7316874861717224, "reward_std": 0.0771426260471344, "rewards/rollout_reward_func/mean": 0.7316874861717224, "rewards/rollout_reward_func/std": 0.3589124083518982, "sampling/importance_sampling_ratio/max": 1.003355860710144, "sampling/importance_sampling_ratio/mean": 0.9353222846984863, "sampling/importance_sampling_ratio/min": 0.0005892830085940659, "sampling/sampling_logp_difference/max": 1.7382748126983643, "sampling/sampling_logp_difference/mean": 0.054085150361061096, "step": 1533, "step_time": 4.529485557002772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43683746457099915, "epoch": 0.01534, "grad_norm": 0.05182123929262161, "kl": 0.47582340985536575, "learning_rate": 9.998961523618709e-06, "loss": -0.054, "step": 1534, "step_time": 2.969468693001545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6234182193875313, "epoch": 0.01535, "frac_reward_zero_std": 0.25, "grad_norm": 0.06693417578935623, "kl": 0.41413383558392525, "learning_rate": 9.998960136736213e-06, "loss": -0.0661, "num_tokens": 13622983.0, "reward": 0.369038462638855, "reward_std": 0.05463584512472153, "rewards/rollout_reward_func/mean": 0.369038462638855, "rewards/rollout_reward_func/std": 0.10245376080274582, "sampling/importance_sampling_ratio/max": 1.0093681812286377, "sampling/importance_sampling_ratio/mean": 0.8790203332901001, "sampling/importance_sampling_ratio/min": 0.002465674187988043, "sampling/sampling_logp_difference/max": 2.0391135215759277, "sampling/sampling_logp_difference/mean": 0.09506374597549438, "step": 1535, "step_time": 4.3234670360106975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.6191084235906601, "epoch": 0.01536, "grad_norm": 0.05996406078338623, "kl": 0.42670419812202454, "learning_rate": 9.998958748928375e-06, "loss": -0.0664, "step": 1536, "step_time": 2.0634713569888845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.21875, "completions/mean_terminated_length": 165.21875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5401640729978681, "epoch": 0.01537, "frac_reward_zero_std": 0.25, "grad_norm": 2.5019311904907227, "kl": 0.7415833547711372, "learning_rate": 9.998957360195193e-06, "loss": -0.0031, "num_tokens": 13639038.0, "reward": 0.8207211494445801, "reward_std": 0.039866551756858826, "rewards/rollout_reward_func/mean": 0.8207211494445801, "rewards/rollout_reward_func/std": 0.20018352568149567, "sampling/importance_sampling_ratio/max": 1.0377823114395142, "sampling/importance_sampling_ratio/mean": 0.8721455335617065, "sampling/importance_sampling_ratio/min": 0.028761694207787514, "sampling/sampling_logp_difference/max": 1.9317295551300049, "sampling/sampling_logp_difference/mean": 0.06975582242012024, "step": 1537, "step_time": 4.165394277013547 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.07083333469927311, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08333333488553762, "entropy": 0.8733360655605793, "epoch": 0.01538, "grad_norm": 0.07968975603580475, "kl": 0.8353341370820999, "learning_rate": 9.998955970536672e-06, "loss": -0.0079, "step": 1538, "step_time": 2.0429537239906495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.6066107163205743, "epoch": 0.01539, "frac_reward_zero_std": 0.5, "grad_norm": 0.011840242892503738, "kl": 0.44638389348983765, "learning_rate": 9.998954579952808e-06, "loss": 0.0107, "num_tokens": 13654578.0, "reward": 0.8374663591384888, "reward_std": 0.020846053957939148, "rewards/rollout_reward_func/mean": 0.8374663591384888, "rewards/rollout_reward_func/std": 0.1694304347038269, "sampling/importance_sampling_ratio/max": 1.0049967765808105, "sampling/importance_sampling_ratio/mean": 0.9329580068588257, "sampling/importance_sampling_ratio/min": 8.351236829184927e-06, "sampling/sampling_logp_difference/max": 3.3221261501312256, "sampling/sampling_logp_difference/mean": 0.11027322709560394, "step": 1539, "step_time": 4.459770761000982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6157896025106311, "epoch": 0.0154, "grad_norm": 0.011982081457972527, "kl": 0.4552307352423668, "learning_rate": 9.998953188443603e-06, "loss": 0.0108, "step": 1540, "step_time": 2.49373014599405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06243497133255005, "epoch": 0.01541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005882985424250364, "kl": 0.39096907898783684, "learning_rate": 9.998951796009056e-06, "loss": 0.0013, "num_tokens": 13670562.0, "reward": 0.570884644985199, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.570884644985199, "rewards/rollout_reward_func/std": 0.3551532030105591, "sampling/importance_sampling_ratio/max": 1.0019363164901733, "sampling/importance_sampling_ratio/mean": 0.9965255260467529, "sampling/importance_sampling_ratio/min": 0.9914100766181946, "sampling/sampling_logp_difference/max": 0.005405532196164131, "sampling/sampling_logp_difference/mean": 0.0010166984284296632, "step": 1541, "step_time": 3.9792153399903327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06116178072988987, "epoch": 0.01542, "grad_norm": 0.0005798141355626285, "kl": 0.3911690078675747, "learning_rate": 9.998950402649168e-06, "loss": 0.0013, "step": 1542, "step_time": 2.0096301270095864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.10828704154118896, "epoch": 0.01543, "frac_reward_zero_std": 0.75, "grad_norm": 0.020175663754343987, "kl": 0.49260706827044487, "learning_rate": 9.998949008363941e-06, "loss": -0.0321, "num_tokens": 13688224.0, "reward": 0.7332451939582825, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.7332451939582825, "rewards/rollout_reward_func/std": 0.3848763704299927, "sampling/importance_sampling_ratio/max": 1.0001345872879028, "sampling/importance_sampling_ratio/mean": 0.96906578540802, "sampling/importance_sampling_ratio/min": 0.15146994590759277, "sampling/sampling_logp_difference/max": 1.94560706615448, "sampling/sampling_logp_difference/mean": 0.009956621564924717, "step": 1543, "step_time": 4.391394490987295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10708396509289742, "epoch": 0.01544, "grad_norm": 0.01759488694369793, "kl": 0.5081860385835171, "learning_rate": 9.998947613153373e-06, "loss": -0.0322, "step": 1544, "step_time": 2.0658634299979894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 198.9375, "completions/mean_terminated_length": 198.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7170860017649829, "epoch": 0.01545, "frac_reward_zero_std": 0.25, "grad_norm": 0.019781114533543587, "kl": 0.6105604991316795, "learning_rate": 9.998946217017467e-06, "loss": -0.0893, "num_tokens": 13705358.0, "reward": 0.6321297883987427, "reward_std": 0.0871860682964325, "rewards/rollout_reward_func/mean": 0.6321297883987427, "rewards/rollout_reward_func/std": 0.29532957077026367, "sampling/importance_sampling_ratio/max": 1.0047396421432495, "sampling/importance_sampling_ratio/mean": 0.8745616674423218, "sampling/importance_sampling_ratio/min": 6.956680590519682e-05, "sampling/sampling_logp_difference/max": 2.0777204036712646, "sampling/sampling_logp_difference/mean": 0.1110767275094986, "step": 1545, "step_time": 5.133136453012412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7240103976801038, "epoch": 0.01546, "grad_norm": 0.021021805703639984, "kl": 0.6159029938280582, "learning_rate": 9.998944819956222e-06, "loss": -0.0893, "step": 1546, "step_time": 2.515314207994379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6282641738653183, "epoch": 0.01547, "frac_reward_zero_std": 0.25, "grad_norm": 0.5710558891296387, "kl": 0.49412751942873, "learning_rate": 9.998943421969635e-06, "loss": -0.0504, "num_tokens": 13722070.0, "reward": 0.5961394309997559, "reward_std": 0.07404398918151855, "rewards/rollout_reward_func/mean": 0.5961394309997559, "rewards/rollout_reward_func/std": 0.21855542063713074, "sampling/importance_sampling_ratio/max": 1.2694348096847534, "sampling/importance_sampling_ratio/mean": 0.8626421689987183, "sampling/importance_sampling_ratio/min": 0.0010156371863558888, "sampling/sampling_logp_difference/max": 2.00109601020813, "sampling/sampling_logp_difference/mean": 0.12410608679056168, "step": 1547, "step_time": 4.272875606984599 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.03645833395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.045386905781924725, "entropy": 0.6358547285199165, "epoch": 0.01548, "grad_norm": 0.196613147854805, "kl": 0.6382065713405609, "learning_rate": 9.998942023057712e-06, "loss": -0.0515, "step": 1548, "step_time": 2.0587536660023034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10666920896619558, "epoch": 0.01549, "frac_reward_zero_std": 1.0, "grad_norm": 0.024557014927268028, "kl": 0.37883594259619713, "learning_rate": 9.99894062322045e-06, "loss": 0.0013, "num_tokens": 13738286.0, "reward": 0.8753076791763306, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8753076791763306, "rewards/rollout_reward_func/std": 0.3813537061214447, "sampling/importance_sampling_ratio/max": 1.7572312355041504, "sampling/importance_sampling_ratio/mean": 0.9566030502319336, "sampling/importance_sampling_ratio/min": 0.5009691715240479, "sampling/sampling_logp_difference/max": 0.6152374744415283, "sampling/sampling_logp_difference/mean": 0.017611442133784294, "step": 1549, "step_time": 4.373806372001127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20959587208926678, "epoch": 0.0155, "grad_norm": 0.01618337631225586, "kl": 0.3637037128210068, "learning_rate": 9.99893922245785e-06, "loss": 0.0012, "step": 1550, "step_time": 2.0281467529930524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 246.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3329214081168175, "epoch": 0.01551, "frac_reward_zero_std": 0.5, "grad_norm": 0.03273750841617584, "kl": 0.5880587361752987, "learning_rate": 9.998937820769913e-06, "loss": -0.0405, "num_tokens": 13756940.0, "reward": 1.0746427774429321, "reward_std": 0.050017617642879486, "rewards/rollout_reward_func/mean": 1.0746427774429321, "rewards/rollout_reward_func/std": 0.3611779510974884, "sampling/importance_sampling_ratio/max": 1.0230814218521118, "sampling/importance_sampling_ratio/mean": 0.9117341041564941, "sampling/importance_sampling_ratio/min": 0.0014772946015000343, "sampling/sampling_logp_difference/max": 2.3765246868133545, "sampling/sampling_logp_difference/mean": 0.0657261312007904, "step": 1551, "step_time": 5.129651415008993 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.32937303744256496, "epoch": 0.01552, "grad_norm": 0.026567529886960983, "kl": 0.5937405824661255, "learning_rate": 9.998936418156639e-06, "loss": -0.0406, "step": 1552, "step_time": 2.5361803710111417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.05218089185655117, "epoch": 0.01553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004021666827611625, "kl": 0.40999264642596245, "learning_rate": 9.998935014618028e-06, "loss": 0.0013, "num_tokens": 13772396.0, "reward": 0.4142307639122009, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.4142307639122009, "rewards/rollout_reward_func/std": 0.10464093089103699, "sampling/importance_sampling_ratio/max": 0.9973535537719727, "sampling/importance_sampling_ratio/mean": 0.9953262209892273, "sampling/importance_sampling_ratio/min": 0.9931840896606445, "sampling/sampling_logp_difference/max": 0.003394392319023609, "sampling/sampling_logp_difference/mean": 0.0010137949138879776, "step": 1553, "step_time": 3.9945653379909345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.051346806809306145, "epoch": 0.01554, "grad_norm": 0.0003913698601536453, "kl": 0.4101179614663124, "learning_rate": 9.99893361015408e-06, "loss": 0.0013, "step": 1554, "step_time": 1.9626885409961687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.045708184130489826, "epoch": 0.01555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005567166954278946, "kl": 0.3694641701877117, "learning_rate": 9.998932204764797e-06, "loss": 0.0016, "num_tokens": 13790588.0, "reward": 0.8248461484909058, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8248461484909058, "rewards/rollout_reward_func/std": 0.44351720809936523, "sampling/importance_sampling_ratio/max": 1.0030944347381592, "sampling/importance_sampling_ratio/mean": 0.9969929456710815, "sampling/importance_sampling_ratio/min": 0.9937255382537842, "sampling/sampling_logp_difference/max": 0.00652734749019146, "sampling/sampling_logp_difference/mean": 0.0008437176584266126, "step": 1555, "step_time": 4.3269627609988675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04479996534064412, "epoch": 0.01556, "grad_norm": 0.0005228728987276554, "kl": 0.36961664631962776, "learning_rate": 9.998930798450177e-06, "loss": 0.0016, "step": 1556, "step_time": 2.486617479029519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.558852762915194, "epoch": 0.01557, "frac_reward_zero_std": 0.5, "grad_norm": 0.05913727730512619, "kl": 0.5337528698146343, "learning_rate": 9.998929391210221e-06, "loss": -0.0524, "num_tokens": 13806714.0, "reward": 0.4710947275161743, "reward_std": 0.045857228338718414, "rewards/rollout_reward_func/mean": 0.4710947275161743, "rewards/rollout_reward_func/std": 0.190700501203537, "sampling/importance_sampling_ratio/max": 1.0171512365341187, "sampling/importance_sampling_ratio/mean": 0.9104386568069458, "sampling/importance_sampling_ratio/min": 0.0008925226866267622, "sampling/sampling_logp_difference/max": 2.0502307415008545, "sampling/sampling_logp_difference/mean": 0.09503330290317535, "step": 1557, "step_time": 4.32093725699815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.5584444347769022, "epoch": 0.01558, "grad_norm": 0.06674555689096451, "kl": 0.5329323187470436, "learning_rate": 9.998927983044931e-06, "loss": -0.0524, "step": 1558, "step_time": 2.5390509529970586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 170.90625, "completions/mean_terminated_length": 170.90625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5759478700347245, "epoch": 0.01559, "frac_reward_zero_std": 0.5, "grad_norm": 0.010226222686469555, "kl": 0.5660379715263844, "learning_rate": 9.998926573954307e-06, "loss": -0.0201, "num_tokens": 13822919.0, "reward": 0.5154327154159546, "reward_std": 0.0349222868680954, "rewards/rollout_reward_func/mean": 0.5154327154159546, "rewards/rollout_reward_func/std": 0.1562264859676361, "sampling/importance_sampling_ratio/max": 0.9994887709617615, "sampling/importance_sampling_ratio/mean": 0.9045015573501587, "sampling/importance_sampling_ratio/min": 0.00014140519488137215, "sampling/sampling_logp_difference/max": 2.3091366291046143, "sampling/sampling_logp_difference/mean": 0.09775589406490326, "step": 1559, "step_time": 4.295275397991645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5736333723179996, "epoch": 0.0156, "grad_norm": 0.010137432254850864, "kl": 0.5660252422094345, "learning_rate": 9.998925163938348e-06, "loss": -0.0201, "step": 1560, "step_time": 2.0561431199894287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5068995309993625, "epoch": 0.01561, "frac_reward_zero_std": 0.75, "grad_norm": 0.007509336341172457, "kl": 0.5161635801196098, "learning_rate": 9.998923752997053e-06, "loss": -0.0222, "num_tokens": 13838643.0, "reward": 0.8039134740829468, "reward_std": 0.026093119755387306, "rewards/rollout_reward_func/mean": 0.8039134740829468, "rewards/rollout_reward_func/std": 0.44770511984825134, "sampling/importance_sampling_ratio/max": 1.0017985105514526, "sampling/importance_sampling_ratio/mean": 0.9366698265075684, "sampling/importance_sampling_ratio/min": 0.0018960211891680956, "sampling/sampling_logp_difference/max": 2.1015117168426514, "sampling/sampling_logp_difference/mean": 0.08635474741458893, "step": 1561, "step_time": 4.392370098001265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5063807545229793, "epoch": 0.01562, "grad_norm": 0.007840310223400593, "kl": 0.5162951201200485, "learning_rate": 9.998922341130428e-06, "loss": -0.0222, "step": 1562, "step_time": 2.505951354003628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2114869225770235, "epoch": 0.01563, "frac_reward_zero_std": 0.75, "grad_norm": 0.0037426534108817577, "kl": 0.37744592130184174, "learning_rate": 9.998920928338467e-06, "loss": -0.0178, "num_tokens": 13854735.0, "reward": 0.6658365726470947, "reward_std": 0.03603525087237358, "rewards/rollout_reward_func/mean": 0.6658365726470947, "rewards/rollout_reward_func/std": 0.4079718589782715, "sampling/importance_sampling_ratio/max": 1.0193930864334106, "sampling/importance_sampling_ratio/mean": 0.9696792960166931, "sampling/importance_sampling_ratio/min": 0.0019937781617045403, "sampling/sampling_logp_difference/max": 2.415492534637451, "sampling/sampling_logp_difference/mean": 0.03824971243739128, "step": 1563, "step_time": 4.960707560989249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21215476794168353, "epoch": 0.01564, "grad_norm": 0.004290574695914984, "kl": 0.37981076911091805, "learning_rate": 9.998919514621175e-06, "loss": -0.0178, "step": 1564, "step_time": 2.0710349989967654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14104454079642892, "epoch": 0.01565, "frac_reward_zero_std": 0.75, "grad_norm": 0.08484257012605667, "kl": 0.4473605155944824, "learning_rate": 9.998918099978547e-06, "loss": 0.019, "num_tokens": 13871769.0, "reward": 0.6303750276565552, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.6303750276565552, "rewards/rollout_reward_func/std": 0.4408899247646332, "sampling/importance_sampling_ratio/max": 1.008009433746338, "sampling/importance_sampling_ratio/mean": 0.9740031957626343, "sampling/importance_sampling_ratio/min": 0.21206222474575043, "sampling/sampling_logp_difference/max": 1.3701812028884888, "sampling/sampling_logp_difference/mean": 0.01361897587776184, "step": 1565, "step_time": 4.663705375998688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1409477712586522, "epoch": 0.01566, "grad_norm": 0.08699534088373184, "kl": 0.4466157555580139, "learning_rate": 9.998916684410589e-06, "loss": 0.019, "step": 1566, "step_time": 2.0100987490077387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.7343848925083876, "epoch": 0.01567, "frac_reward_zero_std": 0.25, "grad_norm": 0.021071525290608406, "kl": 0.49820907041430473, "learning_rate": 9.9989152679173e-06, "loss": -0.0365, "num_tokens": 13889589.0, "reward": 0.6831187605857849, "reward_std": 0.22240275144577026, "rewards/rollout_reward_func/mean": 0.6831187605857849, "rewards/rollout_reward_func/std": 0.4641241431236267, "sampling/importance_sampling_ratio/max": 1.0016874074935913, "sampling/importance_sampling_ratio/mean": 0.9037182331085205, "sampling/importance_sampling_ratio/min": 4.350867754965293e-07, "sampling/sampling_logp_difference/max": 2.1563720703125, "sampling/sampling_logp_difference/mean": 0.14607936143875122, "step": 1567, "step_time": 4.8817067820054945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7355877249501646, "epoch": 0.01568, "grad_norm": 0.02107478305697441, "kl": 0.4968366511166096, "learning_rate": 9.998913850498676e-06, "loss": -0.0365, "step": 1568, "step_time": 2.5308673799881944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.33807325130328536, "epoch": 0.01569, "frac_reward_zero_std": 0.75, "grad_norm": 0.15060804784297943, "kl": 0.4436180926859379, "learning_rate": 9.998912432154724e-06, "loss": 0.0203, "num_tokens": 13906571.0, "reward": 0.6818461418151855, "reward_std": 0.030267195776104927, "rewards/rollout_reward_func/mean": 0.6818461418151855, "rewards/rollout_reward_func/std": 0.19902893900871277, "sampling/importance_sampling_ratio/max": 1.014812707901001, "sampling/importance_sampling_ratio/mean": 0.927857518196106, "sampling/importance_sampling_ratio/min": 0.14760339260101318, "sampling/sampling_logp_difference/max": 1.1480542421340942, "sampling/sampling_logp_difference/mean": 0.03187629580497742, "step": 1569, "step_time": 5.074382582002727 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.32771104481071234, "epoch": 0.0157, "grad_norm": 0.1220095157623291, "kl": 0.4357011690735817, "learning_rate": 9.99891101288544e-06, "loss": 0.0198, "step": 1570, "step_time": 2.022914799999853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 143.4375, "completions/mean_terminated_length": 143.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6867769528180361, "epoch": 0.01571, "frac_reward_zero_std": 0.25, "grad_norm": 0.17422102391719818, "kl": 0.5612519346177578, "learning_rate": 9.998909592690825e-06, "loss": -0.0409, "num_tokens": 13921961.0, "reward": 0.724879801273346, "reward_std": 0.07078159600496292, "rewards/rollout_reward_func/mean": 0.724879801273346, "rewards/rollout_reward_func/std": 0.3020172715187073, "sampling/importance_sampling_ratio/max": 1.0300798416137695, "sampling/importance_sampling_ratio/mean": 0.8917627334594727, "sampling/importance_sampling_ratio/min": 0.004269411321729422, "sampling/sampling_logp_difference/max": 2.1228437423706055, "sampling/sampling_logp_difference/mean": 0.10276544094085693, "step": 1571, "step_time": 3.860932527000841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6836952278390527, "epoch": 0.01572, "grad_norm": 0.21131187677383423, "kl": 0.5495080016553402, "learning_rate": 9.998908171570881e-06, "loss": -0.0403, "step": 1572, "step_time": 2.00588044700271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.71875, "completions/mean_terminated_length": 189.71875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37507462315261364, "epoch": 0.01573, "frac_reward_zero_std": 0.5, "grad_norm": 0.015609572641551495, "kl": 0.4219941310584545, "learning_rate": 9.998906749525606e-06, "loss": 0.0117, "num_tokens": 13938768.0, "reward": 0.6974519491195679, "reward_std": 0.022437039762735367, "rewards/rollout_reward_func/mean": 0.6974519491195679, "rewards/rollout_reward_func/std": 0.27697426080703735, "sampling/importance_sampling_ratio/max": 1.022133469581604, "sampling/importance_sampling_ratio/mean": 0.9416066408157349, "sampling/importance_sampling_ratio/min": 0.0059082540683448315, "sampling/sampling_logp_difference/max": 1.7753002643585205, "sampling/sampling_logp_difference/mean": 0.05563740059733391, "step": 1573, "step_time": 4.330531918996712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37323931977152824, "epoch": 0.01574, "grad_norm": 0.016390835866332054, "kl": 0.41940664127469063, "learning_rate": 9.998905326555002e-06, "loss": 0.0117, "step": 1574, "step_time": 2.55204055300419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 200.53125, "completions/mean_terminated_length": 200.53125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3259367784485221, "epoch": 0.01575, "frac_reward_zero_std": 0.75, "grad_norm": 0.9204230308532715, "kl": 0.43480680510401726, "learning_rate": 9.998903902659068e-06, "loss": -0.0243, "num_tokens": 13955977.0, "reward": 0.6677885055541992, "reward_std": 0.04249677062034607, "rewards/rollout_reward_func/mean": 0.6677885055541992, "rewards/rollout_reward_func/std": 0.22692638635635376, "sampling/importance_sampling_ratio/max": 1.2405807971954346, "sampling/importance_sampling_ratio/mean": 0.9796872735023499, "sampling/importance_sampling_ratio/min": 1.1350459317327477e-05, "sampling/sampling_logp_difference/max": 3.4792349338531494, "sampling/sampling_logp_difference/mean": 0.06561611592769623, "step": 1575, "step_time": 4.821216197997273 }, { "clip_ratio/high_max": 0.050595239736139774, "clip_ratio/high_mean": 0.025297619868069887, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04092261986806989, "entropy": 0.34464165987446904, "epoch": 0.01576, "grad_norm": 0.1008039340376854, "kl": 0.44285188615322113, "learning_rate": 9.998902477837807e-06, "loss": -0.0278, "step": 1576, "step_time": 2.071170246003021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 193.21875, "completions/mean_terminated_length": 193.21875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3721776697784662, "epoch": 0.01577, "frac_reward_zero_std": 0.5, "grad_norm": 0.2077728509902954, "kl": 0.4662017151713371, "learning_rate": 9.998901052091216e-06, "loss": -0.0527, "num_tokens": 13972984.0, "reward": 0.798480749130249, "reward_std": 0.05687759071588516, "rewards/rollout_reward_func/mean": 0.798480749130249, "rewards/rollout_reward_func/std": 0.43109622597694397, "sampling/importance_sampling_ratio/max": 1.0161186456680298, "sampling/importance_sampling_ratio/mean": 0.9148990511894226, "sampling/importance_sampling_ratio/min": 0.023151155561208725, "sampling/sampling_logp_difference/max": 1.9790879487991333, "sampling/sampling_logp_difference/mean": 0.04507989436388016, "step": 1577, "step_time": 4.382777578983223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020833333488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "entropy": 0.40875415597110987, "epoch": 0.01578, "grad_norm": 0.0340898223221302, "kl": 0.4994298778474331, "learning_rate": 9.998899625419296e-06, "loss": -0.0532, "step": 1578, "step_time": 2.063792552005907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 182.5625, "completions/mean_terminated_length": 182.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2486268407665193, "epoch": 0.01579, "frac_reward_zero_std": 0.75, "grad_norm": 0.19293048977851868, "kl": 0.4101002775132656, "learning_rate": 9.99889819782205e-06, "loss": -0.0336, "num_tokens": 13989594.0, "reward": 0.7370384931564331, "reward_std": 0.07022162526845932, "rewards/rollout_reward_func/mean": 0.7370384931564331, "rewards/rollout_reward_func/std": 0.3394930958747864, "sampling/importance_sampling_ratio/max": 1.0302025079727173, "sampling/importance_sampling_ratio/mean": 0.9543687105178833, "sampling/importance_sampling_ratio/min": 0.03017735481262207, "sampling/sampling_logp_difference/max": 2.1004836559295654, "sampling/sampling_logp_difference/mean": 0.02226177603006363, "step": 1579, "step_time": 4.794966782988922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.2543403101153672, "epoch": 0.0158, "grad_norm": 0.09370554238557816, "kl": 0.4102311246097088, "learning_rate": 9.998896769299478e-06, "loss": -0.0341, "step": 1580, "step_time": 2.4952904359888635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2989946249872446, "epoch": 0.01581, "frac_reward_zero_std": 1.0, "grad_norm": 0.005170648917555809, "kl": 0.4368905611336231, "learning_rate": 9.998895339851577e-06, "loss": 0.0016, "num_tokens": 14006503.0, "reward": 0.8913461565971375, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8913461565971375, "rewards/rollout_reward_func/std": 0.34998011589050293, "sampling/importance_sampling_ratio/max": 1.013200044631958, "sampling/importance_sampling_ratio/mean": 0.9641557931900024, "sampling/importance_sampling_ratio/min": 0.00043065997306257486, "sampling/sampling_logp_difference/max": 2.113164186477661, "sampling/sampling_logp_difference/mean": 0.04162170737981796, "step": 1581, "step_time": 4.117762414003664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3001220729202032, "epoch": 0.01582, "grad_norm": 0.005218109581619501, "kl": 0.43724846467375755, "learning_rate": 9.998893909478348e-06, "loss": 0.0016, "step": 1582, "step_time": 2.0212522980073118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2283861180767417, "epoch": 0.01583, "frac_reward_zero_std": 0.75, "grad_norm": 0.003167414804920554, "kl": 0.4541136734187603, "learning_rate": 9.998892478179796e-06, "loss": -0.0271, "num_tokens": 14022589.0, "reward": 0.7581490278244019, "reward_std": 0.01774565875530243, "rewards/rollout_reward_func/mean": 0.7581490278244019, "rewards/rollout_reward_func/std": 0.34736523032188416, "sampling/importance_sampling_ratio/max": 1.0006757974624634, "sampling/importance_sampling_ratio/mean": 0.9647156000137329, "sampling/importance_sampling_ratio/min": 0.012311041355133057, "sampling/sampling_logp_difference/max": 2.3063910007476807, "sampling/sampling_logp_difference/mean": 0.025021569803357124, "step": 1583, "step_time": 4.143400752982416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2298188116401434, "epoch": 0.01584, "grad_norm": 0.003037406364455819, "kl": 0.4547286927700043, "learning_rate": 9.998891045955917e-06, "loss": -0.0271, "step": 1584, "step_time": 2.0304285650054226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7988863931968808, "epoch": 0.01585, "frac_reward_zero_std": 0.25, "grad_norm": 0.10058571398258209, "kl": 0.5240915678441525, "learning_rate": 9.998889612806711e-06, "loss": -0.0636, "num_tokens": 14038811.0, "reward": 0.5598317384719849, "reward_std": 0.10292059928178787, "rewards/rollout_reward_func/mean": 0.5598317384719849, "rewards/rollout_reward_func/std": 0.372687429189682, "sampling/importance_sampling_ratio/max": 1.018478274345398, "sampling/importance_sampling_ratio/mean": 0.8815479278564453, "sampling/importance_sampling_ratio/min": 1.2284727745281998e-05, "sampling/sampling_logp_difference/max": 2.52579402923584, "sampling/sampling_logp_difference/mean": 0.14879825711250305, "step": 1585, "step_time": 4.7597786510086735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8135521151125431, "epoch": 0.01586, "grad_norm": 0.03246071934700012, "kl": 0.5340706333518028, "learning_rate": 9.998888178732179e-06, "loss": -0.064, "step": 1586, "step_time": 2.5324796730055823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.34886465314775705, "epoch": 0.01587, "frac_reward_zero_std": 0.5, "grad_norm": 0.06458047777414322, "kl": 0.47919338196516037, "learning_rate": 9.998886743732323e-06, "loss": -0.0551, "num_tokens": 14055919.0, "reward": 1.004014492034912, "reward_std": 0.0594649538397789, "rewards/rollout_reward_func/mean": 1.004014492034912, "rewards/rollout_reward_func/std": 0.43310418725013733, "sampling/importance_sampling_ratio/max": 1.0024687051773071, "sampling/importance_sampling_ratio/mean": 0.93145751953125, "sampling/importance_sampling_ratio/min": 0.005764362867921591, "sampling/sampling_logp_difference/max": 2.4265289306640625, "sampling/sampling_logp_difference/mean": 0.0562695637345314, "step": 1587, "step_time": 4.172362888013595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3474686094559729, "epoch": 0.01588, "grad_norm": 0.04197601601481438, "kl": 0.4899079129099846, "learning_rate": 9.998885307807144e-06, "loss": -0.0552, "step": 1588, "step_time": 2.0275606169889215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 168.84375, "completions/mean_terminated_length": 168.84375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4934041523374617, "epoch": 0.01589, "frac_reward_zero_std": 0.75, "grad_norm": 0.03668885678052902, "kl": 0.7525640651583672, "learning_rate": 9.99888387095664e-06, "loss": -0.0351, "num_tokens": 14072146.0, "reward": 0.8904423117637634, "reward_std": 0.09136427938938141, "rewards/rollout_reward_func/mean": 0.8904423117637634, "rewards/rollout_reward_func/std": 0.4635046720504761, "sampling/importance_sampling_ratio/max": 1.0017642974853516, "sampling/importance_sampling_ratio/mean": 0.9332424402236938, "sampling/importance_sampling_ratio/min": 0.00019229674944654107, "sampling/sampling_logp_difference/max": 3.264847755432129, "sampling/sampling_logp_difference/mean": 0.09989123046398163, "step": 1589, "step_time": 4.171953804012446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49489424796774983, "epoch": 0.0159, "grad_norm": 0.030839143320918083, "kl": 0.7385220192372799, "learning_rate": 9.99888243318081e-06, "loss": -0.0352, "step": 1590, "step_time": 2.051178178997361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 146.78125, "completions/mean_terminated_length": 146.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.25818710355088115, "epoch": 0.01591, "frac_reward_zero_std": 1.0, "grad_norm": 0.004101758357137442, "kl": 0.47121362388134, "learning_rate": 9.998880994479659e-06, "loss": 0.0014, "num_tokens": 14087699.0, "reward": 0.8335769176483154, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8335769176483154, "rewards/rollout_reward_func/std": 0.1557008922100067, "sampling/importance_sampling_ratio/max": 1.0030605792999268, "sampling/importance_sampling_ratio/mean": 0.9661107659339905, "sampling/importance_sampling_ratio/min": 0.018626375123858452, "sampling/sampling_logp_difference/max": 2.0487990379333496, "sampling/sampling_logp_difference/mean": 0.02895558439195156, "step": 1591, "step_time": 4.377125248000084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25681094639003277, "epoch": 0.01592, "grad_norm": 0.004348252434283495, "kl": 0.47440746426582336, "learning_rate": 9.998879554853185e-06, "loss": 0.0014, "step": 1592, "step_time": 2.4635427569955937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 101.1875, "completions/mean_terminated_length": 101.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26511020585894585, "epoch": 0.01593, "frac_reward_zero_std": 0.75, "grad_norm": 0.012246694415807724, "kl": 0.49000347778201103, "learning_rate": 9.998878114301387e-06, "loss": 0.0201, "num_tokens": 14101761.0, "reward": 0.5685576796531677, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.5685576796531677, "rewards/rollout_reward_func/std": 0.11331170052289963, "sampling/importance_sampling_ratio/max": 1.002394676208496, "sampling/importance_sampling_ratio/mean": 0.9653019309043884, "sampling/importance_sampling_ratio/min": 0.015305398032069206, "sampling/sampling_logp_difference/max": 1.5112437009811401, "sampling/sampling_logp_difference/mean": 0.034756869077682495, "step": 1593, "step_time": 3.754188979997707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2688949010334909, "epoch": 0.01594, "grad_norm": 0.012500430457293987, "kl": 0.49100109189748764, "learning_rate": 9.998876672824267e-06, "loss": 0.0201, "step": 1594, "step_time": 1.9862460940057645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.05989072332158685, "epoch": 0.01595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007735577528364956, "kl": 0.42136670276522636, "learning_rate": 9.998875230421825e-06, "loss": 0.0017, "num_tokens": 14119281.0, "reward": 0.6239615678787231, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6239615678787231, "rewards/rollout_reward_func/std": 0.2620695233345032, "sampling/importance_sampling_ratio/max": 0.9991062879562378, "sampling/importance_sampling_ratio/mean": 0.994260311126709, "sampling/importance_sampling_ratio/min": 0.9890740513801575, "sampling/sampling_logp_difference/max": 0.009920040145516396, "sampling/sampling_logp_difference/mean": 0.0014032658655196428, "step": 1595, "step_time": 4.710463204995904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05945271300151944, "epoch": 0.01596, "grad_norm": 0.0007372022373601794, "kl": 0.4214518591761589, "learning_rate": 9.998873787094061e-06, "loss": 0.0017, "step": 1596, "step_time": 2.0164260780002223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.057262735441327095, "epoch": 0.01597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005889787571504712, "kl": 0.4397938884794712, "learning_rate": 9.998872342840975e-06, "loss": 0.0015, "num_tokens": 14135385.0, "reward": 0.5701153874397278, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5701153874397278, "rewards/rollout_reward_func/std": 0.25193455815315247, "sampling/importance_sampling_ratio/max": 0.9959167838096619, "sampling/importance_sampling_ratio/mean": 0.9922871589660645, "sampling/importance_sampling_ratio/min": 0.9889630675315857, "sampling/sampling_logp_difference/max": 0.007482787594199181, "sampling/sampling_logp_difference/mean": 0.0014204964973032475, "step": 1597, "step_time": 5.325257299999066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.058366827201098204, "epoch": 0.01598, "grad_norm": 0.0006134824361652136, "kl": 0.4395486079156399, "learning_rate": 9.99887089766257e-06, "loss": 0.0015, "step": 1598, "step_time": 2.0454689920079545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 163.78125, "completions/mean_terminated_length": 163.78125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2356561105698347, "epoch": 0.01599, "frac_reward_zero_std": 0.75, "grad_norm": 0.014562271535396576, "kl": 0.47211289033293724, "learning_rate": 9.998869451558841e-06, "loss": 0.03, "num_tokens": 14151450.0, "reward": 0.5773557424545288, "reward_std": 0.015365973114967346, "rewards/rollout_reward_func/mean": 0.5773557424545288, "rewards/rollout_reward_func/std": 0.2584358751773834, "sampling/importance_sampling_ratio/max": 0.9994654059410095, "sampling/importance_sampling_ratio/mean": 0.9620994329452515, "sampling/importance_sampling_ratio/min": 0.017057349905371666, "sampling/sampling_logp_difference/max": 1.4740445613861084, "sampling/sampling_logp_difference/mean": 0.024465268477797508, "step": 1599, "step_time": 4.351640524997492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2351872893050313, "epoch": 0.016, "grad_norm": 0.014627019874751568, "kl": 0.46680518239736557, "learning_rate": 9.998868004529796e-06, "loss": 0.0299, "step": 1600, "step_time": 2.0544905019996804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 1.2961809560656548, "epoch": 0.01601, "frac_reward_zero_std": 0.5, "grad_norm": 0.08204995095729828, "kl": 0.6577580124139786, "learning_rate": 9.998866556575428e-06, "loss": 0.0084, "num_tokens": 14167478.0, "reward": 0.599951982498169, "reward_std": 0.09670598059892654, "rewards/rollout_reward_func/mean": 0.599951982498169, "rewards/rollout_reward_func/std": 0.33299294114112854, "sampling/importance_sampling_ratio/max": 2.2191684246063232, "sampling/importance_sampling_ratio/mean": 0.7739925384521484, "sampling/importance_sampling_ratio/min": 7.903094001449062e-07, "sampling/sampling_logp_difference/max": 3.735076904296875, "sampling/sampling_logp_difference/mean": 0.25976061820983887, "step": 1601, "step_time": 4.312867407999875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3046577759087086, "epoch": 0.01602, "grad_norm": 0.08272890746593475, "kl": 0.6397097222507, "learning_rate": 9.99886510769574e-06, "loss": 0.0083, "step": 1602, "step_time": 2.485636744007934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3637107564136386, "epoch": 0.01603, "frac_reward_zero_std": 0.75, "grad_norm": 0.011701562441885471, "kl": 0.4943734481930733, "learning_rate": 9.998863657890735e-06, "loss": 0.0209, "num_tokens": 14184262.0, "reward": 0.6686730980873108, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.6686730980873108, "rewards/rollout_reward_func/std": 0.22539456188678741, "sampling/importance_sampling_ratio/max": 0.9987789392471313, "sampling/importance_sampling_ratio/mean": 0.9608043432235718, "sampling/importance_sampling_ratio/min": 0.00015468210040125996, "sampling/sampling_logp_difference/max": 2.1172213554382324, "sampling/sampling_logp_difference/mean": 0.048034776002168655, "step": 1603, "step_time": 4.877081606005959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36549858655780554, "epoch": 0.01604, "grad_norm": 0.011921585537493229, "kl": 0.48932747915387154, "learning_rate": 9.998862207160411e-06, "loss": 0.0209, "step": 1604, "step_time": 2.0084278650028864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 186.77418518066406, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.42524967109784484, "epoch": 0.01605, "frac_reward_zero_std": 0.5, "grad_norm": 0.024884114041924477, "kl": 0.4978194609284401, "learning_rate": 9.998860755504768e-06, "loss": -0.064, "num_tokens": 14201106.0, "reward": 0.6736586093902588, "reward_std": 0.030228806659579277, "rewards/rollout_reward_func/mean": 0.6736586093902588, "rewards/rollout_reward_func/std": 0.2285366654396057, "sampling/importance_sampling_ratio/max": 0.9985799193382263, "sampling/importance_sampling_ratio/mean": 0.9263734817504883, "sampling/importance_sampling_ratio/min": 0.0007234254735521972, "sampling/sampling_logp_difference/max": 2.1309423446655273, "sampling/sampling_logp_difference/mean": 0.06533303111791611, "step": 1605, "step_time": 4.560288209002465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42632849141955376, "epoch": 0.01606, "grad_norm": 0.02594292163848877, "kl": 0.4964250810444355, "learning_rate": 9.998859302923806e-06, "loss": -0.0639, "step": 1606, "step_time": 2.0159571539770695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 163.53125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24373211059719324, "epoch": 0.01607, "frac_reward_zero_std": 0.75, "grad_norm": 0.01656782627105713, "kl": 0.43088165670633316, "learning_rate": 9.998857849417527e-06, "loss": 0.0297, "num_tokens": 14217019.0, "reward": 0.6837692260742188, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.6837692260742188, "rewards/rollout_reward_func/std": 0.26017728447914124, "sampling/importance_sampling_ratio/max": 0.9940141439437866, "sampling/importance_sampling_ratio/mean": 0.9576483964920044, "sampling/importance_sampling_ratio/min": 0.016339246183633804, "sampling/sampling_logp_difference/max": 2.0710744857788086, "sampling/sampling_logp_difference/mean": 0.031025489792227745, "step": 1607, "step_time": 4.383842392991937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24613092467188835, "epoch": 0.01608, "grad_norm": 0.0174630805850029, "kl": 0.4282905235886574, "learning_rate": 9.99885639498593e-06, "loss": 0.0297, "step": 1608, "step_time": 2.5240046480030287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3130874102935195, "epoch": 0.01609, "frac_reward_zero_std": 0.75, "grad_norm": 0.011862512677907944, "kl": 0.4756753593683243, "learning_rate": 9.998854939629016e-06, "loss": -0.0178, "num_tokens": 14232623.0, "reward": 0.8599711656570435, "reward_std": 0.04011470824480057, "rewards/rollout_reward_func/mean": 0.8599711656570435, "rewards/rollout_reward_func/std": 0.3884803354740143, "sampling/importance_sampling_ratio/max": 0.9984871745109558, "sampling/importance_sampling_ratio/mean": 0.9596363306045532, "sampling/importance_sampling_ratio/min": 0.0017899556551128626, "sampling/sampling_logp_difference/max": 3.035900115966797, "sampling/sampling_logp_difference/mean": 0.05124123767018318, "step": 1609, "step_time": 4.769030663002923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3152229683473706, "epoch": 0.0161, "grad_norm": 0.01201721839606762, "kl": 0.4733271114528179, "learning_rate": 9.998853483346786e-06, "loss": -0.0178, "step": 1610, "step_time": 1.9698972570113256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5164403514936566, "epoch": 0.01611, "frac_reward_zero_std": 0.5, "grad_norm": 0.007649358827620745, "kl": 0.5022316202521324, "learning_rate": 9.998852026139239e-06, "loss": 0.0019, "num_tokens": 14247603.0, "reward": 0.697163462638855, "reward_std": 0.025428634136915207, "rewards/rollout_reward_func/mean": 0.697163462638855, "rewards/rollout_reward_func/std": 0.2626844346523285, "sampling/importance_sampling_ratio/max": 0.9987930655479431, "sampling/importance_sampling_ratio/mean": 0.9282856583595276, "sampling/importance_sampling_ratio/min": 0.0007655049557797611, "sampling/sampling_logp_difference/max": 2.2669858932495117, "sampling/sampling_logp_difference/mean": 0.07885953783988953, "step": 1611, "step_time": 4.180220237998583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5123906349763274, "epoch": 0.01612, "grad_norm": 0.007755273953080177, "kl": 0.4937969148159027, "learning_rate": 9.998850568006378e-06, "loss": 0.0019, "step": 1612, "step_time": 2.0273851189995185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 216.15625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4586460329592228, "epoch": 0.01613, "frac_reward_zero_std": 0.75, "grad_norm": 0.13764682412147522, "kl": 0.4231082946062088, "learning_rate": 9.998849108948197e-06, "loss": 0.0217, "num_tokens": 14265200.0, "reward": 0.8451154232025146, "reward_std": 0.033085864037275314, "rewards/rollout_reward_func/mean": 0.8451154232025146, "rewards/rollout_reward_func/std": 0.43953922390937805, "sampling/importance_sampling_ratio/max": 0.9931365251541138, "sampling/importance_sampling_ratio/mean": 0.9303134679794312, "sampling/importance_sampling_ratio/min": 6.077887633182399e-07, "sampling/sampling_logp_difference/max": 2.3437986373901367, "sampling/sampling_logp_difference/mean": 0.08264368027448654, "step": 1613, "step_time": 4.474727430002531 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.4593418873846531, "epoch": 0.01614, "grad_norm": 0.0485382117331028, "kl": 0.4075237140059471, "learning_rate": 9.998847648964704e-06, "loss": 0.021, "step": 1614, "step_time": 2.559003846014093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 125.71875, "completions/mean_terminated_length": 125.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4252858543768525, "epoch": 0.01615, "frac_reward_zero_std": 0.75, "grad_norm": 0.015626652166247368, "kl": 0.41939424723386765, "learning_rate": 9.998846188055896e-06, "loss": -0.0177, "num_tokens": 14279991.0, "reward": 0.7945672869682312, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.7945672869682312, "rewards/rollout_reward_func/std": 0.23478065431118011, "sampling/importance_sampling_ratio/max": 0.9952402710914612, "sampling/importance_sampling_ratio/mean": 0.9508869051933289, "sampling/importance_sampling_ratio/min": 2.5201818061759695e-05, "sampling/sampling_logp_difference/max": 2.3313493728637695, "sampling/sampling_logp_difference/mean": 0.08458604663610458, "step": 1615, "step_time": 4.307738140996662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42697056755423546, "epoch": 0.01616, "grad_norm": 0.016020331531763077, "kl": 0.4179730452597141, "learning_rate": 9.998844726221773e-06, "loss": -0.0176, "step": 1616, "step_time": 2.0259338059840957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5042393505573273, "epoch": 0.01617, "frac_reward_zero_std": 0.75, "grad_norm": 0.2902114987373352, "kl": 0.4363833963871002, "learning_rate": 9.998843263462335e-06, "loss": 0.0072, "num_tokens": 14296307.0, "reward": 0.7825673222541809, "reward_std": 0.041410788893699646, "rewards/rollout_reward_func/mean": 0.7825673222541809, "rewards/rollout_reward_func/std": 0.41681596636772156, "sampling/importance_sampling_ratio/max": 1.0016933679580688, "sampling/importance_sampling_ratio/mean": 0.9035465717315674, "sampling/importance_sampling_ratio/min": 0.00013755075633525848, "sampling/sampling_logp_difference/max": 1.9459617137908936, "sampling/sampling_logp_difference/mean": 0.06649766862392426, "step": 1617, "step_time": 4.330438561009942 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5117197660729289, "epoch": 0.01618, "grad_norm": 0.1068914532661438, "kl": 0.43096377328038216, "learning_rate": 9.998841799777583e-06, "loss": 0.0065, "step": 1618, "step_time": 2.009166036004899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.538640639744699, "epoch": 0.01619, "frac_reward_zero_std": 0.5, "grad_norm": 0.058634284883737564, "kl": 0.44473908469080925, "learning_rate": 9.99884033516752e-06, "loss": -0.0114, "num_tokens": 14311245.0, "reward": 0.6560096144676208, "reward_std": 0.04098184406757355, "rewards/rollout_reward_func/mean": 0.6560096144676208, "rewards/rollout_reward_func/std": 0.07796401530504227, "sampling/importance_sampling_ratio/max": 1.0016664266586304, "sampling/importance_sampling_ratio/mean": 0.9042127728462219, "sampling/importance_sampling_ratio/min": 0.04458983987569809, "sampling/sampling_logp_difference/max": 1.5247306823730469, "sampling/sampling_logp_difference/mean": 0.06332369893789291, "step": 1619, "step_time": 3.8430882030006615 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5350023629143834, "epoch": 0.0162, "grad_norm": 0.04019222781062126, "kl": 0.4405010938644409, "learning_rate": 9.998838869632141e-06, "loss": -0.0118, "step": 1620, "step_time": 2.4476448969944613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 122.59375, "completions/mean_terminated_length": 122.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7043745927512646, "epoch": 0.01621, "frac_reward_zero_std": 0.5, "grad_norm": 0.06707695871591568, "kl": 0.4736953116953373, "learning_rate": 9.998837403171451e-06, "loss": -0.041, "num_tokens": 14325936.0, "reward": 0.7747260332107544, "reward_std": 0.0410257950425148, "rewards/rollout_reward_func/mean": 0.7747260332107544, "rewards/rollout_reward_func/std": 0.18068495392799377, "sampling/importance_sampling_ratio/max": 0.996343195438385, "sampling/importance_sampling_ratio/mean": 0.8760992288589478, "sampling/importance_sampling_ratio/min": 2.1642947558575543e-06, "sampling/sampling_logp_difference/max": 2.140406370162964, "sampling/sampling_logp_difference/mean": 0.10983167588710785, "step": 1621, "step_time": 4.602018688005046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5978295169770718, "epoch": 0.01622, "grad_norm": 0.0694376677274704, "kl": 0.4722418375313282, "learning_rate": 9.998835935785448e-06, "loss": -0.0409, "step": 1622, "step_time": 2.06672823199915 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6302276719361544, "epoch": 0.01623, "frac_reward_zero_std": 0.5, "grad_norm": 1.5574156045913696, "kl": 0.4106874018907547, "learning_rate": 9.998834467474132e-06, "loss": -0.0204, "num_tokens": 14340860.0, "reward": 0.6246634721755981, "reward_std": 0.3873923420906067, "rewards/rollout_reward_func/mean": 0.6246634721755981, "rewards/rollout_reward_func/std": 0.6149543523788452, "sampling/importance_sampling_ratio/max": 0.9950653910636902, "sampling/importance_sampling_ratio/mean": 0.8188681602478027, "sampling/importance_sampling_ratio/min": 0.04823436960577965, "sampling/sampling_logp_difference/max": 1.7586917877197266, "sampling/sampling_logp_difference/mean": 0.09195601940155029, "step": 1623, "step_time": 3.8631052689888747 }, { "clip_ratio/high_max": 0.08333333488553762, "clip_ratio/high_mean": 0.06666666781529784, "clip_ratio/low_mean": 0.03437500027939677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.10104166902601719, "entropy": 0.6648554503917694, "epoch": 0.01624, "grad_norm": 0.14131402969360352, "kl": 0.427560530602932, "learning_rate": 9.998832998237506e-06, "loss": -0.0244, "step": 1624, "step_time": 2.019404669008509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.96875, "completions/mean_terminated_length": 123.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2427003271877766, "epoch": 0.01625, "frac_reward_zero_std": 0.25, "grad_norm": 0.18654797971248627, "kl": 0.5058144703507423, "learning_rate": 9.998831528075567e-06, "loss": -0.0085, "num_tokens": 14355683.0, "reward": 0.5454326868057251, "reward_std": 0.07670151442289352, "rewards/rollout_reward_func/mean": 0.5454326868057251, "rewards/rollout_reward_func/std": 0.13374029099941254, "sampling/importance_sampling_ratio/max": 0.9992138147354126, "sampling/importance_sampling_ratio/mean": 0.7775299549102783, "sampling/importance_sampling_ratio/min": 0.009374494664371014, "sampling/sampling_logp_difference/max": 2.2829885482788086, "sampling/sampling_logp_difference/mean": 0.16542573273181915, "step": 1625, "step_time": 3.8892419710027752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.257827538996935, "epoch": 0.01626, "grad_norm": 0.22698809206485748, "kl": 0.5074626058340073, "learning_rate": 9.99883005698832e-06, "loss": -0.0083, "step": 1626, "step_time": 2.907643339007336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 143.46875, "completions/mean_terminated_length": 143.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7346504675224423, "epoch": 0.01627, "frac_reward_zero_std": 0.25, "grad_norm": 0.07867883145809174, "kl": 0.5689259618520737, "learning_rate": 9.998828584975761e-06, "loss": -0.0687, "num_tokens": 14371130.0, "reward": 0.7200480699539185, "reward_std": 0.271679162979126, "rewards/rollout_reward_func/mean": 0.7200480699539185, "rewards/rollout_reward_func/std": 0.5210517644882202, "sampling/importance_sampling_ratio/max": 1.001997470855713, "sampling/importance_sampling_ratio/mean": 0.8735301494598389, "sampling/importance_sampling_ratio/min": 0.0016947800759226084, "sampling/sampling_logp_difference/max": 1.9527630805969238, "sampling/sampling_logp_difference/mean": 0.1020752489566803, "step": 1627, "step_time": 3.9582696559955366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.7311782911419868, "epoch": 0.01628, "grad_norm": 0.07437741011381149, "kl": 0.5805548615753651, "learning_rate": 9.99882711203789e-06, "loss": -0.0689, "step": 1628, "step_time": 2.032205242998316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6742340968921781, "epoch": 0.01629, "frac_reward_zero_std": 0.5, "grad_norm": 0.02998986840248108, "kl": 0.447395384311676, "learning_rate": 9.998825638174712e-06, "loss": -0.0337, "num_tokens": 14385916.0, "reward": 0.603365421295166, "reward_std": 0.036091458052396774, "rewards/rollout_reward_func/mean": 0.603365421295166, "rewards/rollout_reward_func/std": 0.3428930640220642, "sampling/importance_sampling_ratio/max": 0.9952331185340881, "sampling/importance_sampling_ratio/mean": 0.8962413668632507, "sampling/importance_sampling_ratio/min": 0.0008724400540813804, "sampling/sampling_logp_difference/max": 2.2016384601593018, "sampling/sampling_logp_difference/mean": 0.10359837859869003, "step": 1629, "step_time": 3.8964333519979846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.670283448882401, "epoch": 0.0163, "grad_norm": 0.027444710955023766, "kl": 0.4517272375524044, "learning_rate": 9.998824163386222e-06, "loss": -0.0338, "step": 1630, "step_time": 2.016261623983155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 165.28125, "completions/mean_terminated_length": 165.28125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 1.2291342904791236, "epoch": 0.01631, "frac_reward_zero_std": 0.0, "grad_norm": 0.06099647283554077, "kl": 0.4640192873775959, "learning_rate": 9.998822687672424e-06, "loss": -0.1087, "num_tokens": 14401885.0, "reward": 0.7973605990409851, "reward_std": 0.1276465207338333, "rewards/rollout_reward_func/mean": 0.7973605990409851, "rewards/rollout_reward_func/std": 0.31196486949920654, "sampling/importance_sampling_ratio/max": 1.001054286956787, "sampling/importance_sampling_ratio/mean": 0.8389266729354858, "sampling/importance_sampling_ratio/min": 1.1340020719327604e-12, "sampling/sampling_logp_difference/max": 17.642498016357422, "sampling/sampling_logp_difference/mean": 0.34996724128723145, "step": 1631, "step_time": 4.3799942530094995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.214497926644981, "epoch": 0.01632, "grad_norm": 0.03000664710998535, "kl": 0.4749721623957157, "learning_rate": 9.998821211033318e-06, "loss": -0.1089, "step": 1632, "step_time": 2.9191867479894427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7641952959820628, "epoch": 0.01633, "frac_reward_zero_std": 0.25, "grad_norm": 0.036799658089876175, "kl": 0.6333133466541767, "learning_rate": 9.998819733468902e-06, "loss": -0.0721, "num_tokens": 14416603.0, "reward": 0.6182259321212769, "reward_std": 0.061601847410202026, "rewards/rollout_reward_func/mean": 0.6182259321212769, "rewards/rollout_reward_func/std": 0.17628228664398193, "sampling/importance_sampling_ratio/max": 1.0055298805236816, "sampling/importance_sampling_ratio/mean": 0.8760542869567871, "sampling/importance_sampling_ratio/min": 2.06709746541911e-10, "sampling/sampling_logp_difference/max": 15.907676696777344, "sampling/sampling_logp_difference/mean": 0.23320557177066803, "step": 1633, "step_time": 3.8806539030119893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7716668602079153, "epoch": 0.01634, "grad_norm": 0.04094137251377106, "kl": 0.6588070392608643, "learning_rate": 9.99881825497918e-06, "loss": -0.0722, "step": 1634, "step_time": 1.9983198980044108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2690704707056284, "epoch": 0.01635, "frac_reward_zero_std": 0.5, "grad_norm": 0.21821197867393494, "kl": 1.3226813115179539, "learning_rate": 9.998816775564148e-06, "loss": 0.0123, "num_tokens": 14432130.0, "reward": 0.7957740426063538, "reward_std": 0.04619310796260834, "rewards/rollout_reward_func/mean": 0.7957740426063538, "rewards/rollout_reward_func/std": 0.2978156507015228, "sampling/importance_sampling_ratio/max": 1.0002317428588867, "sampling/importance_sampling_ratio/mean": 0.939120888710022, "sampling/importance_sampling_ratio/min": 0.02675854042172432, "sampling/sampling_logp_difference/max": 2.2107393741607666, "sampling/sampling_logp_difference/mean": 0.03766815364360809, "step": 1635, "step_time": 3.8654801879893057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26763260923326015, "epoch": 0.01636, "grad_norm": 0.18686600029468536, "kl": 1.2563439346849918, "learning_rate": 9.99881529522381e-06, "loss": 0.0122, "step": 1636, "step_time": 1.9810358910035575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 120.15625, "completions/mean_terminated_length": 120.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2923016110435128, "epoch": 0.01637, "frac_reward_zero_std": 0.75, "grad_norm": 0.003761129919439554, "kl": 0.45571452751755714, "learning_rate": 9.998813813958166e-06, "loss": -0.0272, "num_tokens": 14446743.0, "reward": 0.6339423656463623, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.6339423656463623, "rewards/rollout_reward_func/std": 0.2639208436012268, "sampling/importance_sampling_ratio/max": 1.0029915571212769, "sampling/importance_sampling_ratio/mean": 0.9653534889221191, "sampling/importance_sampling_ratio/min": 0.007092129904776812, "sampling/sampling_logp_difference/max": 1.6322431564331055, "sampling/sampling_logp_difference/mean": 0.03352070227265358, "step": 1637, "step_time": 4.303709603991592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29158998653292656, "epoch": 0.01638, "grad_norm": 0.004099390935152769, "kl": 0.45990634337067604, "learning_rate": 9.998812331767214e-06, "loss": -0.0272, "step": 1638, "step_time": 2.420461237001291 }, { "clip_ratio/high_max": 0.030357143841683865, "clip_ratio/high_mean": 0.015178571920841932, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015178571920841932, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.15995486127212644, "epoch": 0.01639, "frac_reward_zero_std": 0.75, "grad_norm": 0.26700475811958313, "kl": 0.5023795366287231, "learning_rate": 9.998810848650955e-06, "loss": 0.0273, "num_tokens": 14462841.0, "reward": 0.810115396976471, "reward_std": 0.03560846298933029, "rewards/rollout_reward_func/mean": 0.810115396976471, "rewards/rollout_reward_func/std": 0.19684143364429474, "sampling/importance_sampling_ratio/max": 1.1564370393753052, "sampling/importance_sampling_ratio/mean": 0.9472212195396423, "sampling/importance_sampling_ratio/min": 0.024866880849003792, "sampling/sampling_logp_difference/max": 2.1975009441375732, "sampling/sampling_logp_difference/mean": 0.03920410946011543, "step": 1639, "step_time": 4.089196515989897 }, { "clip_ratio/high_max": 0.07321428880095482, "clip_ratio/high_mean": 0.03660714440047741, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06785714440047741, "entropy": 0.1926282150670886, "epoch": 0.0164, "grad_norm": 0.589480996131897, "kl": 0.4161998964846134, "learning_rate": 9.998809364609392e-06, "loss": 0.0246, "step": 1640, "step_time": 2.0077027110164636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.04373912280425429, "epoch": 0.01641, "frac_reward_zero_std": 1.0, "grad_norm": 0.00034878370934166014, "kl": 0.4099658653140068, "learning_rate": 9.998807879642522e-06, "loss": 0.0014, "num_tokens": 14479329.0, "reward": 1.051269292831421, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.051269292831421, "rewards/rollout_reward_func/std": 0.29209673404693604, "sampling/importance_sampling_ratio/max": 1.000630259513855, "sampling/importance_sampling_ratio/mean": 0.9974589347839355, "sampling/importance_sampling_ratio/min": 0.9943800568580627, "sampling/sampling_logp_difference/max": 0.003349575214087963, "sampling/sampling_logp_difference/mean": 0.0009209117852151394, "step": 1641, "step_time": 4.286653157985711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04458820587024093, "epoch": 0.01642, "grad_norm": 0.00035893573658540845, "kl": 0.40989698842167854, "learning_rate": 9.998806393750348e-06, "loss": 0.0014, "step": 1642, "step_time": 1.9932675969903357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3221914144232869, "epoch": 0.01643, "frac_reward_zero_std": 0.75, "grad_norm": 0.010169126093387604, "kl": 0.5316776409745216, "learning_rate": 9.99880490693287e-06, "loss": -0.0271, "num_tokens": 14494681.0, "reward": 0.5567788481712341, "reward_std": 0.009110798127949238, "rewards/rollout_reward_func/mean": 0.5567788481712341, "rewards/rollout_reward_func/std": 0.1342538744211197, "sampling/importance_sampling_ratio/max": 1.0091873407363892, "sampling/importance_sampling_ratio/mean": 0.9678084850311279, "sampling/importance_sampling_ratio/min": 1.7977196620222458e-08, "sampling/sampling_logp_difference/max": 3.8123714923858643, "sampling/sampling_logp_difference/mean": 0.10907471179962158, "step": 1643, "step_time": 4.298730901013187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3233955088071525, "epoch": 0.01644, "grad_norm": 0.010907415300607681, "kl": 0.5362570881843567, "learning_rate": 9.998803419190087e-06, "loss": -0.027, "step": 1644, "step_time": 2.4329507279835525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.15625, "completions/mean_terminated_length": 165.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27294400753453374, "epoch": 0.01645, "frac_reward_zero_std": 1.0, "grad_norm": 0.004423621576279402, "kl": 0.44808347150683403, "learning_rate": 9.998801930522e-06, "loss": 0.0015, "num_tokens": 14510678.0, "reward": 0.5778077244758606, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5778077244758606, "rewards/rollout_reward_func/std": 0.17093384265899658, "sampling/importance_sampling_ratio/max": 1.0008670091629028, "sampling/importance_sampling_ratio/mean": 0.9651947617530823, "sampling/importance_sampling_ratio/min": 0.002271342324092984, "sampling/sampling_logp_difference/max": 1.5563685894012451, "sampling/sampling_logp_difference/mean": 0.038954958319664, "step": 1645, "step_time": 4.159815717990568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27337276469916105, "epoch": 0.01646, "grad_norm": 0.004528095945715904, "kl": 0.4503181576728821, "learning_rate": 9.99880044092861e-06, "loss": 0.0015, "step": 1646, "step_time": 2.0374173559976043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 193.84375, "completions/mean_terminated_length": 193.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5456366869620979, "epoch": 0.01647, "frac_reward_zero_std": 0.5, "grad_norm": 0.06002478674054146, "kl": 0.4772130325436592, "learning_rate": 9.998798950409914e-06, "loss": -0.0197, "num_tokens": 14527593.0, "reward": 0.6793831586837769, "reward_std": 0.05401615798473358, "rewards/rollout_reward_func/mean": 0.6793831586837769, "rewards/rollout_reward_func/std": 0.15158051252365112, "sampling/importance_sampling_ratio/max": 1.0402240753173828, "sampling/importance_sampling_ratio/mean": 0.954964280128479, "sampling/importance_sampling_ratio/min": 1.3201418383336109e-25, "sampling/sampling_logp_difference/max": 6.141820907592773, "sampling/sampling_logp_difference/mean": 0.20143887400627136, "step": 1647, "step_time": 5.064349601016147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5471173729747534, "epoch": 0.01648, "grad_norm": 0.07637438923120499, "kl": 0.4713536538183689, "learning_rate": 9.998797458965917e-06, "loss": -0.0201, "step": 1648, "step_time": 2.1059628989969497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23237937223166227, "epoch": 0.01649, "frac_reward_zero_std": 0.75, "grad_norm": 0.007916375063359737, "kl": 0.3901042826473713, "learning_rate": 9.998795966596618e-06, "loss": -0.0271, "num_tokens": 14543785.0, "reward": 0.9331731200218201, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.9331731200218201, "rewards/rollout_reward_func/std": 0.331132709980011, "sampling/importance_sampling_ratio/max": 1.0039284229278564, "sampling/importance_sampling_ratio/mean": 0.96630859375, "sampling/importance_sampling_ratio/min": 0.015436514280736446, "sampling/sampling_logp_difference/max": 1.6162909269332886, "sampling/sampling_logp_difference/mean": 0.0226912759244442, "step": 1649, "step_time": 4.494690341998648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23213453777134418, "epoch": 0.0165, "grad_norm": 0.008218486793339252, "kl": 0.3876722753047943, "learning_rate": 9.998794473302016e-06, "loss": -0.0271, "step": 1650, "step_time": 2.4837583209955483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 216.78125, "completions/mean_terminated_length": 216.78125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.38855371810495853, "epoch": 0.01651, "frac_reward_zero_std": 0.75, "grad_norm": 0.003271017223596573, "kl": 0.3765592761337757, "learning_rate": 9.998792979082112e-06, "loss": -0.0272, "num_tokens": 14561402.0, "reward": 0.9970288872718811, "reward_std": 0.019064685329794884, "rewards/rollout_reward_func/mean": 0.9970288872718811, "rewards/rollout_reward_func/std": 0.32612282037734985, "sampling/importance_sampling_ratio/max": 1.0007147789001465, "sampling/importance_sampling_ratio/mean": 0.965604841709137, "sampling/importance_sampling_ratio/min": 1.0990867097731075e-13, "sampling/sampling_logp_difference/max": 3.7494707107543945, "sampling/sampling_logp_difference/mean": 0.11775944381952286, "step": 1651, "step_time": 4.114733738002542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39012091141194105, "epoch": 0.01652, "grad_norm": 0.0033509943168610334, "kl": 0.37626874819397926, "learning_rate": 9.998791483936908e-06, "loss": -0.0272, "step": 1652, "step_time": 1.9981415620131884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5914763072505593, "epoch": 0.01653, "frac_reward_zero_std": 0.5, "grad_norm": 0.011247335001826286, "kl": 0.43749186769127846, "learning_rate": 9.998789987866401e-06, "loss": -0.0365, "num_tokens": 14577598.0, "reward": 0.6902115345001221, "reward_std": 0.02421874925494194, "rewards/rollout_reward_func/mean": 0.6902115345001221, "rewards/rollout_reward_func/std": 0.4469403028488159, "sampling/importance_sampling_ratio/max": 1.0034072399139404, "sampling/importance_sampling_ratio/mean": 0.9054701924324036, "sampling/importance_sampling_ratio/min": 0.0031281232368201017, "sampling/sampling_logp_difference/max": 1.9710662364959717, "sampling/sampling_logp_difference/mean": 0.08830015361309052, "step": 1653, "step_time": 4.166647997997643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5983728272840381, "epoch": 0.01654, "grad_norm": 0.011081582866609097, "kl": 0.4350884109735489, "learning_rate": 9.998788490870594e-06, "loss": -0.0365, "step": 1654, "step_time": 2.0256496560032247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 211.46875, "completions/mean_terminated_length": 211.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.9136735331267118, "epoch": 0.01655, "frac_reward_zero_std": 0.25, "grad_norm": 0.14565089344978333, "kl": 0.6985633783042431, "learning_rate": 9.998786992949487e-06, "loss": -0.1127, "num_tokens": 14595045.0, "reward": 0.9434086084365845, "reward_std": 0.15589989721775055, "rewards/rollout_reward_func/mean": 0.9434086084365845, "rewards/rollout_reward_func/std": 0.4413900375366211, "sampling/importance_sampling_ratio/max": 1.012640118598938, "sampling/importance_sampling_ratio/mean": 0.8588886260986328, "sampling/importance_sampling_ratio/min": 1.1813652060597595e-17, "sampling/sampling_logp_difference/max": 3.7322723865509033, "sampling/sampling_logp_difference/mean": 0.2642812728881836, "step": 1655, "step_time": 4.786853332996543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.9162686145864427, "epoch": 0.01656, "grad_norm": 0.05275396257638931, "kl": 0.7270644791424274, "learning_rate": 9.998785494103079e-06, "loss": -0.1127, "step": 1656, "step_time": 2.5175733379946905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3680077535100281, "epoch": 0.01657, "frac_reward_zero_std": 0.5, "grad_norm": 0.02707384154200554, "kl": 0.482878003269434, "learning_rate": 9.998783994331374e-06, "loss": -0.0451, "num_tokens": 14611276.0, "reward": 0.5176923274993896, "reward_std": 0.038074977695941925, "rewards/rollout_reward_func/mean": 0.5176923274993896, "rewards/rollout_reward_func/std": 0.17982707917690277, "sampling/importance_sampling_ratio/max": 1.008945345878601, "sampling/importance_sampling_ratio/mean": 0.9369041323661804, "sampling/importance_sampling_ratio/min": 0.016079917550086975, "sampling/sampling_logp_difference/max": 1.4552565813064575, "sampling/sampling_logp_difference/mean": 0.04194989055395126, "step": 1657, "step_time": 4.167251557999407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3753788978792727, "epoch": 0.01658, "grad_norm": 0.02640848234295845, "kl": 0.480174221098423, "learning_rate": 9.998782493634366e-06, "loss": -0.0451, "step": 1658, "step_time": 2.0149880569952074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2576492391526699, "epoch": 0.01659, "frac_reward_zero_std": 0.75, "grad_norm": 0.010437586344778538, "kl": 0.47468771040439606, "learning_rate": 9.998780992012062e-06, "loss": 0.03, "num_tokens": 14626580.0, "reward": 0.5635576844215393, "reward_std": 0.006799104157835245, "rewards/rollout_reward_func/mean": 0.5635576844215393, "rewards/rollout_reward_func/std": 0.2376491129398346, "sampling/importance_sampling_ratio/max": 1.0027496814727783, "sampling/importance_sampling_ratio/mean": 0.9662439823150635, "sampling/importance_sampling_ratio/min": 0.008216417394578457, "sampling/sampling_logp_difference/max": 1.6576560735702515, "sampling/sampling_logp_difference/mean": 0.02664181962609291, "step": 1659, "step_time": 3.9913445960119134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2591304937377572, "epoch": 0.0166, "grad_norm": 0.010410912334918976, "kl": 0.4748901091516018, "learning_rate": 9.998779489464459e-06, "loss": 0.03, "step": 1660, "step_time": 1.9967428900126833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 210.28125, "completions/mean_terminated_length": 210.28125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.43437638645991683, "epoch": 0.01661, "frac_reward_zero_std": 0.5, "grad_norm": 0.022684236988425255, "kl": 0.41154925152659416, "learning_rate": 9.998777985991559e-06, "loss": -0.0571, "num_tokens": 14643933.0, "reward": 0.7593461275100708, "reward_std": 0.05625343322753906, "rewards/rollout_reward_func/mean": 0.7593461275100708, "rewards/rollout_reward_func/std": 0.4382111132144928, "sampling/importance_sampling_ratio/max": 1.000802755355835, "sampling/importance_sampling_ratio/mean": 0.9056593775749207, "sampling/importance_sampling_ratio/min": 0.0006069388473406434, "sampling/sampling_logp_difference/max": 2.3624353408813477, "sampling/sampling_logp_difference/mean": 0.06753575801849365, "step": 1661, "step_time": 5.381336105005175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43395763682201505, "epoch": 0.01662, "grad_norm": 0.02333071082830429, "kl": 0.41141940653324127, "learning_rate": 9.99877648159336e-06, "loss": -0.0571, "step": 1662, "step_time": 2.0539697550048004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06697547435760498, "epoch": 0.01663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005970705533400178, "kl": 0.3856481499969959, "learning_rate": 9.998774976269862e-06, "loss": 0.0014, "num_tokens": 14660829.0, "reward": 0.8965384364128113, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8965384364128113, "rewards/rollout_reward_func/std": 0.38712644577026367, "sampling/importance_sampling_ratio/max": 0.9984861612319946, "sampling/importance_sampling_ratio/mean": 0.9956026077270508, "sampling/importance_sampling_ratio/min": 0.9893456697463989, "sampling/sampling_logp_difference/max": 0.0032997187227010727, "sampling/sampling_logp_difference/mean": 0.00098998355679214, "step": 1663, "step_time": 4.330903430018225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06597891030833125, "epoch": 0.01664, "grad_norm": 0.0005936587695032358, "kl": 0.3857930265367031, "learning_rate": 9.99877347002107e-06, "loss": 0.0014, "step": 1664, "step_time": 2.0016310710052494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6754239583387971, "epoch": 0.01665, "frac_reward_zero_std": 0.25, "grad_norm": 0.018697116523981094, "kl": 0.6334062144160271, "learning_rate": 9.99877196284698e-06, "loss": -0.1036, "num_tokens": 14677621.0, "reward": 0.9409375190734863, "reward_std": 0.09721098840236664, "rewards/rollout_reward_func/mean": 0.9409375190734863, "rewards/rollout_reward_func/std": 0.3226146101951599, "sampling/importance_sampling_ratio/max": 1.0022799968719482, "sampling/importance_sampling_ratio/mean": 0.8773738145828247, "sampling/importance_sampling_ratio/min": 6.09831404290162e-05, "sampling/sampling_logp_difference/max": 2.0901994705200195, "sampling/sampling_logp_difference/mean": 0.11080824583768845, "step": 1665, "step_time": 4.547685213015939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6765040149912238, "epoch": 0.01666, "grad_norm": 0.017282186076045036, "kl": 0.6518928930163383, "learning_rate": 9.998770454747594e-06, "loss": -0.1036, "step": 1666, "step_time": 2.4943321579994517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 260.59375, "completions/mean_terminated_length": 260.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7536971848458052, "epoch": 0.01667, "frac_reward_zero_std": 0.5, "grad_norm": 0.30332425236701965, "kl": 0.6376325711607933, "learning_rate": 9.998768945722913e-06, "loss": -0.0487, "num_tokens": 14696672.0, "reward": 0.813003420829773, "reward_std": 0.1703890562057495, "rewards/rollout_reward_func/mean": 0.813003420829773, "rewards/rollout_reward_func/std": 0.4532789885997772, "sampling/importance_sampling_ratio/max": 1.196662425994873, "sampling/importance_sampling_ratio/mean": 0.8554368615150452, "sampling/importance_sampling_ratio/min": 7.381624077103972e-21, "sampling/sampling_logp_difference/max": 4.343324661254883, "sampling/sampling_logp_difference/mean": 0.21385399997234344, "step": 1667, "step_time": 5.086301235991414 }, { "clip_ratio/high_max": 0.0390625, "clip_ratio/high_mean": 0.01953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.7208072086796165, "epoch": 0.01668, "grad_norm": 0.06452329456806183, "kl": 0.6254254803061485, "learning_rate": 9.998767435772937e-06, "loss": -0.0488, "step": 1668, "step_time": 2.0479925819963682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.05677026975899935, "epoch": 0.01669, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004899698542430997, "kl": 0.39595382288098335, "learning_rate": 9.998765924897663e-06, "loss": 0.0015, "num_tokens": 14713480.0, "reward": 0.811269223690033, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.811269223690033, "rewards/rollout_reward_func/std": 0.35708701610565186, "sampling/importance_sampling_ratio/max": 1.0005254745483398, "sampling/importance_sampling_ratio/mean": 0.9967142343521118, "sampling/importance_sampling_ratio/min": 0.9920945167541504, "sampling/sampling_logp_difference/max": 0.0036665834486484528, "sampling/sampling_logp_difference/mean": 0.0008236063877120614, "step": 1669, "step_time": 4.507585981991724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05567025253549218, "epoch": 0.0167, "grad_norm": 0.00047707202611491084, "kl": 0.396132942289114, "learning_rate": 9.998764413097096e-06, "loss": 0.0015, "step": 1670, "step_time": 2.03696153400233 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 258.9375, "completions/mean_terminated_length": 258.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.10541397426277399, "epoch": 0.01671, "frac_reward_zero_std": 0.75, "grad_norm": 1.3092721700668335, "kl": 0.3457186333835125, "learning_rate": 9.998762900371235e-06, "loss": 0.0119, "num_tokens": 14732446.0, "reward": 1.1449471712112427, "reward_std": 0.02560541220009327, "rewards/rollout_reward_func/mean": 1.1449471712112427, "rewards/rollout_reward_func/std": 0.29922524094581604, "sampling/importance_sampling_ratio/max": 1.2933754920959473, "sampling/importance_sampling_ratio/mean": 0.9348765015602112, "sampling/importance_sampling_ratio/min": 0.5864077210426331, "sampling/sampling_logp_difference/max": 0.3606832027435303, "sampling/sampling_logp_difference/mean": 0.014457973651587963, "step": 1671, "step_time": 4.831586749998678 }, { "clip_ratio/high_max": 0.06250000279396772, "clip_ratio/high_mean": 0.03125000139698386, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04017857275903225, "entropy": 0.08395966561511159, "epoch": 0.01672, "grad_norm": 0.0067216488532722, "kl": 0.35907773301005363, "learning_rate": 9.998761386720078e-06, "loss": 0.0094, "step": 1672, "step_time": 2.4816742689945386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.23706111032515764, "epoch": 0.01673, "frac_reward_zero_std": 0.75, "grad_norm": 0.00976625457406044, "kl": 0.40467748790979385, "learning_rate": 9.998759872143631e-06, "loss": -0.0273, "num_tokens": 14749298.0, "reward": 1.0268797874450684, "reward_std": 0.007519804872572422, "rewards/rollout_reward_func/mean": 1.0268797874450684, "rewards/rollout_reward_func/std": 0.3180871307849884, "sampling/importance_sampling_ratio/max": 1.0116902589797974, "sampling/importance_sampling_ratio/mean": 0.9685956835746765, "sampling/importance_sampling_ratio/min": 0.006675340235233307, "sampling/sampling_logp_difference/max": 1.6188290119171143, "sampling/sampling_logp_difference/mean": 0.02696186862885952, "step": 1673, "step_time": 4.874229481014481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2338730627670884, "epoch": 0.01674, "grad_norm": 0.012989322654902935, "kl": 0.3993794769048691, "learning_rate": 9.998758356641888e-06, "loss": -0.0273, "step": 1674, "step_time": 2.0280301140082884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.7566219866275787, "epoch": 0.01675, "frac_reward_zero_std": 0.25, "grad_norm": 0.042914118617773056, "kl": 0.7498400881886482, "learning_rate": 9.998756840214852e-06, "loss": -0.0216, "num_tokens": 14765216.0, "reward": 0.7515384554862976, "reward_std": 0.10004840046167374, "rewards/rollout_reward_func/mean": 0.7515384554862976, "rewards/rollout_reward_func/std": 0.3066772222518921, "sampling/importance_sampling_ratio/max": 1.009278416633606, "sampling/importance_sampling_ratio/mean": 0.881372332572937, "sampling/importance_sampling_ratio/min": 5.1721911198399084e-09, "sampling/sampling_logp_difference/max": 3.1215600967407227, "sampling/sampling_logp_difference/mean": 0.16349215805530548, "step": 1675, "step_time": 4.176781291003863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7554710456170142, "epoch": 0.01676, "grad_norm": 0.04349241405725479, "kl": 0.7371115908026695, "learning_rate": 9.998755322862524e-06, "loss": -0.0216, "step": 1676, "step_time": 2.017406407983799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7357202316634357, "epoch": 0.01677, "frac_reward_zero_std": 0.5, "grad_norm": 0.010891104117035866, "kl": 0.46071699261665344, "learning_rate": 9.998753804584904e-06, "loss": -0.0644, "num_tokens": 14782028.0, "reward": 0.6413798332214355, "reward_std": 0.02676195092499256, "rewards/rollout_reward_func/mean": 0.6413798332214355, "rewards/rollout_reward_func/std": 0.4698534905910492, "sampling/importance_sampling_ratio/max": 0.9997189044952393, "sampling/importance_sampling_ratio/mean": 0.9039921760559082, "sampling/importance_sampling_ratio/min": 1.1883444763037266e-16, "sampling/sampling_logp_difference/max": 4.165182113647461, "sampling/sampling_logp_difference/mean": 0.25601091980934143, "step": 1677, "step_time": 4.443326734020957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7409604266285896, "epoch": 0.01678, "grad_norm": 0.0110849067568779, "kl": 0.4580945000052452, "learning_rate": 9.998752285381994e-06, "loss": -0.0644, "step": 1678, "step_time": 2.5347168509979383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25667170714586973, "epoch": 0.01679, "frac_reward_zero_std": 0.5, "grad_norm": 0.042733319103717804, "kl": 0.5132411792874336, "learning_rate": 9.99875076525379e-06, "loss": -0.0545, "num_tokens": 14797446.0, "reward": 0.6553447246551514, "reward_std": 0.18755054473876953, "rewards/rollout_reward_func/mean": 0.6553447246551514, "rewards/rollout_reward_func/std": 0.3978564441204071, "sampling/importance_sampling_ratio/max": 1.0062850713729858, "sampling/importance_sampling_ratio/mean": 0.9421016573905945, "sampling/importance_sampling_ratio/min": 0.005157920066267252, "sampling/sampling_logp_difference/max": 2.4107306003570557, "sampling/sampling_logp_difference/mean": 0.03786223381757736, "step": 1679, "step_time": 4.701184521007235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2571807708591223, "epoch": 0.0168, "grad_norm": 0.04354768991470337, "kl": 0.5142122842371464, "learning_rate": 9.998749244200294e-06, "loss": -0.0545, "step": 1680, "step_time": 2.0373338099889224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.28542014956474304, "epoch": 0.01681, "frac_reward_zero_std": 0.5, "grad_norm": 0.019083531573414803, "kl": 0.46871475502848625, "learning_rate": 9.99874772222151e-06, "loss": -0.0528, "num_tokens": 14814354.0, "reward": 0.6426634788513184, "reward_std": 0.24680745601654053, "rewards/rollout_reward_func/mean": 0.6426634788513184, "rewards/rollout_reward_func/std": 0.4838615655899048, "sampling/importance_sampling_ratio/max": 1.00473952293396, "sampling/importance_sampling_ratio/mean": 0.9399945735931396, "sampling/importance_sampling_ratio/min": 0.00020031025633215904, "sampling/sampling_logp_difference/max": 2.1697030067443848, "sampling/sampling_logp_difference/mean": 0.06175077334046364, "step": 1681, "step_time": 4.345105783002509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2843237309716642, "epoch": 0.01682, "grad_norm": 0.01594693399965763, "kl": 0.48044802993535995, "learning_rate": 9.998746199317436e-06, "loss": -0.0529, "step": 1682, "step_time": 2.048321706992283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.04452353157103062, "epoch": 0.01683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004449286207091063, "kl": 0.39904503524303436, "learning_rate": 9.99874467548807e-06, "loss": 0.0016, "num_tokens": 14832066.0, "reward": 0.8870384693145752, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8870384693145752, "rewards/rollout_reward_func/std": 0.25153592228889465, "sampling/importance_sampling_ratio/max": 1.0058401823043823, "sampling/importance_sampling_ratio/mean": 0.9986358880996704, "sampling/importance_sampling_ratio/min": 0.9951729774475098, "sampling/sampling_logp_difference/max": 0.006484858691692352, "sampling/sampling_logp_difference/mean": 0.0008345672395080328, "step": 1683, "step_time": 4.310308932996122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04454898973926902, "epoch": 0.01684, "grad_norm": 0.0005900561809539795, "kl": 0.3989572711288929, "learning_rate": 9.998743150733417e-06, "loss": 0.0016, "step": 1684, "step_time": 2.495391626012861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.12214335333555937, "epoch": 0.01685, "frac_reward_zero_std": 0.75, "grad_norm": 0.0449875183403492, "kl": 0.5735266283154488, "learning_rate": 9.998741625053473e-06, "loss": 0.029, "num_tokens": 14849898.0, "reward": 0.3940769135951996, "reward_std": 0.02719641663134098, "rewards/rollout_reward_func/mean": 0.3940769135951996, "rewards/rollout_reward_func/std": 0.2990172505378723, "sampling/importance_sampling_ratio/max": 1.0186325311660767, "sampling/importance_sampling_ratio/mean": 0.9756447672843933, "sampling/importance_sampling_ratio/min": 0.10662177950143814, "sampling/sampling_logp_difference/max": 1.9913500547409058, "sampling/sampling_logp_difference/mean": 0.015949217602610588, "step": 1685, "step_time": 4.582284215000982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12151590827852488, "epoch": 0.01686, "grad_norm": 0.044543638825416565, "kl": 0.5632866993546486, "learning_rate": 9.998740098448241e-06, "loss": 0.0289, "step": 1686, "step_time": 2.0239551090053283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 236.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.09599531022831798, "epoch": 0.01687, "frac_reward_zero_std": 0.75, "grad_norm": 0.04883258417248726, "kl": 0.5639933347702026, "learning_rate": 9.998738570917722e-06, "loss": 0.0374, "num_tokens": 14868172.0, "reward": 0.7612452507019043, "reward_std": 0.012877500616014004, "rewards/rollout_reward_func/mean": 0.7612452507019043, "rewards/rollout_reward_func/std": 0.4377114772796631, "sampling/importance_sampling_ratio/max": 0.9992247819900513, "sampling/importance_sampling_ratio/mean": 0.96738600730896, "sampling/importance_sampling_ratio/min": 0.12111088633537292, "sampling/sampling_logp_difference/max": 2.0467545986175537, "sampling/sampling_logp_difference/mean": 0.009828576818108559, "step": 1687, "step_time": 4.580618720996426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09632757864892483, "epoch": 0.01688, "grad_norm": 0.04895729199051857, "kl": 0.5529632121324539, "learning_rate": 9.998737042461912e-06, "loss": 0.0373, "step": 1688, "step_time": 2.067326969001442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.17184777790680528, "epoch": 0.01689, "frac_reward_zero_std": 0.75, "grad_norm": 0.0035560328979045153, "kl": 0.4211619272828102, "learning_rate": 9.998735513080816e-06, "loss": -0.0173, "num_tokens": 14885656.0, "reward": 0.6069615483283997, "reward_std": 0.027196412906050682, "rewards/rollout_reward_func/mean": 0.6069615483283997, "rewards/rollout_reward_func/std": 0.48694726824760437, "sampling/importance_sampling_ratio/max": 1.0020039081573486, "sampling/importance_sampling_ratio/mean": 0.9668413996696472, "sampling/importance_sampling_ratio/min": 0.007420883513987064, "sampling/sampling_logp_difference/max": 2.0004987716674805, "sampling/sampling_logp_difference/mean": 0.025617968291044235, "step": 1689, "step_time": 4.334537225004169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17198825674131513, "epoch": 0.0169, "grad_norm": 0.003504199907183647, "kl": 0.41536829620599747, "learning_rate": 9.99873398277443e-06, "loss": -0.0173, "step": 1690, "step_time": 2.485164491990872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05619809636846185, "epoch": 0.01691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004173341440036893, "kl": 0.42542845383286476, "learning_rate": 9.998732451542762e-06, "loss": 0.0013, "num_tokens": 14901176.0, "reward": 0.813076913356781, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.813076913356781, "rewards/rollout_reward_func/std": 0.13502545654773712, "sampling/importance_sampling_ratio/max": 1.0079078674316406, "sampling/importance_sampling_ratio/mean": 0.9995163679122925, "sampling/importance_sampling_ratio/min": 0.9961833953857422, "sampling/sampling_logp_difference/max": 0.00933164730668068, "sampling/sampling_logp_difference/mean": 0.000911973649635911, "step": 1691, "step_time": 4.397742091983673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.056537825614213943, "epoch": 0.01692, "grad_norm": 0.0004145194252487272, "kl": 0.42542770132422447, "learning_rate": 9.998730919385805e-06, "loss": 0.0013, "step": 1692, "step_time": 1.9999046930024633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04917157115414739, "epoch": 0.01693, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039159602602012455, "kl": 0.4149145632982254, "learning_rate": 9.998729386303562e-06, "loss": 0.0014, "num_tokens": 14917376.0, "reward": 0.7265384793281555, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7265384793281555, "rewards/rollout_reward_func/std": 0.1651855856180191, "sampling/importance_sampling_ratio/max": 1.0035942792892456, "sampling/importance_sampling_ratio/mean": 0.9983206987380981, "sampling/importance_sampling_ratio/min": 0.9951483607292175, "sampling/sampling_logp_difference/max": 0.004319202154874802, "sampling/sampling_logp_difference/mean": 0.0008083455613814294, "step": 1693, "step_time": 4.056851527006074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04987045610323548, "epoch": 0.01694, "grad_norm": 0.0003993718419224024, "kl": 0.41481373086571693, "learning_rate": 9.998727852296032e-06, "loss": 0.0014, "step": 1694, "step_time": 2.0197658049946767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 235.0625, "completions/mean_terminated_length": 235.0625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.38327648118138313, "epoch": 0.01695, "frac_reward_zero_std": 0.5, "grad_norm": 0.037824518978595734, "kl": 0.44036583602428436, "learning_rate": 9.99872631736322e-06, "loss": 0.012, "num_tokens": 14935666.0, "reward": 0.7926249504089355, "reward_std": 0.0685349553823471, "rewards/rollout_reward_func/mean": 0.7926249504089355, "rewards/rollout_reward_func/std": 0.55532306432724, "sampling/importance_sampling_ratio/max": 1.0097907781600952, "sampling/importance_sampling_ratio/mean": 0.9405000805854797, "sampling/importance_sampling_ratio/min": 0.0062793465331196785, "sampling/sampling_logp_difference/max": 1.6850368976593018, "sampling/sampling_logp_difference/mean": 0.03937206044793129, "step": 1695, "step_time": 4.655012486990017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38434983789920807, "epoch": 0.01696, "grad_norm": 0.03724164143204689, "kl": 0.44208771362900734, "learning_rate": 9.998724781505122e-06, "loss": 0.012, "step": 1696, "step_time": 2.9671493130153976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3744680332019925, "epoch": 0.01697, "frac_reward_zero_std": 0.5, "grad_norm": 2.4177207946777344, "kl": 0.41499701142311096, "learning_rate": 9.998723244721738e-06, "loss": -0.0083, "num_tokens": 14951826.0, "reward": 0.5976442098617554, "reward_std": 0.04555399715900421, "rewards/rollout_reward_func/mean": 0.5976442098617554, "rewards/rollout_reward_func/std": 0.2468123733997345, "sampling/importance_sampling_ratio/max": 2.032768487930298, "sampling/importance_sampling_ratio/mean": 0.916813313961029, "sampling/importance_sampling_ratio/min": 0.00619856221601367, "sampling/sampling_logp_difference/max": 1.4704030752182007, "sampling/sampling_logp_difference/mean": 0.06624675542116165, "step": 1697, "step_time": 4.122943410999142 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.08333333535119891, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08854166883975267, "entropy": 0.6267013773322105, "epoch": 0.01698, "grad_norm": 0.10420946031808853, "kl": 0.418423168361187, "learning_rate": 9.99872170701307e-06, "loss": -0.0148, "step": 1698, "step_time": 2.0314436919943546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.22297216579318047, "epoch": 0.01699, "frac_reward_zero_std": 0.75, "grad_norm": 0.0087937256321311, "kl": 0.44826363772153854, "learning_rate": 9.99872016837912e-06, "loss": -0.0169, "num_tokens": 14968664.0, "reward": 0.8365576863288879, "reward_std": 0.006799111608415842, "rewards/rollout_reward_func/mean": 0.8365576863288879, "rewards/rollout_reward_func/std": 0.3013283908367157, "sampling/importance_sampling_ratio/max": 1.0004924535751343, "sampling/importance_sampling_ratio/mean": 0.9672209024429321, "sampling/importance_sampling_ratio/min": 0.040535587817430496, "sampling/sampling_logp_difference/max": 1.8606069087982178, "sampling/sampling_logp_difference/mean": 0.017458483576774597, "step": 1699, "step_time": 4.17785912900581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22307574655860662, "epoch": 0.017, "grad_norm": 0.008837814442813396, "kl": 0.4541296400129795, "learning_rate": 9.998718628819883e-06, "loss": -0.0168, "step": 1700, "step_time": 2.0257163800124545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06764232600107789, "epoch": 0.01701, "frac_reward_zero_std": 1.0, "grad_norm": 0.000606645829975605, "kl": 0.4116152562201023, "learning_rate": 9.998717088335366e-06, "loss": 0.0015, "num_tokens": 14985656.0, "reward": 0.8036538362503052, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8036538362503052, "rewards/rollout_reward_func/std": 0.3269372880458832, "sampling/importance_sampling_ratio/max": 1.0027682781219482, "sampling/importance_sampling_ratio/mean": 0.9968173503875732, "sampling/importance_sampling_ratio/min": 0.9941591024398804, "sampling/sampling_logp_difference/max": 0.005205769091844559, "sampling/sampling_logp_difference/mean": 0.0010425953660160303, "step": 1701, "step_time": 4.379832127997361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06894785864278674, "epoch": 0.01702, "grad_norm": 0.0006092392723076046, "kl": 0.4114433526992798, "learning_rate": 9.998715546925565e-06, "loss": 0.0015, "step": 1702, "step_time": 2.9496992870117538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.061283010989427567, "epoch": 0.01703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006165958475321531, "kl": 0.3756319433450699, "learning_rate": 9.998714004590481e-06, "loss": 0.0016, "num_tokens": 15004296.0, "reward": 0.9629230499267578, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9629230499267578, "rewards/rollout_reward_func/std": 0.4949914813041687, "sampling/importance_sampling_ratio/max": 1.0013890266418457, "sampling/importance_sampling_ratio/mean": 0.9963913559913635, "sampling/importance_sampling_ratio/min": 0.9933030009269714, "sampling/sampling_logp_difference/max": 0.005997989326715469, "sampling/sampling_logp_difference/mean": 0.0008596397237852216, "step": 1703, "step_time": 4.377227097997093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06230418058112264, "epoch": 0.01704, "grad_norm": 0.0006268388824537396, "kl": 0.3754698745906353, "learning_rate": 9.998712461330117e-06, "loss": 0.0016, "step": 1704, "step_time": 2.018104107999534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.16045693773776293, "epoch": 0.01705, "frac_reward_zero_std": 0.5, "grad_norm": 0.0526445135474205, "kl": 0.5225632078945637, "learning_rate": 9.99871091714447e-06, "loss": -0.0451, "num_tokens": 15021560.0, "reward": 0.8990384340286255, "reward_std": 0.24204808473587036, "rewards/rollout_reward_func/mean": 0.8990384340286255, "rewards/rollout_reward_func/std": 0.5671510100364685, "sampling/importance_sampling_ratio/max": 1.0051597356796265, "sampling/importance_sampling_ratio/mean": 0.9459241628646851, "sampling/importance_sampling_ratio/min": 0.12795858085155487, "sampling/sampling_logp_difference/max": 1.9259960651397705, "sampling/sampling_logp_difference/mean": 0.0190387275069952, "step": 1705, "step_time": 4.284631795009773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1603955957107246, "epoch": 0.01706, "grad_norm": 0.056102875620126724, "kl": 0.5087658762931824, "learning_rate": 9.998709372033544e-06, "loss": -0.045, "step": 1706, "step_time": 2.012067895004293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2986727519892156, "epoch": 0.01707, "frac_reward_zero_std": 0.75, "grad_norm": 0.03843521326780319, "kl": 0.509579211473465, "learning_rate": 9.998707825997335e-06, "loss": 0.0379, "num_tokens": 15038920.0, "reward": 0.33141347765922546, "reward_std": 0.04231315478682518, "rewards/rollout_reward_func/mean": 0.33141347765922546, "rewards/rollout_reward_func/std": 0.15654557943344116, "sampling/importance_sampling_ratio/max": 0.999606728553772, "sampling/importance_sampling_ratio/mean": 0.936455488204956, "sampling/importance_sampling_ratio/min": 0.0037050882820039988, "sampling/sampling_logp_difference/max": 1.8655736446380615, "sampling/sampling_logp_difference/mean": 0.035573944449424744, "step": 1707, "step_time": 4.228179347999685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30041892314329743, "epoch": 0.01708, "grad_norm": 0.0390002615749836, "kl": 0.5037496872246265, "learning_rate": 9.998706279035846e-06, "loss": 0.0379, "step": 1708, "step_time": 2.8583529989919043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24457608349621296, "epoch": 0.01709, "frac_reward_zero_std": 0.75, "grad_norm": 0.009551143273711205, "kl": 0.399872500449419, "learning_rate": 9.998704731149078e-06, "loss": -0.0171, "num_tokens": 15055600.0, "reward": 0.5619423389434814, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.5619423389434814, "rewards/rollout_reward_func/std": 0.3368881046772003, "sampling/importance_sampling_ratio/max": 1.0023878812789917, "sampling/importance_sampling_ratio/mean": 0.9658031463623047, "sampling/importance_sampling_ratio/min": 0.03776935860514641, "sampling/sampling_logp_difference/max": 1.869966745376587, "sampling/sampling_logp_difference/mean": 0.01800326630473137, "step": 1709, "step_time": 4.098448352997366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2469344651326537, "epoch": 0.0171, "grad_norm": 0.008821941912174225, "kl": 0.40693310648202896, "learning_rate": 9.998703182337028e-06, "loss": -0.0171, "step": 1710, "step_time": 1.9878741409847862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.06759660132229328, "epoch": 0.01711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006194260786287487, "kl": 0.40397434309124947, "learning_rate": 9.998701632599702e-06, "loss": 0.0015, "num_tokens": 15072592.0, "reward": 0.944038450717926, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.944038450717926, "rewards/rollout_reward_func/std": 0.3748054504394531, "sampling/importance_sampling_ratio/max": 1.0040696859359741, "sampling/importance_sampling_ratio/mean": 0.997165322303772, "sampling/importance_sampling_ratio/min": 0.9934503436088562, "sampling/sampling_logp_difference/max": 0.004836335778236389, "sampling/sampling_logp_difference/mean": 0.0008447661530226469, "step": 1711, "step_time": 4.058718088002934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06798206642270088, "epoch": 0.01712, "grad_norm": 0.0006200972711667418, "kl": 0.4039200693368912, "learning_rate": 9.998700081937093e-06, "loss": 0.0015, "step": 1712, "step_time": 2.0038776519941166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9154915828257799, "epoch": 0.01713, "frac_reward_zero_std": 0.25, "grad_norm": 0.02625667117536068, "kl": 0.4679213836789131, "learning_rate": 9.99869853034921e-06, "loss": 0.0347, "num_tokens": 15088746.0, "reward": 0.7588870525360107, "reward_std": 0.03818625211715698, "rewards/rollout_reward_func/mean": 0.7588870525360107, "rewards/rollout_reward_func/std": 0.44947487115859985, "sampling/importance_sampling_ratio/max": 1.0026131868362427, "sampling/importance_sampling_ratio/mean": 0.874229371547699, "sampling/importance_sampling_ratio/min": 1.151143513844488e-09, "sampling/sampling_logp_difference/max": 12.172870635986328, "sampling/sampling_logp_difference/mean": 0.20948541164398193, "step": 1713, "step_time": 4.199032732991327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9231565520167351, "epoch": 0.01714, "grad_norm": 0.025766875594854355, "kl": 0.45987583324313164, "learning_rate": 9.998696977836046e-06, "loss": 0.0346, "step": 1714, "step_time": 2.487446921019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23684251587837934, "epoch": 0.01715, "frac_reward_zero_std": 0.75, "grad_norm": 0.016041312366724014, "kl": 0.44067759811878204, "learning_rate": 9.998695424397606e-06, "loss": 0.0202, "num_tokens": 15104694.0, "reward": 0.7422307729721069, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.7422307729721069, "rewards/rollout_reward_func/std": 0.4298645257949829, "sampling/importance_sampling_ratio/max": 0.9983764290809631, "sampling/importance_sampling_ratio/mean": 0.9643443822860718, "sampling/importance_sampling_ratio/min": 0.018322806805372238, "sampling/sampling_logp_difference/max": 1.875434398651123, "sampling/sampling_logp_difference/mean": 0.02538393624126911, "step": 1715, "step_time": 4.063817543006735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24130593799054623, "epoch": 0.01716, "grad_norm": 0.01556242536753416, "kl": 0.4370507411658764, "learning_rate": 9.998693870033887e-06, "loss": 0.0202, "step": 1716, "step_time": 1.997096697014058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.36583282332867384, "epoch": 0.01717, "frac_reward_zero_std": 0.75, "grad_norm": 0.02441132813692093, "kl": 0.5093170516192913, "learning_rate": 9.998692314744891e-06, "loss": -0.0341, "num_tokens": 15120426.0, "reward": 0.40971633791923523, "reward_std": 0.1577799916267395, "rewards/rollout_reward_func/mean": 0.40971633791923523, "rewards/rollout_reward_func/std": 0.30861636996269226, "sampling/importance_sampling_ratio/max": 0.9973319172859192, "sampling/importance_sampling_ratio/mean": 0.933010458946228, "sampling/importance_sampling_ratio/min": 0.010163449682295322, "sampling/sampling_logp_difference/max": 2.0521559715270996, "sampling/sampling_logp_difference/mean": 0.04118757322430611, "step": 1717, "step_time": 4.059108517001732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3653391133993864, "epoch": 0.01718, "grad_norm": 0.024733996018767357, "kl": 0.5085781663656235, "learning_rate": 9.99869075853062e-06, "loss": -0.0341, "step": 1718, "step_time": 2.0161715620051837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.16963788028806448, "epoch": 0.01719, "frac_reward_zero_std": 1.0, "grad_norm": 0.015016177669167519, "kl": 0.404816459864378, "learning_rate": 9.998689201391071e-06, "loss": 0.0016, "num_tokens": 15137906.0, "reward": 0.38807690143585205, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.38807690143585205, "rewards/rollout_reward_func/std": 0.10908772796392441, "sampling/importance_sampling_ratio/max": 1.0521104335784912, "sampling/importance_sampling_ratio/mean": 0.9403842091560364, "sampling/importance_sampling_ratio/min": 0.08571969717741013, "sampling/sampling_logp_difference/max": 1.432899832725525, "sampling/sampling_logp_difference/mean": 0.023861682042479515, "step": 1719, "step_time": 4.2662431500066305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1753991087898612, "epoch": 0.0172, "grad_norm": 0.02188171073794365, "kl": 0.41875849291682243, "learning_rate": 9.998687643326247e-06, "loss": 0.0017, "step": 1720, "step_time": 2.935613817004196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 191.5625, "completions/mean_terminated_length": 191.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.54151994548738, "epoch": 0.01721, "frac_reward_zero_std": 0.5, "grad_norm": 0.5790280699729919, "kl": 0.6366363018751144, "learning_rate": 9.99868608433615e-06, "loss": 0.0283, "num_tokens": 15154860.0, "reward": 0.6994985342025757, "reward_std": 0.033899277448654175, "rewards/rollout_reward_func/mean": 0.6994985342025757, "rewards/rollout_reward_func/std": 0.29611510038375854, "sampling/importance_sampling_ratio/max": 1.0020942687988281, "sampling/importance_sampling_ratio/mean": 0.8982974290847778, "sampling/importance_sampling_ratio/min": 2.5646781214320926e-16, "sampling/sampling_logp_difference/max": 3.297682762145996, "sampling/sampling_logp_difference/mean": 0.1851041615009308, "step": 1721, "step_time": 4.28589358300087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.07291666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07291666883975267, "entropy": 0.8636560952290893, "epoch": 0.01722, "grad_norm": 0.22485525906085968, "kl": 0.6528852432966232, "learning_rate": 9.998684524420775e-06, "loss": 0.0263, "step": 1722, "step_time": 2.0312947170095867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 234.03125, "completions/mean_terminated_length": 234.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8138817856088281, "epoch": 0.01723, "frac_reward_zero_std": 0.25, "grad_norm": 0.06809303909540176, "kl": 0.447437547147274, "learning_rate": 9.998682963580126e-06, "loss": -0.0275, "num_tokens": 15173061.0, "reward": 0.5286716222763062, "reward_std": 0.2123005986213684, "rewards/rollout_reward_func/mean": 0.5286716222763062, "rewards/rollout_reward_func/std": 0.3247484564781189, "sampling/importance_sampling_ratio/max": 1.006402850151062, "sampling/importance_sampling_ratio/mean": 0.8471424579620361, "sampling/importance_sampling_ratio/min": 2.7138508812640794e-05, "sampling/sampling_logp_difference/max": 1.951966643333435, "sampling/sampling_logp_difference/mean": 0.11080383509397507, "step": 1723, "step_time": 4.6653890470042825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7990145022049546, "epoch": 0.01724, "grad_norm": 0.07265368103981018, "kl": 0.44142184033989906, "learning_rate": 9.998681401814203e-06, "loss": -0.0277, "step": 1724, "step_time": 2.065690743998857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 128.78125, "completions/mean_terminated_length": 128.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.712526572868228, "epoch": 0.01725, "frac_reward_zero_std": 0.25, "grad_norm": 0.10111408680677414, "kl": 0.504967987537384, "learning_rate": 9.998679839123007e-06, "loss": -0.0213, "num_tokens": 15188126.0, "reward": 0.6267788410186768, "reward_std": 0.08226915448904037, "rewards/rollout_reward_func/mean": 0.6267788410186768, "rewards/rollout_reward_func/std": 0.29834866523742676, "sampling/importance_sampling_ratio/max": 1.007355809211731, "sampling/importance_sampling_ratio/mean": 0.9198408126831055, "sampling/importance_sampling_ratio/min": 1.2080114686341403e-09, "sampling/sampling_logp_difference/max": 3.505340576171875, "sampling/sampling_logp_difference/mean": 0.16872259974479675, "step": 1725, "step_time": 4.386343616999511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7072789873927832, "epoch": 0.01726, "grad_norm": 0.098878875374794, "kl": 0.49345763027668, "learning_rate": 9.998678275506536e-06, "loss": -0.0214, "step": 1726, "step_time": 2.014395456018974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.11420039087533951, "epoch": 0.01727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012119164457544684, "kl": 0.3698524497449398, "learning_rate": 9.998676710964793e-06, "loss": 0.0015, "num_tokens": 15205622.0, "reward": 0.8967692255973816, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8967692255973816, "rewards/rollout_reward_func/std": 0.38943788409233093, "sampling/importance_sampling_ratio/max": 1.0024669170379639, "sampling/importance_sampling_ratio/mean": 0.9857826828956604, "sampling/importance_sampling_ratio/min": 0.9689094424247742, "sampling/sampling_logp_difference/max": 0.017125606536865234, "sampling/sampling_logp_difference/mean": 0.002289790892973542, "step": 1727, "step_time": 4.34295397499227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12252762168645859, "epoch": 0.01728, "grad_norm": 0.001310003106482327, "kl": 0.36847182363271713, "learning_rate": 9.998675145497776e-06, "loss": 0.0015, "step": 1728, "step_time": 2.017722471005982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.40187790244817734, "epoch": 0.01729, "frac_reward_zero_std": 0.5, "grad_norm": 0.07951133698225021, "kl": 0.39977674186229706, "learning_rate": 9.998673579105487e-06, "loss": 0.0072, "num_tokens": 15220310.0, "reward": 0.34254807233810425, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.34254807233810425, "rewards/rollout_reward_func/std": 0.20888805389404297, "sampling/importance_sampling_ratio/max": 1.0515397787094116, "sampling/importance_sampling_ratio/mean": 0.9378179311752319, "sampling/importance_sampling_ratio/min": 0.0056472173891961575, "sampling/sampling_logp_difference/max": 1.674485445022583, "sampling/sampling_logp_difference/mean": 0.04949873685836792, "step": 1729, "step_time": 3.8381216660054633 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.4122457653284073, "epoch": 0.0173, "grad_norm": 0.055186402052640915, "kl": 0.39107729867100716, "learning_rate": 9.998672011787927e-06, "loss": 0.0068, "step": 1730, "step_time": 1.9832923959984328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 234.5625, "completions/mean_terminated_length": 234.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.988484088331461, "epoch": 0.01731, "frac_reward_zero_std": 0.0, "grad_norm": 0.19548462331295013, "kl": 0.4489138536155224, "learning_rate": 9.998670443545094e-06, "loss": -0.0982, "num_tokens": 15238552.0, "reward": 0.7235240936279297, "reward_std": 0.5537341237068176, "rewards/rollout_reward_func/mean": 0.7235240936279297, "rewards/rollout_reward_func/std": 0.7212475538253784, "sampling/importance_sampling_ratio/max": 0.9863024353981018, "sampling/importance_sampling_ratio/mean": 0.7548806667327881, "sampling/importance_sampling_ratio/min": 0.0019519649213179946, "sampling/sampling_logp_difference/max": 1.86152982711792, "sampling/sampling_logp_difference/mean": 0.1412922888994217, "step": 1731, "step_time": 5.082343752997986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9759183824062347, "epoch": 0.01732, "grad_norm": 0.16515153646469116, "kl": 0.4463721439242363, "learning_rate": 9.99866887437699e-06, "loss": -0.0985, "step": 1732, "step_time": 2.0761289750007563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 267.15625, "completions/mean_terminated_length": 267.15625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.6930030193179846, "epoch": 0.01733, "frac_reward_zero_std": 0.25, "grad_norm": 0.0729670599102974, "kl": 0.3629106879234314, "learning_rate": 9.998667304283616e-06, "loss": -0.0316, "num_tokens": 15257893.0, "reward": 0.9870768785476685, "reward_std": 0.19863790273666382, "rewards/rollout_reward_func/mean": 0.9870768785476685, "rewards/rollout_reward_func/std": 0.5113009810447693, "sampling/importance_sampling_ratio/max": 0.9919124841690063, "sampling/importance_sampling_ratio/mean": 0.8046774864196777, "sampling/importance_sampling_ratio/min": 1.9012988961009114e-09, "sampling/sampling_logp_difference/max": 13.890120506286621, "sampling/sampling_logp_difference/mean": 0.1372475028038025, "step": 1733, "step_time": 4.616130155009159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6893630996346474, "epoch": 0.01734, "grad_norm": 0.07337739318609238, "kl": 0.3614082708954811, "learning_rate": 9.998665733264971e-06, "loss": -0.0317, "step": 1734, "step_time": 2.0355121520042303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42043156921863556, "epoch": 0.01735, "frac_reward_zero_std": 0.75, "grad_norm": 0.08900536596775055, "kl": 0.43445683270692825, "learning_rate": 9.998664161321058e-06, "loss": -0.0013, "num_tokens": 15272569.0, "reward": 0.47331732511520386, "reward_std": 0.04591786488890648, "rewards/rollout_reward_func/mean": 0.47331732511520386, "rewards/rollout_reward_func/std": 0.1292051374912262, "sampling/importance_sampling_ratio/max": 1.0180391073226929, "sampling/importance_sampling_ratio/mean": 0.9302884340286255, "sampling/importance_sampling_ratio/min": 0.038270048797130585, "sampling/sampling_logp_difference/max": 1.621798038482666, "sampling/sampling_logp_difference/mean": 0.03869013488292694, "step": 1735, "step_time": 3.9233817310159793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4190717376768589, "epoch": 0.01736, "grad_norm": 0.09355803579092026, "kl": 0.4320792406797409, "learning_rate": 9.998662588451873e-06, "loss": -0.0018, "step": 1736, "step_time": 2.022327617974952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 165.78125, "completions/mean_terminated_length": 165.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.31664688140153885, "epoch": 0.01737, "frac_reward_zero_std": 0.5, "grad_norm": 0.07937102764844894, "kl": 0.3808601200580597, "learning_rate": 9.99866101465742e-06, "loss": -0.038, "num_tokens": 15288698.0, "reward": 0.9020192623138428, "reward_std": 0.046505868434906006, "rewards/rollout_reward_func/mean": 0.9020192623138428, "rewards/rollout_reward_func/std": 0.2991913855075836, "sampling/importance_sampling_ratio/max": 0.9959672689437866, "sampling/importance_sampling_ratio/mean": 0.9397785663604736, "sampling/importance_sampling_ratio/min": 0.08278917521238327, "sampling/sampling_logp_difference/max": 1.348799467086792, "sampling/sampling_logp_difference/mean": 0.023603565990924835, "step": 1737, "step_time": 4.932026962997043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3157038725912571, "epoch": 0.01738, "grad_norm": 0.07664042711257935, "kl": 0.3821520358324051, "learning_rate": 9.998659439937697e-06, "loss": -0.0382, "step": 1738, "step_time": 2.01451081099367 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.9756291750818491, "epoch": 0.01739, "frac_reward_zero_std": 0.0, "grad_norm": 0.06890659034252167, "kl": 0.43000686541199684, "learning_rate": 9.998657864292708e-06, "loss": -0.0736, "num_tokens": 15307538.0, "reward": 0.32682546973228455, "reward_std": 0.3044935464859009, "rewards/rollout_reward_func/mean": 0.32682546973228455, "rewards/rollout_reward_func/std": 0.40428048372268677, "sampling/importance_sampling_ratio/max": 1.0944840908050537, "sampling/importance_sampling_ratio/mean": 0.7430999279022217, "sampling/importance_sampling_ratio/min": 3.537044879209361e-09, "sampling/sampling_logp_difference/max": 3.692075252532959, "sampling/sampling_logp_difference/mean": 0.15037459135055542, "step": 1739, "step_time": 4.942558909977379 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.95363624766469, "epoch": 0.0174, "grad_norm": 0.0694432258605957, "kl": 0.4260658994317055, "learning_rate": 9.998656287722448e-06, "loss": -0.0743, "step": 1740, "step_time": 2.105182101004175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.45202447660267353, "epoch": 0.01741, "frac_reward_zero_std": 0.25, "grad_norm": 0.16555562615394592, "kl": 0.43779537081718445, "learning_rate": 9.998654710226923e-06, "loss": -0.0069, "num_tokens": 15324248.0, "reward": 0.8574182987213135, "reward_std": 0.27865803241729736, "rewards/rollout_reward_func/mean": 0.8574182987213135, "rewards/rollout_reward_func/std": 0.48771771788597107, "sampling/importance_sampling_ratio/max": 0.9978069067001343, "sampling/importance_sampling_ratio/mean": 0.8847292065620422, "sampling/importance_sampling_ratio/min": 0.21871329843997955, "sampling/sampling_logp_difference/max": 1.410558819770813, "sampling/sampling_logp_difference/mean": 0.03652844950556755, "step": 1741, "step_time": 4.24186911599827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41735171526670456, "epoch": 0.01742, "grad_norm": 0.15750813484191895, "kl": 0.44476528838276863, "learning_rate": 9.998653131806129e-06, "loss": -0.0077, "step": 1742, "step_time": 2.046753083996009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.288775984197855, "epoch": 0.01743, "frac_reward_zero_std": 0.75, "grad_norm": 0.01852676272392273, "kl": 0.40105951949954033, "learning_rate": 9.998651552460068e-06, "loss": -0.0178, "num_tokens": 15340340.0, "reward": 0.7515528798103333, "reward_std": 0.010824700817465782, "rewards/rollout_reward_func/mean": 0.7515528798103333, "rewards/rollout_reward_func/std": 0.18189223110675812, "sampling/importance_sampling_ratio/max": 1.001623511314392, "sampling/importance_sampling_ratio/mean": 0.9379572868347168, "sampling/importance_sampling_ratio/min": 0.048236750066280365, "sampling/sampling_logp_difference/max": 1.7908093929290771, "sampling/sampling_logp_difference/mean": 0.027970556169748306, "step": 1743, "step_time": 5.097300191002432 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0096726194024086, "entropy": 0.2710824627429247, "epoch": 0.01744, "grad_norm": 0.017495298758149147, "kl": 0.40543780103325844, "learning_rate": 9.998649972188742e-06, "loss": -0.0178, "step": 1744, "step_time": 2.0264488329994492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.40625, "completions/mean_terminated_length": 191.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.369287334382534, "epoch": 0.01745, "frac_reward_zero_std": 0.25, "grad_norm": 0.13775302469730377, "kl": 0.42370031028985977, "learning_rate": 9.998648390992147e-06, "loss": -0.026, "num_tokens": 15357233.0, "reward": 0.7257547974586487, "reward_std": 0.17659993469715118, "rewards/rollout_reward_func/mean": 0.7257547974586487, "rewards/rollout_reward_func/std": 0.5342155694961548, "sampling/importance_sampling_ratio/max": 1.012713074684143, "sampling/importance_sampling_ratio/mean": 0.9185342788696289, "sampling/importance_sampling_ratio/min": 1.8320743810917861e-09, "sampling/sampling_logp_difference/max": 16.010255813598633, "sampling/sampling_logp_difference/mean": 0.1170935109257698, "step": 1745, "step_time": 4.251670890007517 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.3510068552568555, "epoch": 0.01746, "grad_norm": 0.09370867162942886, "kl": 0.4257640242576599, "learning_rate": 9.998646808870288e-06, "loss": -0.0266, "step": 1746, "step_time": 2.046354372992937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.34510838706046343, "epoch": 0.01747, "frac_reward_zero_std": 0.25, "grad_norm": 0.09437357634305954, "kl": 0.4314890280365944, "learning_rate": 9.998645225823162e-06, "loss": -0.0711, "num_tokens": 15375393.0, "reward": 0.9229807257652283, "reward_std": 0.26482394337654114, "rewards/rollout_reward_func/mean": 0.9229807257652283, "rewards/rollout_reward_func/std": 0.399507611989975, "sampling/importance_sampling_ratio/max": 1.0007312297821045, "sampling/importance_sampling_ratio/mean": 0.8854250907897949, "sampling/importance_sampling_ratio/min": 0.1370140016078949, "sampling/sampling_logp_difference/max": 1.7125847339630127, "sampling/sampling_logp_difference/mean": 0.03202638775110245, "step": 1747, "step_time": 4.400357181999425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3329862551763654, "epoch": 0.01748, "grad_norm": 0.08833570033311844, "kl": 0.4450780786573887, "learning_rate": 9.998643641850773e-06, "loss": -0.0718, "step": 1748, "step_time": 2.081234282995865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 159.46875, "completions/mean_terminated_length": 159.46875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.21156027168035507, "epoch": 0.01749, "frac_reward_zero_std": 0.5, "grad_norm": 0.11544597893953323, "kl": 0.43371811509132385, "learning_rate": 9.99864205695312e-06, "loss": 0.0132, "num_tokens": 15391408.0, "reward": 0.6438941955566406, "reward_std": 0.09558644890785217, "rewards/rollout_reward_func/mean": 0.6438941955566406, "rewards/rollout_reward_func/std": 0.36191174387931824, "sampling/importance_sampling_ratio/max": 1.0041810274124146, "sampling/importance_sampling_ratio/mean": 0.9501972794532776, "sampling/importance_sampling_ratio/min": 0.21196332573890686, "sampling/sampling_logp_difference/max": 1.5452840328216553, "sampling/sampling_logp_difference/mean": 0.015035000629723072, "step": 1749, "step_time": 5.217422578993137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20911389496177435, "epoch": 0.0175, "grad_norm": 0.1169562116265297, "kl": 0.44089698418974876, "learning_rate": 9.998640471130202e-06, "loss": 0.013, "step": 1750, "step_time": 1.998495533989626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.474196401424706, "epoch": 0.01751, "frac_reward_zero_std": 0.25, "grad_norm": 0.10758908838033676, "kl": 0.46035129949450493, "learning_rate": 9.998638884382021e-06, "loss": -0.0283, "num_tokens": 15407218.0, "reward": 0.8620120286941528, "reward_std": 0.12007009238004684, "rewards/rollout_reward_func/mean": 0.8620120286941528, "rewards/rollout_reward_func/std": 0.27813923358917236, "sampling/importance_sampling_ratio/max": 1.007938027381897, "sampling/importance_sampling_ratio/mean": 0.8932405710220337, "sampling/importance_sampling_ratio/min": 0.0008304569637402892, "sampling/sampling_logp_difference/max": 3.340865135192871, "sampling/sampling_logp_difference/mean": 0.07209265232086182, "step": 1751, "step_time": 4.167160446995695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4807064300402999, "epoch": 0.01752, "grad_norm": 0.10778996348381042, "kl": 0.46959253028035164, "learning_rate": 9.998637296708576e-06, "loss": -0.0282, "step": 1752, "step_time": 2.058358013004181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 245.9375, "completions/mean_terminated_length": 245.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.37803055811673403, "epoch": 0.01753, "frac_reward_zero_std": 0.5, "grad_norm": 0.1951420158147812, "kl": 0.36863426864147186, "learning_rate": 9.998635708109868e-06, "loss": -0.037, "num_tokens": 15425768.0, "reward": 0.8110000491142273, "reward_std": 0.018029311671853065, "rewards/rollout_reward_func/mean": 0.8110000491142273, "rewards/rollout_reward_func/std": 0.5121763348579407, "sampling/importance_sampling_ratio/max": 0.9999522566795349, "sampling/importance_sampling_ratio/mean": 0.9135450124740601, "sampling/importance_sampling_ratio/min": 7.147051655920222e-05, "sampling/sampling_logp_difference/max": 4.037410736083984, "sampling/sampling_logp_difference/mean": 0.06410009413957596, "step": 1753, "step_time": 4.8382214560042485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37451296485960484, "epoch": 0.01754, "grad_norm": 0.15951815247535706, "kl": 0.3711574710905552, "learning_rate": 9.998634118585898e-06, "loss": -0.0376, "step": 1754, "step_time": 2.51884807299939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2790043940767646, "epoch": 0.01755, "frac_reward_zero_std": 0.0, "grad_norm": 0.373698890209198, "kl": 0.5170216634869576, "learning_rate": 9.998632528136664e-06, "loss": 0.005, "num_tokens": 15440586.0, "reward": 0.7401922941207886, "reward_std": 0.07590723037719727, "rewards/rollout_reward_func/mean": 0.7401922941207886, "rewards/rollout_reward_func/std": 0.2840173840522766, "sampling/importance_sampling_ratio/max": 1.0050253868103027, "sampling/importance_sampling_ratio/mean": 0.9417450428009033, "sampling/importance_sampling_ratio/min": 0.15759027004241943, "sampling/sampling_logp_difference/max": 1.785563588142395, "sampling/sampling_logp_difference/mean": 0.0266650952398777, "step": 1755, "step_time": 4.331361855001887 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 0.28343069832772017, "epoch": 0.01756, "grad_norm": 0.3404405415058136, "kl": 0.5089765787124634, "learning_rate": 9.998630936762168e-06, "loss": 0.0019, "step": 1756, "step_time": 1.991969781993248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.1875, "completions/mean_terminated_length": 195.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.1951498193666339, "epoch": 0.01757, "frac_reward_zero_std": 0.5, "grad_norm": 0.07335655391216278, "kl": 0.35643190145492554, "learning_rate": 9.998629344462411e-06, "loss": -0.0014, "num_tokens": 15457512.0, "reward": 0.983024001121521, "reward_std": 0.020329318940639496, "rewards/rollout_reward_func/mean": 0.983024001121521, "rewards/rollout_reward_func/std": 0.31911471486091614, "sampling/importance_sampling_ratio/max": 1.001659631729126, "sampling/importance_sampling_ratio/mean": 0.9456677436828613, "sampling/importance_sampling_ratio/min": 0.49645695090293884, "sampling/sampling_logp_difference/max": 0.45619189739227295, "sampling/sampling_logp_difference/mean": 0.012928520329296589, "step": 1757, "step_time": 4.321497099008411 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.20345377922058105, "epoch": 0.01758, "grad_norm": 0.11473013460636139, "kl": 0.34681858867406845, "learning_rate": 9.998627751237395e-06, "loss": -0.0015, "step": 1758, "step_time": 2.0487167929968564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 195.8125, "completions/mean_terminated_length": 195.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.38423866406083107, "epoch": 0.01759, "frac_reward_zero_std": 0.25, "grad_norm": 0.07282320410013199, "kl": 0.4336170516908169, "learning_rate": 9.998626157087118e-06, "loss": -0.0324, "num_tokens": 15474602.0, "reward": 0.926677942276001, "reward_std": 0.09729386121034622, "rewards/rollout_reward_func/mean": 0.926677942276001, "rewards/rollout_reward_func/std": 0.26710689067840576, "sampling/importance_sampling_ratio/max": 1.052276849746704, "sampling/importance_sampling_ratio/mean": 0.9101758003234863, "sampling/importance_sampling_ratio/min": 0.08054553717374802, "sampling/sampling_logp_difference/max": 1.9926174879074097, "sampling/sampling_logp_difference/mean": 0.039609987288713455, "step": 1759, "step_time": 4.3387800799973775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3867973331362009, "epoch": 0.0176, "grad_norm": 0.07326608896255493, "kl": 0.43370579928159714, "learning_rate": 9.998624562011578e-06, "loss": -0.0323, "step": 1760, "step_time": 2.5459854370055837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.37476349994540215, "epoch": 0.01761, "frac_reward_zero_std": 0.25, "grad_norm": 0.356594055891037, "kl": 0.3775424212217331, "learning_rate": 9.998622966010781e-06, "loss": -0.0585, "num_tokens": 15491046.0, "reward": 1.0276249647140503, "reward_std": 0.1362428069114685, "rewards/rollout_reward_func/mean": 1.0276249647140503, "rewards/rollout_reward_func/std": 0.1695733666419983, "sampling/importance_sampling_ratio/max": 1.1053968667984009, "sampling/importance_sampling_ratio/mean": 0.9313703775405884, "sampling/importance_sampling_ratio/min": 0.030171066522598267, "sampling/sampling_logp_difference/max": 1.825290560722351, "sampling/sampling_logp_difference/mean": 0.03687788546085358, "step": 1761, "step_time": 4.948696605017176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38402676582336426, "epoch": 0.01762, "grad_norm": 0.3610471189022064, "kl": 0.3846072666347027, "learning_rate": 9.998621369084724e-06, "loss": -0.0603, "step": 1762, "step_time": 2.0643604319993756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 279.5625, "completions/mean_terminated_length": 279.5625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.24266948457807302, "epoch": 0.01763, "frac_reward_zero_std": 0.25, "grad_norm": 0.13327531516551971, "kl": 0.4720405377447605, "learning_rate": 9.998619771233407e-06, "loss": -0.0601, "num_tokens": 15510616.0, "reward": 0.7708653807640076, "reward_std": 0.0283298809081316, "rewards/rollout_reward_func/mean": 0.7708653807640076, "rewards/rollout_reward_func/std": 0.4549866020679474, "sampling/importance_sampling_ratio/max": 1.0103222131729126, "sampling/importance_sampling_ratio/mean": 0.9268355369567871, "sampling/importance_sampling_ratio/min": 0.0795130655169487, "sampling/sampling_logp_difference/max": 1.4684109687805176, "sampling/sampling_logp_difference/mean": 0.020711246877908707, "step": 1763, "step_time": 4.577317078983469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017299107741564512, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017299107741564512, "entropy": 0.24976100958883762, "epoch": 0.01764, "grad_norm": 0.14812125265598297, "kl": 0.504746675491333, "learning_rate": 9.998618172456832e-06, "loss": -0.0597, "step": 1764, "step_time": 2.0356356289994437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 137.46875, "completions/mean_terminated_length": 137.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1678012888878584, "epoch": 0.01765, "frac_reward_zero_std": 0.75, "grad_norm": 0.08705423772335052, "kl": 0.37609635666012764, "learning_rate": 9.998616572754998e-06, "loss": -0.0284, "num_tokens": 15525639.0, "reward": 0.5804375410079956, "reward_std": 0.014318910427391529, "rewards/rollout_reward_func/mean": 0.5804375410079956, "rewards/rollout_reward_func/std": 0.16627994179725647, "sampling/importance_sampling_ratio/max": 1.0052661895751953, "sampling/importance_sampling_ratio/mean": 0.9639570713043213, "sampling/importance_sampling_ratio/min": 0.29709991812705994, "sampling/sampling_logp_difference/max": 1.192205786705017, "sampling/sampling_logp_difference/mean": 0.010397640988230705, "step": 1765, "step_time": 4.145337739013485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.16080769710242748, "epoch": 0.01766, "grad_norm": 0.010549750179052353, "kl": 0.38962171971797943, "learning_rate": 9.998614972127907e-06, "loss": -0.0292, "step": 1766, "step_time": 2.514904942996509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2199552054516971, "epoch": 0.01767, "frac_reward_zero_std": 0.5, "grad_norm": 0.14426781237125397, "kl": 0.47263678908348083, "learning_rate": 9.99861337057556e-06, "loss": -0.0519, "num_tokens": 15542699.0, "reward": 1.1166346073150635, "reward_std": 0.054773587733507156, "rewards/rollout_reward_func/mean": 1.1166346073150635, "rewards/rollout_reward_func/std": 0.3344707489013672, "sampling/importance_sampling_ratio/max": 0.9981911182403564, "sampling/importance_sampling_ratio/mean": 0.9536033868789673, "sampling/importance_sampling_ratio/min": 0.013388740830123425, "sampling/sampling_logp_difference/max": 2.3722753524780273, "sampling/sampling_logp_difference/mean": 0.02733566239476204, "step": 1767, "step_time": 4.7462518729953445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.22657026909291744, "epoch": 0.01768, "grad_norm": 0.07449585944414139, "kl": 0.5116374157369137, "learning_rate": 9.998611768097952e-06, "loss": -0.0528, "step": 1768, "step_time": 2.0467915140106925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 146.65625, "completions/mean_terminated_length": 146.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5241316389292479, "epoch": 0.01769, "frac_reward_zero_std": 0.0, "grad_norm": 0.36061108112335205, "kl": 2.607178121805191, "learning_rate": 9.998610164695089e-06, "loss": -0.0456, "num_tokens": 15558184.0, "reward": 0.3740384578704834, "reward_std": 0.18008309602737427, "rewards/rollout_reward_func/mean": 0.3740384578704834, "rewards/rollout_reward_func/std": 0.2920316457748413, "sampling/importance_sampling_ratio/max": 1.0156255960464478, "sampling/importance_sampling_ratio/mean": 0.8717056512832642, "sampling/importance_sampling_ratio/min": 0.004635704215615988, "sampling/sampling_logp_difference/max": 3.0135438442230225, "sampling/sampling_logp_difference/mean": 0.08431798219680786, "step": 1769, "step_time": 4.097119782993104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5413967855274677, "epoch": 0.0177, "grad_norm": 0.24839435517787933, "kl": 1.922764752060175, "learning_rate": 9.99860856036697e-06, "loss": -0.047, "step": 1770, "step_time": 2.057465800004138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 185.3125, "completions/mean_terminated_length": 185.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3371766656637192, "epoch": 0.01771, "frac_reward_zero_std": 0.5, "grad_norm": 0.5091437697410583, "kl": 0.4239334836602211, "learning_rate": 9.998606955113595e-06, "loss": 0.0326, "num_tokens": 15574938.0, "reward": 0.5961250066757202, "reward_std": 0.08343169838190079, "rewards/rollout_reward_func/mean": 0.5961250066757202, "rewards/rollout_reward_func/std": 0.2110246866941452, "sampling/importance_sampling_ratio/max": 1.0450307130813599, "sampling/importance_sampling_ratio/mean": 0.9191174507141113, "sampling/importance_sampling_ratio/min": 0.16121245920658112, "sampling/sampling_logp_difference/max": 1.9370977878570557, "sampling/sampling_logp_difference/mean": 0.03867939114570618, "step": 1771, "step_time": 4.591037578007672 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.0515625006519258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08489583386108279, "entropy": 0.3190859127789736, "epoch": 0.01772, "grad_norm": 0.14865823090076447, "kl": 0.39821746200323105, "learning_rate": 9.998605348934965e-06, "loss": 0.0301, "step": 1772, "step_time": 2.506331417003821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 149.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6094068172387779, "epoch": 0.01773, "frac_reward_zero_std": 0.25, "grad_norm": 1.1888964176177979, "kl": 0.4462558291852474, "learning_rate": 9.99860374183108e-06, "loss": -0.0198, "num_tokens": 15590560.0, "reward": 0.5352452397346497, "reward_std": 0.03989000990986824, "rewards/rollout_reward_func/mean": 0.5352452397346497, "rewards/rollout_reward_func/std": 0.23032525181770325, "sampling/importance_sampling_ratio/max": 1.0635770559310913, "sampling/importance_sampling_ratio/mean": 0.9554369449615479, "sampling/importance_sampling_ratio/min": 1.3425392804641855e-16, "sampling/sampling_logp_difference/max": 3.155796766281128, "sampling/sampling_logp_difference/mean": 0.17995239794254303, "step": 1773, "step_time": 4.958013255003607 }, { "clip_ratio/high_max": 0.140625, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.8158172531984746, "epoch": 0.01774, "grad_norm": 0.1068691536784172, "kl": 0.5003527700901031, "learning_rate": 9.99860213380194e-06, "loss": -0.0275, "step": 1774, "step_time": 2.0313613019970944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 147.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2986103342846036, "epoch": 0.01775, "frac_reward_zero_std": 0.5, "grad_norm": 0.07032158225774765, "kl": 0.6356205195188522, "learning_rate": 9.998600524847546e-06, "loss": 0.0383, "num_tokens": 15605966.0, "reward": 0.834086537361145, "reward_std": 0.03222774714231491, "rewards/rollout_reward_func/mean": 0.834086537361145, "rewards/rollout_reward_func/std": 0.21738599240779877, "sampling/importance_sampling_ratio/max": 1.0068082809448242, "sampling/importance_sampling_ratio/mean": 0.9387609958648682, "sampling/importance_sampling_ratio/min": 0.014987066388130188, "sampling/sampling_logp_difference/max": 2.5866851806640625, "sampling/sampling_logp_difference/mean": 0.043409377336502075, "step": 1775, "step_time": 4.007094461012457 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.31060298159718513, "epoch": 0.01776, "grad_norm": 0.05086847022175789, "kl": 0.6010648049414158, "learning_rate": 9.998598914967897e-06, "loss": 0.0379, "step": 1776, "step_time": 2.012188424007036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 127.28125, "completions/mean_terminated_length": 127.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6442126836627722, "epoch": 0.01777, "frac_reward_zero_std": 0.25, "grad_norm": 1.5881752967834473, "kl": 0.5735668241977692, "learning_rate": 9.998597304162995e-06, "loss": -0.0041, "num_tokens": 15620719.0, "reward": 0.5824519395828247, "reward_std": 0.03769136592745781, "rewards/rollout_reward_func/mean": 0.5824519395828247, "rewards/rollout_reward_func/std": 0.1558459848165512, "sampling/importance_sampling_ratio/max": 1.0158472061157227, "sampling/importance_sampling_ratio/mean": 0.8682119250297546, "sampling/importance_sampling_ratio/min": 0.006893342826515436, "sampling/sampling_logp_difference/max": 2.1055402755737305, "sampling/sampling_logp_difference/mean": 0.072187140583992, "step": 1777, "step_time": 4.7077613579895115 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.09843750298023224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.10468750260770321, "entropy": 0.8620949238538742, "epoch": 0.01778, "grad_norm": 0.14529576897621155, "kl": 0.742113996297121, "learning_rate": 9.99859569243284e-06, "loss": -0.0118, "step": 1778, "step_time": 2.0408575950132217 }, { "clip_ratio/high_max": 0.07083333469927311, "clip_ratio/high_mean": 0.035416667349636555, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9582629390060902, "epoch": 0.01779, "frac_reward_zero_std": 0.5, "grad_norm": 0.5106397867202759, "kl": 0.4434632211923599, "learning_rate": 9.998594079777434e-06, "loss": -0.0217, "num_tokens": 15636379.0, "reward": 0.7262499928474426, "reward_std": 0.033646583557128906, "rewards/rollout_reward_func/mean": 0.7262499928474426, "rewards/rollout_reward_func/std": 0.27711406350135803, "sampling/importance_sampling_ratio/max": 0.9933736324310303, "sampling/importance_sampling_ratio/mean": 0.7563750743865967, "sampling/importance_sampling_ratio/min": 7.60476748240535e-09, "sampling/sampling_logp_difference/max": 3.6175317764282227, "sampling/sampling_logp_difference/mean": 0.2228117287158966, "step": 1779, "step_time": 4.582846034994873 }, { "clip_ratio/high_max": 0.13125000149011612, "clip_ratio/high_mean": 0.06562500074505806, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06562500074505806, "entropy": 0.774036230519414, "epoch": 0.0178, "grad_norm": 0.055503807961940765, "kl": 0.46575893834233284, "learning_rate": 9.998592466196772e-06, "loss": -0.0217, "step": 1780, "step_time": 2.010519406016101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5323016569018364, "epoch": 0.01781, "frac_reward_zero_std": 0.0, "grad_norm": 0.09435834735631943, "kl": 0.9755665585398674, "learning_rate": 9.998590851690861e-06, "loss": -0.0925, "num_tokens": 15651425.0, "reward": 0.7154375314712524, "reward_std": 0.11556518077850342, "rewards/rollout_reward_func/mean": 0.7154375314712524, "rewards/rollout_reward_func/std": 0.23942700028419495, "sampling/importance_sampling_ratio/max": 0.9984215497970581, "sampling/importance_sampling_ratio/mean": 0.8088136315345764, "sampling/importance_sampling_ratio/min": 1.776128072741856e-12, "sampling/sampling_logp_difference/max": 4.055654525756836, "sampling/sampling_logp_difference/mean": 0.3956659138202667, "step": 1781, "step_time": 4.067850381987228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.538614111021161, "epoch": 0.01782, "grad_norm": 0.09916391223669052, "kl": 0.9198756106197834, "learning_rate": 9.998589236259698e-06, "loss": -0.0926, "step": 1782, "step_time": 2.027700206002919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.5376739222556353, "epoch": 0.01783, "frac_reward_zero_std": 0.75, "grad_norm": 0.07675786316394806, "kl": 0.39846261963248253, "learning_rate": 9.998587619903282e-06, "loss": -0.0319, "num_tokens": 15668713.0, "reward": 0.9424230456352234, "reward_std": 0.03883064538240433, "rewards/rollout_reward_func/mean": 0.9424230456352234, "rewards/rollout_reward_func/std": 0.3331458270549774, "sampling/importance_sampling_ratio/max": 1.0290018320083618, "sampling/importance_sampling_ratio/mean": 0.8766186237335205, "sampling/importance_sampling_ratio/min": 0.00488324835896492, "sampling/sampling_logp_difference/max": 2.790243625640869, "sampling/sampling_logp_difference/mean": 0.06590183079242706, "step": 1783, "step_time": 4.834874972002581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5281331911683083, "epoch": 0.01784, "grad_norm": 0.08747158199548721, "kl": 0.3953961879014969, "learning_rate": 9.998586002621617e-06, "loss": -0.0318, "step": 1784, "step_time": 2.0504269799857866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 162.96875, "completions/mean_terminated_length": 162.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.118532881140709, "epoch": 0.01785, "frac_reward_zero_std": 0.0, "grad_norm": 0.7801359295845032, "kl": 0.606229618191719, "learning_rate": 9.998584384414703e-06, "loss": -0.0009, "num_tokens": 15684696.0, "reward": 0.5262914896011353, "reward_std": 0.12377430498600006, "rewards/rollout_reward_func/mean": 0.5262914896011353, "rewards/rollout_reward_func/std": 0.23953735828399658, "sampling/importance_sampling_ratio/max": 1.0652656555175781, "sampling/importance_sampling_ratio/mean": 0.7404412031173706, "sampling/importance_sampling_ratio/min": 0.01580747775733471, "sampling/sampling_logp_difference/max": 2.4100759029388428, "sampling/sampling_logp_difference/mean": 0.1389535367488861, "step": 1785, "step_time": 4.774445558010484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.1262648832052946, "clip_ratio/low_min": 0.0386904776096344, "clip_ratio/region_mean": 0.1262648832052946, "entropy": 1.347781777381897, "epoch": 0.01786, "grad_norm": 0.22748436033725739, "kl": 0.6002737022936344, "learning_rate": 9.998582765282537e-06, "loss": -0.0086, "step": 1786, "step_time": 2.0320446900077513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 108.375, "completions/mean_terminated_length": 108.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7560489820316434, "epoch": 0.01787, "frac_reward_zero_std": 0.5, "grad_norm": 0.05700504779815674, "kl": 0.5338969379663467, "learning_rate": 9.998581145225121e-06, "loss": -0.0272, "num_tokens": 15698932.0, "reward": 0.6737018823623657, "reward_std": 0.03919924050569534, "rewards/rollout_reward_func/mean": 0.6737018823623657, "rewards/rollout_reward_func/std": 0.2810392677783966, "sampling/importance_sampling_ratio/max": 0.9935449361801147, "sampling/importance_sampling_ratio/mean": 0.9049569368362427, "sampling/importance_sampling_ratio/min": 1.0405391549284104e-05, "sampling/sampling_logp_difference/max": 1.9628398418426514, "sampling/sampling_logp_difference/mean": 0.1358056664466858, "step": 1787, "step_time": 4.014358582004206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.030505951959639788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030505951959639788, "entropy": 0.8056024061515927, "epoch": 0.01788, "grad_norm": 0.050676312297582626, "kl": 0.5645789504051208, "learning_rate": 9.998579524242457e-06, "loss": -0.0268, "step": 1788, "step_time": 2.032750105994637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.07162105944007635, "epoch": 0.01789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010432356502860785, "kl": 0.3864123858511448, "learning_rate": 9.998577902334544e-06, "loss": 0.0016, "num_tokens": 15716676.0, "reward": 0.5949615240097046, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5949615240097046, "rewards/rollout_reward_func/std": 0.44116970896720886, "sampling/importance_sampling_ratio/max": 0.9965081214904785, "sampling/importance_sampling_ratio/mean": 0.989724338054657, "sampling/importance_sampling_ratio/min": 0.981755256652832, "sampling/sampling_logp_difference/max": 0.00907769426703453, "sampling/sampling_logp_difference/mean": 0.001692711841315031, "step": 1789, "step_time": 4.812187944990001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06895327009260654, "epoch": 0.0179, "grad_norm": 0.000960555684287101, "kl": 0.38695884123444557, "learning_rate": 9.998576279501383e-06, "loss": 0.0016, "step": 1790, "step_time": 2.501898232003441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.607009369879961, "epoch": 0.01791, "frac_reward_zero_std": 0.5, "grad_norm": 0.023700330406427383, "kl": 0.6440675556659698, "learning_rate": 9.998574655742974e-06, "loss": 0.0214, "num_tokens": 15733504.0, "reward": 0.5978269577026367, "reward_std": 0.016045883297920227, "rewards/rollout_reward_func/mean": 0.5978269577026367, "rewards/rollout_reward_func/std": 0.21370410919189453, "sampling/importance_sampling_ratio/max": 0.9954745173454285, "sampling/importance_sampling_ratio/mean": 0.9268141984939575, "sampling/importance_sampling_ratio/min": 2.9099444631697224e-08, "sampling/sampling_logp_difference/max": 3.9701194763183594, "sampling/sampling_logp_difference/mean": 0.13954952359199524, "step": 1791, "step_time": 4.560935613997572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6052023069933057, "epoch": 0.01792, "grad_norm": 0.02202407270669937, "kl": 0.6417863555252552, "learning_rate": 9.998573031059315e-06, "loss": 0.0213, "step": 1792, "step_time": 2.0445947859916487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 116.34375, "completions/mean_terminated_length": 116.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5458786934614182, "epoch": 0.01793, "frac_reward_zero_std": 0.5, "grad_norm": 0.0205821692943573, "kl": 0.7749922275543213, "learning_rate": 9.998571405450413e-06, "loss": -0.0543, "num_tokens": 15748083.0, "reward": 0.608846127986908, "reward_std": 0.18819917738437653, "rewards/rollout_reward_func/mean": 0.608846127986908, "rewards/rollout_reward_func/std": 0.3454754948616028, "sampling/importance_sampling_ratio/max": 0.9975652098655701, "sampling/importance_sampling_ratio/mean": 0.9298704862594604, "sampling/importance_sampling_ratio/min": 5.06073040792549e-20, "sampling/sampling_logp_difference/max": 15.740422248840332, "sampling/sampling_logp_difference/mean": 0.22919753193855286, "step": 1793, "step_time": 4.119575631004409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5456349039450288, "epoch": 0.01794, "grad_norm": 0.020545853301882744, "kl": 0.7725185379385948, "learning_rate": 9.99856977891626e-06, "loss": -0.0544, "step": 1794, "step_time": 1.9956217499930062 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008630952797830105, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 215.96875, "completions/mean_terminated_length": 215.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.42425063252449036, "epoch": 0.01795, "frac_reward_zero_std": 0.75, "grad_norm": 0.014358115382492542, "kl": 0.5164012536406517, "learning_rate": 9.998568151456864e-06, "loss": -0.0348, "num_tokens": 15765730.0, "reward": 0.7740480899810791, "reward_std": 0.24983204901218414, "rewards/rollout_reward_func/mean": 0.7740480899810791, "rewards/rollout_reward_func/std": 0.6485621333122253, "sampling/importance_sampling_ratio/max": 0.9958323240280151, "sampling/importance_sampling_ratio/mean": 0.9261847734451294, "sampling/importance_sampling_ratio/min": 0.0005685956566594541, "sampling/sampling_logp_difference/max": 2.736327648162842, "sampling/sampling_logp_difference/mean": 0.05999905616044998, "step": 1795, "step_time": 4.974990242015338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.43087383452802896, "epoch": 0.01796, "grad_norm": 0.015317928045988083, "kl": 0.5275931619107723, "learning_rate": 9.998566523072222e-06, "loss": -0.0348, "step": 1796, "step_time": 2.481187774996215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 209.53125, "completions/mean_terminated_length": 209.53125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.9207319733686745, "epoch": 0.01797, "frac_reward_zero_std": 0.5, "grad_norm": 0.030261829495429993, "kl": 0.5719858594238758, "learning_rate": 9.998564893762333e-06, "loss": -0.0625, "num_tokens": 15783115.0, "reward": 1.0107018947601318, "reward_std": 0.07662753760814667, "rewards/rollout_reward_func/mean": 1.0107018947601318, "rewards/rollout_reward_func/std": 0.16448451578617096, "sampling/importance_sampling_ratio/max": 0.9958405494689941, "sampling/importance_sampling_ratio/mean": 0.8996385335922241, "sampling/importance_sampling_ratio/min": 1.9667223946798808e-41, "sampling/sampling_logp_difference/max": 3.5549182891845703, "sampling/sampling_logp_difference/mean": 0.41344591975212097, "step": 1797, "step_time": 4.375464952994662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9183539059013128, "epoch": 0.01798, "grad_norm": 0.028128432109951973, "kl": 0.5748494602739811, "learning_rate": 9.9985632635272e-06, "loss": -0.0625, "step": 1798, "step_time": 2.0433075489927432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 95.25, "completions/mean_terminated_length": 95.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1141324769705534, "epoch": 0.01799, "frac_reward_zero_std": 1.0, "grad_norm": 0.000755276414565742, "kl": 0.5219279415905476, "learning_rate": 9.99856163236682e-06, "loss": 0.0013, "num_tokens": 15796875.0, "reward": 0.7334614992141724, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7334614992141724, "rewards/rollout_reward_func/std": 0.2380896657705307, "sampling/importance_sampling_ratio/max": 1.0024679899215698, "sampling/importance_sampling_ratio/mean": 0.9898255467414856, "sampling/importance_sampling_ratio/min": 0.9759829044342041, "sampling/sampling_logp_difference/max": 0.013444900512695312, "sampling/sampling_logp_difference/mean": 0.0031451331451535225, "step": 1799, "step_time": 3.5191034130111802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11240849364548922, "epoch": 0.018, "grad_norm": 0.000744141754694283, "kl": 0.5222207866609097, "learning_rate": 9.9985600002812e-06, "loss": 0.0013, "step": 1800, "step_time": 1.9339168300139136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.0447161216288805, "epoch": 0.01801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005721841007471085, "kl": 0.3427642323076725, "learning_rate": 9.998558367270333e-06, "loss": 0.0017, "num_tokens": 15816939.0, "reward": 1.1145000457763672, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1145000457763672, "rewards/rollout_reward_func/std": 0.454674631357193, "sampling/importance_sampling_ratio/max": 0.9946674108505249, "sampling/importance_sampling_ratio/mean": 0.9907739162445068, "sampling/importance_sampling_ratio/min": 0.9846312403678894, "sampling/sampling_logp_difference/max": 0.008480869233608246, "sampling/sampling_logp_difference/mean": 0.0012162867933511734, "step": 1801, "step_time": 4.968908103990543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04470987478271127, "epoch": 0.01802, "grad_norm": 0.000572496501263231, "kl": 0.34276140481233597, "learning_rate": 9.998556733334223e-06, "loss": 0.0017, "step": 1802, "step_time": 2.4942154210002627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.06301710056141019, "epoch": 0.01803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006459178403019905, "kl": 0.37044738605618477, "learning_rate": 9.99855509847287e-06, "loss": 0.0014, "num_tokens": 15833603.0, "reward": 1.1004999876022339, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1004999876022339, "rewards/rollout_reward_func/std": 0.15589870512485504, "sampling/importance_sampling_ratio/max": 0.9950162768363953, "sampling/importance_sampling_ratio/mean": 0.9914703369140625, "sampling/importance_sampling_ratio/min": 0.9875515699386597, "sampling/sampling_logp_difference/max": 0.007079709321260452, "sampling/sampling_logp_difference/mean": 0.0014957627281546593, "step": 1803, "step_time": 4.2604971030086745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06289453012868762, "epoch": 0.01804, "grad_norm": 0.0006401927676051855, "kl": 0.3704841546714306, "learning_rate": 9.998553462686273e-06, "loss": 0.0014, "step": 1804, "step_time": 2.0199952769908123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10937910247594118, "epoch": 0.01805, "frac_reward_zero_std": 1.0, "grad_norm": 0.000361104030162096, "kl": 0.8377995938062668, "learning_rate": 9.998551825974435e-06, "loss": 0.0013, "num_tokens": 15846243.0, "reward": 0.8019230365753174, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8019230365753174, "rewards/rollout_reward_func/std": 0.0710674524307251, "sampling/importance_sampling_ratio/max": 0.9965323209762573, "sampling/importance_sampling_ratio/mean": 0.989295482635498, "sampling/importance_sampling_ratio/min": 0.983875036239624, "sampling/sampling_logp_difference/max": 0.01173313707113266, "sampling/sampling_logp_difference/mean": 0.004527487326413393, "step": 1805, "step_time": 3.696392810008547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10974500887095928, "epoch": 0.01806, "grad_norm": 0.0003617673646658659, "kl": 0.837845042347908, "learning_rate": 9.998550188337355e-06, "loss": 0.0013, "step": 1806, "step_time": 2.438372859003721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 73.09375, "completions/mean_terminated_length": 73.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6290906984359026, "epoch": 0.01807, "frac_reward_zero_std": 0.5, "grad_norm": 0.016752487048506737, "kl": 0.8121193945407867, "learning_rate": 9.998548549775034e-06, "loss": -0.0364, "num_tokens": 15859294.0, "reward": 0.5415865182876587, "reward_std": 0.14346109330654144, "rewards/rollout_reward_func/mean": 0.5415865182876587, "rewards/rollout_reward_func/std": 0.3391818404197693, "sampling/importance_sampling_ratio/max": 0.9973056316375732, "sampling/importance_sampling_ratio/mean": 0.927802324295044, "sampling/importance_sampling_ratio/min": 1.1032646398234647e-05, "sampling/sampling_logp_difference/max": 2.846585512161255, "sampling/sampling_logp_difference/mean": 0.15875966846942902, "step": 1807, "step_time": 3.720137694996083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6253773402422667, "epoch": 0.01808, "grad_norm": 0.016465775668621063, "kl": 0.807304136455059, "learning_rate": 9.99854691028747e-06, "loss": -0.0364, "step": 1808, "step_time": 2.469385828997474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0778215560130775, "epoch": 0.01809, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007509238203056157, "kl": 0.4040655829012394, "learning_rate": 9.998545269874669e-06, "loss": 0.0014, "num_tokens": 15875622.0, "reward": 0.8602308034896851, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8602308034896851, "rewards/rollout_reward_func/std": 0.31951355934143066, "sampling/importance_sampling_ratio/max": 1.0095857381820679, "sampling/importance_sampling_ratio/mean": 0.9926142692565918, "sampling/importance_sampling_ratio/min": 0.979347825050354, "sampling/sampling_logp_difference/max": 0.012904446572065353, "sampling/sampling_logp_difference/mean": 0.0019681474659591913, "step": 1809, "step_time": 4.172904013001244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07643847446888685, "epoch": 0.0181, "grad_norm": 0.000760392053052783, "kl": 0.4042060784995556, "learning_rate": 9.998543628536625e-06, "loss": 0.0014, "step": 1810, "step_time": 2.0418478320061695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.13141424488276243, "epoch": 0.01811, "frac_reward_zero_std": 1.0, "grad_norm": 0.006452680565416813, "kl": 0.4213404394686222, "learning_rate": 9.99854198627334e-06, "loss": 0.0017, "num_tokens": 15893098.0, "reward": 0.7395384311676025, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7395384311676025, "rewards/rollout_reward_func/std": 0.3425893783569336, "sampling/importance_sampling_ratio/max": 1.0062581300735474, "sampling/importance_sampling_ratio/mean": 0.9707505702972412, "sampling/importance_sampling_ratio/min": 0.19860216975212097, "sampling/sampling_logp_difference/max": 1.4925765991210938, "sampling/sampling_logp_difference/mean": 0.010082520544528961, "step": 1811, "step_time": 4.22187104299519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12956335954368114, "epoch": 0.01812, "grad_norm": 0.006331609562039375, "kl": 0.4186905063688755, "learning_rate": 9.998540343084819e-06, "loss": 0.0017, "step": 1812, "step_time": 2.419619640990277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.06417627818882465, "epoch": 0.01813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005283235805109143, "kl": 0.5231276378035545, "learning_rate": 9.998538698971056e-06, "loss": 0.0018, "num_tokens": 15909274.0, "reward": 0.7196153998374939, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7196153998374939, "rewards/rollout_reward_func/std": 0.2259266972541809, "sampling/importance_sampling_ratio/max": 0.995257556438446, "sampling/importance_sampling_ratio/mean": 0.9911104440689087, "sampling/importance_sampling_ratio/min": 0.981183648109436, "sampling/sampling_logp_difference/max": 0.008802274242043495, "sampling/sampling_logp_difference/mean": 0.0016634974163025618, "step": 1813, "step_time": 4.087865791989316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06349074561148882, "epoch": 0.01814, "grad_norm": 0.00052534049609676, "kl": 0.5232033021748066, "learning_rate": 9.998537053932055e-06, "loss": 0.0018, "step": 1814, "step_time": 2.0150331720215036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07389157358556986, "epoch": 0.01815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005416821804828942, "kl": 0.46398771554231644, "learning_rate": 9.998535407967817e-06, "loss": 0.0015, "num_tokens": 15924546.0, "reward": 0.5534615516662598, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5534615516662598, "rewards/rollout_reward_func/std": 0.13535019755363464, "sampling/importance_sampling_ratio/max": 0.9966593980789185, "sampling/importance_sampling_ratio/mean": 0.9916329979896545, "sampling/importance_sampling_ratio/min": 0.9838358759880066, "sampling/sampling_logp_difference/max": 0.009001857601106167, "sampling/sampling_logp_difference/mean": 0.0019966275431215763, "step": 1815, "step_time": 4.375202963987249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07382690440863371, "epoch": 0.01816, "grad_norm": 0.0005448500742204487, "kl": 0.4639495275914669, "learning_rate": 9.998533761078339e-06, "loss": 0.0015, "step": 1816, "step_time": 1.9268505499931052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4341825069859624, "epoch": 0.01817, "frac_reward_zero_std": 0.75, "grad_norm": 0.006442430894821882, "kl": 0.5686687678098679, "learning_rate": 9.998532113263626e-06, "loss": -0.0176, "num_tokens": 15939370.0, "reward": 0.926682710647583, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.926682710647583, "rewards/rollout_reward_func/std": 0.23142392933368683, "sampling/importance_sampling_ratio/max": 0.9966980814933777, "sampling/importance_sampling_ratio/mean": 0.9602018594741821, "sampling/importance_sampling_ratio/min": 0.00014015779015608132, "sampling/sampling_logp_difference/max": 2.4743120670318604, "sampling/sampling_logp_difference/mean": 0.09095249325037003, "step": 1817, "step_time": 4.03472602499096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4380249120295048, "epoch": 0.01818, "grad_norm": 0.006604512222111225, "kl": 0.5701523870229721, "learning_rate": 9.998530464523674e-06, "loss": -0.0176, "step": 1818, "step_time": 2.485359628997685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.34966579219326377, "epoch": 0.01819, "frac_reward_zero_std": 0.5, "grad_norm": 0.012545828707516193, "kl": 0.5064473487436771, "learning_rate": 9.998528814858486e-06, "loss": -0.0643, "num_tokens": 15957150.0, "reward": 0.787844717502594, "reward_std": 0.032530996948480606, "rewards/rollout_reward_func/mean": 0.787844717502594, "rewards/rollout_reward_func/std": 0.3319413661956787, "sampling/importance_sampling_ratio/max": 0.9962795376777649, "sampling/importance_sampling_ratio/mean": 0.9301623702049255, "sampling/importance_sampling_ratio/min": 8.2103353634011e-05, "sampling/sampling_logp_difference/max": 2.568478584289551, "sampling/sampling_logp_difference/mean": 0.06839548796415329, "step": 1819, "step_time": 4.444656818988733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3510061916895211, "epoch": 0.0182, "grad_norm": 0.012252595275640488, "kl": 0.5091801583766937, "learning_rate": 9.998527164268062e-06, "loss": -0.0643, "step": 1820, "step_time": 2.056335963003221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07878339011222124, "epoch": 0.01821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006847594049759209, "kl": 0.40732454508543015, "learning_rate": 9.998525512752404e-06, "loss": 0.0014, "num_tokens": 15973302.0, "reward": 0.9079999923706055, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9079999923706055, "rewards/rollout_reward_func/std": 0.30211740732192993, "sampling/importance_sampling_ratio/max": 0.99781733751297, "sampling/importance_sampling_ratio/mean": 0.9913929104804993, "sampling/importance_sampling_ratio/min": 0.9850137233734131, "sampling/sampling_logp_difference/max": 0.008478103205561638, "sampling/sampling_logp_difference/mean": 0.001800863421522081, "step": 1821, "step_time": 4.52097269299702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07782284170389175, "epoch": 0.01822, "grad_norm": 0.0006891799857839942, "kl": 0.40738094970583916, "learning_rate": 9.998523860311507e-06, "loss": 0.0014, "step": 1822, "step_time": 1.9797496749815764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 149.34375, "completions/mean_terminated_length": 149.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.43324063904583454, "epoch": 0.01823, "frac_reward_zero_std": 0.75, "grad_norm": 0.03440657630562782, "kl": 0.6603268980979919, "learning_rate": 9.998522206945376e-06, "loss": -0.0169, "num_tokens": 15988905.0, "reward": 0.8563942909240723, "reward_std": 0.05371291935443878, "rewards/rollout_reward_func/mean": 0.8563942909240723, "rewards/rollout_reward_func/std": 0.2102876454591751, "sampling/importance_sampling_ratio/max": 1.012883186340332, "sampling/importance_sampling_ratio/mean": 0.9667877554893494, "sampling/importance_sampling_ratio/min": 2.34569171198018e-07, "sampling/sampling_logp_difference/max": 2.4793334007263184, "sampling/sampling_logp_difference/mean": 0.1141449511051178, "step": 1823, "step_time": 4.161388523010828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4315394898876548, "epoch": 0.01824, "grad_norm": 0.0322251170873642, "kl": 0.6422561481595039, "learning_rate": 9.998520552654011e-06, "loss": -0.017, "step": 1824, "step_time": 2.501396060004481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06949082668870687, "epoch": 0.01825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006017207051627338, "kl": 0.46811795607209206, "learning_rate": 9.998518897437413e-06, "loss": 0.0018, "num_tokens": 16005689.0, "reward": 0.5508846044540405, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5508846044540405, "rewards/rollout_reward_func/std": 0.08382628858089447, "sampling/importance_sampling_ratio/max": 0.9940906167030334, "sampling/importance_sampling_ratio/mean": 0.9910340309143066, "sampling/importance_sampling_ratio/min": 0.986189603805542, "sampling/sampling_logp_difference/max": 0.007986078038811684, "sampling/sampling_logp_difference/mean": 0.0016919014742597938, "step": 1825, "step_time": 4.422756293002749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0690786805935204, "epoch": 0.01826, "grad_norm": 0.0005984540330246091, "kl": 0.4681472182273865, "learning_rate": 9.99851724129558e-06, "loss": 0.0018, "step": 1826, "step_time": 2.4823347380079213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2672035554423928, "epoch": 0.01827, "frac_reward_zero_std": 0.75, "grad_norm": 0.006837132852524519, "kl": 0.48943662643432617, "learning_rate": 9.998515584228514e-06, "loss": -0.0366, "num_tokens": 16021765.0, "reward": 0.7642725706100464, "reward_std": 0.01706983521580696, "rewards/rollout_reward_func/mean": 0.7642725706100464, "rewards/rollout_reward_func/std": 0.3852689266204834, "sampling/importance_sampling_ratio/max": 0.9992600083351135, "sampling/importance_sampling_ratio/mean": 0.9604560732841492, "sampling/importance_sampling_ratio/min": 0.00041306312778033316, "sampling/sampling_logp_difference/max": 2.197638750076294, "sampling/sampling_logp_difference/mean": 0.03767923265695572, "step": 1827, "step_time": 4.73256675999437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26360216550529003, "epoch": 0.01828, "grad_norm": 0.006576851941645145, "kl": 0.49020877107977867, "learning_rate": 9.998513926236215e-06, "loss": -0.0366, "step": 1828, "step_time": 2.0728007790021366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 154.03125, "completions/mean_terminated_length": 154.03125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2680189348757267, "epoch": 0.01829, "frac_reward_zero_std": 0.75, "grad_norm": 0.10575207322835922, "kl": 0.7450478412210941, "learning_rate": 9.998512267318682e-06, "loss": 0.0478, "num_tokens": 16037406.0, "reward": 0.6572499871253967, "reward_std": 0.032466042786836624, "rewards/rollout_reward_func/mean": 0.6572499871253967, "rewards/rollout_reward_func/std": 0.31666386127471924, "sampling/importance_sampling_ratio/max": 1.0087242126464844, "sampling/importance_sampling_ratio/mean": 0.9395098090171814, "sampling/importance_sampling_ratio/min": 0.0004988516448065639, "sampling/sampling_logp_difference/max": 2.472071886062622, "sampling/sampling_logp_difference/mean": 0.0444612093269825, "step": 1829, "step_time": 4.447820374007279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26898949733003974, "epoch": 0.0183, "grad_norm": 0.11255382001399994, "kl": 0.7434818521142006, "learning_rate": 9.998510607475919e-06, "loss": 0.0477, "step": 1830, "step_time": 2.4673820069947396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0764787420630455, "epoch": 0.01831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007986626587808132, "kl": 0.5087558031082153, "learning_rate": 9.998508946707923e-06, "loss": 0.0014, "num_tokens": 16052950.0, "reward": 0.7106153964996338, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7106153964996338, "rewards/rollout_reward_func/std": 0.4152238965034485, "sampling/importance_sampling_ratio/max": 1.029675006866455, "sampling/importance_sampling_ratio/mean": 0.9987537860870361, "sampling/importance_sampling_ratio/min": 0.9858525395393372, "sampling/sampling_logp_difference/max": 0.03234837204217911, "sampling/sampling_logp_difference/mean": 0.0033361860550940037, "step": 1831, "step_time": 4.267704339996271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0795456999912858, "epoch": 0.01832, "grad_norm": 0.0009582577040418983, "kl": 0.5080909319221973, "learning_rate": 9.998507285014698e-06, "loss": 0.0014, "step": 1832, "step_time": 2.4904667220034753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 103.5625, "completions/mean_terminated_length": 103.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30167997535318136, "epoch": 0.01833, "frac_reward_zero_std": 0.75, "grad_norm": 0.0206905547529459, "kl": 0.5739747695624828, "learning_rate": 9.99850562239624e-06, "loss": 0.0206, "num_tokens": 16067088.0, "reward": 0.7039086818695068, "reward_std": 0.041515327990055084, "rewards/rollout_reward_func/mean": 0.7039086818695068, "rewards/rollout_reward_func/std": 0.16138608753681183, "sampling/importance_sampling_ratio/max": 1.0076920986175537, "sampling/importance_sampling_ratio/mean": 0.9615186452865601, "sampling/importance_sampling_ratio/min": 0.001730403397232294, "sampling/sampling_logp_difference/max": 2.3978681564331055, "sampling/sampling_logp_difference/mean": 0.03999948129057884, "step": 1833, "step_time": 4.276315417999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3038995647802949, "epoch": 0.01834, "grad_norm": 0.02384144999086857, "kl": 0.576205063611269, "learning_rate": 9.998503958852552e-06, "loss": 0.0206, "step": 1834, "step_time": 2.0338358049993985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7702925382182002, "epoch": 0.01835, "frac_reward_zero_std": 0.5, "grad_norm": 0.08776463568210602, "kl": 0.47745632380247116, "learning_rate": 9.998502294383634e-06, "loss": -0.0216, "num_tokens": 16082598.0, "reward": 0.6810096502304077, "reward_std": 0.04724594205617905, "rewards/rollout_reward_func/mean": 0.6810096502304077, "rewards/rollout_reward_func/std": 0.19547255337238312, "sampling/importance_sampling_ratio/max": 1.0105012655258179, "sampling/importance_sampling_ratio/mean": 0.8504141569137573, "sampling/importance_sampling_ratio/min": 0.0072446661069989204, "sampling/sampling_logp_difference/max": 1.8469884395599365, "sampling/sampling_logp_difference/mean": 0.09862721711397171, "step": 1835, "step_time": 4.089262061017507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.772846395149827, "epoch": 0.01836, "grad_norm": 0.09084147959947586, "kl": 0.47533198818564415, "learning_rate": 9.998500628989486e-06, "loss": -0.0215, "step": 1836, "step_time": 2.497603818992502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2836808543652296, "epoch": 0.01837, "frac_reward_zero_std": 0.5, "grad_norm": 0.10569048672914505, "kl": 0.8567136339843273, "learning_rate": 9.998498962670108e-06, "loss": 0.0053, "num_tokens": 16098304.0, "reward": 0.6261909008026123, "reward_std": 0.06972752511501312, "rewards/rollout_reward_func/mean": 0.6261909008026123, "rewards/rollout_reward_func/std": 0.2161407470703125, "sampling/importance_sampling_ratio/max": 1.0583925247192383, "sampling/importance_sampling_ratio/mean": 0.9455404281616211, "sampling/importance_sampling_ratio/min": 0.011067201383411884, "sampling/sampling_logp_difference/max": 2.2905735969543457, "sampling/sampling_logp_difference/mean": 0.04675890877842903, "step": 1837, "step_time": 4.428950592999172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2790939714759588, "epoch": 0.01838, "grad_norm": 0.11233440786600113, "kl": 0.8343697227537632, "learning_rate": 9.998497295425503e-06, "loss": 0.0051, "step": 1838, "step_time": 2.5035648409975693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 148.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4017516141757369, "epoch": 0.01839, "frac_reward_zero_std": 0.75, "grad_norm": 0.08549637347459793, "kl": 0.5696414969861507, "learning_rate": 9.998495627255667e-06, "loss": -0.0274, "num_tokens": 16113922.0, "reward": 0.8188422918319702, "reward_std": 0.04928095266222954, "rewards/rollout_reward_func/mean": 0.8188422918319702, "rewards/rollout_reward_func/std": 0.28637194633483887, "sampling/importance_sampling_ratio/max": 1.2132354974746704, "sampling/importance_sampling_ratio/mean": 0.8844810128211975, "sampling/importance_sampling_ratio/min": 0.03650189936161041, "sampling/sampling_logp_difference/max": 2.255100727081299, "sampling/sampling_logp_difference/mean": 0.055600203573703766, "step": 1839, "step_time": 4.556623230993864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.38068378902971745, "epoch": 0.0184, "grad_norm": 0.04395931214094162, "kl": 0.5685921683907509, "learning_rate": 9.998493958160605e-06, "loss": -0.0275, "step": 1840, "step_time": 2.0301449180042255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 143.15625, "completions/mean_terminated_length": 143.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13102310989052057, "epoch": 0.01841, "frac_reward_zero_std": 0.75, "grad_norm": 0.01787453703582287, "kl": 0.6338301822543144, "learning_rate": 9.998492288140316e-06, "loss": -0.0351, "num_tokens": 16129391.0, "reward": 0.7646634578704834, "reward_std": 0.22515912353992462, "rewards/rollout_reward_func/mean": 0.7646634578704834, "rewards/rollout_reward_func/std": 0.4954085052013397, "sampling/importance_sampling_ratio/max": 0.9954907298088074, "sampling/importance_sampling_ratio/mean": 0.9607778787612915, "sampling/importance_sampling_ratio/min": 0.06273533403873444, "sampling/sampling_logp_difference/max": 2.4541702270507812, "sampling/sampling_logp_difference/mean": 0.01733565703034401, "step": 1841, "step_time": 4.7556423820133205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13249920029193163, "epoch": 0.01842, "grad_norm": 0.020875234156847, "kl": 0.6269007325172424, "learning_rate": 9.998490617194797e-06, "loss": -0.035, "step": 1842, "step_time": 2.0376083930023015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.529268448241055, "epoch": 0.01843, "frac_reward_zero_std": 0.25, "grad_norm": 0.19232282042503357, "kl": 0.590038388967514, "learning_rate": 9.998488945324055e-06, "loss": 0.0265, "num_tokens": 16144171.0, "reward": 0.6934182643890381, "reward_std": 0.032458920031785965, "rewards/rollout_reward_func/mean": 0.6934182643890381, "rewards/rollout_reward_func/std": 0.17051775753498077, "sampling/importance_sampling_ratio/max": 1.0603293180465698, "sampling/importance_sampling_ratio/mean": 0.9174892902374268, "sampling/importance_sampling_ratio/min": 0.0010380331659689546, "sampling/sampling_logp_difference/max": 2.8393402099609375, "sampling/sampling_logp_difference/mean": 0.09195935726165771, "step": 1843, "step_time": 3.8583010509901214 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5324276015162468, "epoch": 0.01844, "grad_norm": 0.14068399369716644, "kl": 0.5688976086676121, "learning_rate": 9.998487272528084e-06, "loss": 0.0257, "step": 1844, "step_time": 2.4984979039945756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 296.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 167.29031372070312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5305377002805471, "epoch": 0.01845, "frac_reward_zero_std": 0.5, "grad_norm": 0.12849073112010956, "kl": 0.44618403911590576, "learning_rate": 9.998485598806886e-06, "loss": 0.0064, "num_tokens": 16160389.0, "reward": 0.8327740430831909, "reward_std": 0.024028033018112183, "rewards/rollout_reward_func/mean": 0.8327740430831909, "rewards/rollout_reward_func/std": 0.21756243705749512, "sampling/importance_sampling_ratio/max": 0.9987653493881226, "sampling/importance_sampling_ratio/mean": 0.8998764753341675, "sampling/importance_sampling_ratio/min": 4.7342933000012015e-15, "sampling/sampling_logp_difference/max": 3.178434133529663, "sampling/sampling_logp_difference/mean": 0.1882445067167282, "step": 1845, "step_time": 4.512876126995252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5350579917430878, "epoch": 0.01846, "grad_norm": 0.09410015493631363, "kl": 0.4482900835573673, "learning_rate": 9.998483924160465e-06, "loss": 0.0065, "step": 1846, "step_time": 2.027770816006523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 172.65625, "completions/mean_terminated_length": 172.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4130310667678714, "epoch": 0.01847, "frac_reward_zero_std": 0.5, "grad_norm": 0.054721374064683914, "kl": 0.4758390374481678, "learning_rate": 9.998482248588818e-06, "loss": 0.0127, "num_tokens": 16176650.0, "reward": 0.5769563913345337, "reward_std": 0.044158194214105606, "rewards/rollout_reward_func/mean": 0.5769563913345337, "rewards/rollout_reward_func/std": 0.2001723051071167, "sampling/importance_sampling_ratio/max": 0.9931454658508301, "sampling/importance_sampling_ratio/mean": 0.8973784446716309, "sampling/importance_sampling_ratio/min": 0.00017488058074377477, "sampling/sampling_logp_difference/max": 2.064946174621582, "sampling/sampling_logp_difference/mean": 0.06519398093223572, "step": 1847, "step_time": 4.934800556002301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4186674915254116, "epoch": 0.01848, "grad_norm": 0.058133553713560104, "kl": 0.47025834023952484, "learning_rate": 9.998480572091946e-06, "loss": 0.0126, "step": 1848, "step_time": 2.0675476519973017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.11439583078026772, "epoch": 0.01849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010323554743081331, "kl": 0.44912632182240486, "learning_rate": 9.998478894669849e-06, "loss": 0.0012, "num_tokens": 16191530.0, "reward": 0.629230797290802, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.629230797290802, "rewards/rollout_reward_func/std": 0.1354522854089737, "sampling/importance_sampling_ratio/max": 0.9968980550765991, "sampling/importance_sampling_ratio/mean": 0.987917423248291, "sampling/importance_sampling_ratio/min": 0.9801362752914429, "sampling/sampling_logp_difference/max": 0.013479027897119522, "sampling/sampling_logp_difference/mean": 0.002746197395026684, "step": 1849, "step_time": 3.817245919002744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1205925801768899, "epoch": 0.0185, "grad_norm": 0.0010877810418605804, "kl": 0.44821326062083244, "learning_rate": 9.99847721632253e-06, "loss": 0.0012, "step": 1850, "step_time": 2.4355796679956256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 122.84375, "completions/mean_terminated_length": 122.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5346739869564772, "epoch": 0.01851, "frac_reward_zero_std": 0.75, "grad_norm": 0.03411698713898659, "kl": 0.5734236687421799, "learning_rate": 9.998475537049986e-06, "loss": -0.0169, "num_tokens": 16206173.0, "reward": 0.8288941979408264, "reward_std": 0.01291829627007246, "rewards/rollout_reward_func/mean": 0.8288941979408264, "rewards/rollout_reward_func/std": 0.1559213548898697, "sampling/importance_sampling_ratio/max": 0.9924625158309937, "sampling/importance_sampling_ratio/mean": 0.946598470211029, "sampling/importance_sampling_ratio/min": 2.381076616361455e-17, "sampling/sampling_logp_difference/max": 3.013070583343506, "sampling/sampling_logp_difference/mean": 0.20365945994853973, "step": 1851, "step_time": 4.043986634009343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5369928032159805, "epoch": 0.01852, "grad_norm": 0.0340132862329483, "kl": 0.572734497487545, "learning_rate": 9.99847385685222e-06, "loss": -0.0169, "step": 1852, "step_time": 2.029857532994356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 220.84375, "completions/mean_terminated_length": 220.84375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.8188417498022318, "epoch": 0.01853, "frac_reward_zero_std": 0.5, "grad_norm": 0.2823730409145355, "kl": 0.4182545728981495, "learning_rate": 9.99847217572923e-06, "loss": -0.0432, "num_tokens": 16224032.0, "reward": 0.5415802597999573, "reward_std": 0.03689510375261307, "rewards/rollout_reward_func/mean": 0.5415802597999573, "rewards/rollout_reward_func/std": 0.2450719028711319, "sampling/importance_sampling_ratio/max": 1.1368924379348755, "sampling/importance_sampling_ratio/mean": 0.8718222379684448, "sampling/importance_sampling_ratio/min": 1.0969893396439723e-11, "sampling/sampling_logp_difference/max": 3.592146396636963, "sampling/sampling_logp_difference/mean": 0.21522147953510284, "step": 1853, "step_time": 5.126041606992658 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.018601191230118275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023809524718672037, "entropy": 0.811290081590414, "epoch": 0.01854, "grad_norm": 0.08649981021881104, "kl": 0.4154808335006237, "learning_rate": 9.99847049368102e-06, "loss": -0.0442, "step": 1854, "step_time": 2.0507759990068735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 80.875, "completions/mean_terminated_length": 80.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.764069177210331, "epoch": 0.01855, "frac_reward_zero_std": 0.5, "grad_norm": 0.06570456922054291, "kl": 0.6762829311192036, "learning_rate": 9.998468810707585e-06, "loss": -0.0445, "num_tokens": 16237476.0, "reward": 0.8595432639122009, "reward_std": 0.062120452523231506, "rewards/rollout_reward_func/mean": 0.8595432639122009, "rewards/rollout_reward_func/std": 0.18314719200134277, "sampling/importance_sampling_ratio/max": 0.9906716346740723, "sampling/importance_sampling_ratio/mean": 0.9014307260513306, "sampling/importance_sampling_ratio/min": 0.0018518842989578843, "sampling/sampling_logp_difference/max": 1.595609188079834, "sampling/sampling_logp_difference/mean": 0.11395583301782608, "step": 1855, "step_time": 3.73535316100606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7503772918134928, "epoch": 0.01856, "grad_norm": 0.06737951934337616, "kl": 0.6739875674247742, "learning_rate": 9.99846712680893e-06, "loss": -0.0444, "step": 1856, "step_time": 2.465161428990541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 109.9375, "completions/mean_terminated_length": 109.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9266717694699764, "epoch": 0.01857, "frac_reward_zero_std": 0.25, "grad_norm": 0.11000985652208328, "kl": 0.5445026569068432, "learning_rate": 9.998465441985055e-06, "loss": -0.0183, "num_tokens": 16251762.0, "reward": 0.6973221302032471, "reward_std": 0.1776612550020218, "rewards/rollout_reward_func/mean": 0.6973221302032471, "rewards/rollout_reward_func/std": 0.34682923555374146, "sampling/importance_sampling_ratio/max": 0.9991002082824707, "sampling/importance_sampling_ratio/mean": 0.8595411777496338, "sampling/importance_sampling_ratio/min": 4.5010690460185396e-14, "sampling/sampling_logp_difference/max": 3.4237430095672607, "sampling/sampling_logp_difference/mean": 0.2642748951911926, "step": 1857, "step_time": 3.9169064029993024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9153533279895782, "epoch": 0.01858, "grad_norm": 0.10735561698675156, "kl": 0.5526116341352463, "learning_rate": 9.998463756235958e-06, "loss": -0.0184, "step": 1858, "step_time": 2.04838225099229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1269538793712854, "epoch": 0.01859, "frac_reward_zero_std": 0.5, "grad_norm": 0.9898096919059753, "kl": 0.5307131111621857, "learning_rate": 9.998462069561643e-06, "loss": -0.0301, "num_tokens": 16267202.0, "reward": 0.7472307682037354, "reward_std": 0.026765499264001846, "rewards/rollout_reward_func/mean": 0.7472307682037354, "rewards/rollout_reward_func/std": 0.1908082365989685, "sampling/importance_sampling_ratio/max": 1.2215720415115356, "sampling/importance_sampling_ratio/mean": 0.8276365995407104, "sampling/importance_sampling_ratio/min": 1.5867993626039902e-16, "sampling/sampling_logp_difference/max": 3.756868362426758, "sampling/sampling_logp_difference/mean": 0.3590603172779083, "step": 1859, "step_time": 4.703816444991389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 1.1085984390228987, "epoch": 0.0186, "grad_norm": 0.033943988382816315, "kl": 0.5596286952495575, "learning_rate": 9.998460381962104e-06, "loss": -0.0334, "step": 1860, "step_time": 2.0361794710042886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10854826308786869, "epoch": 0.01861, "frac_reward_zero_std": 1.0, "grad_norm": 0.001393773127347231, "kl": 0.3918749764561653, "learning_rate": 9.99845869343735e-06, "loss": 0.0014, "num_tokens": 16283970.0, "reward": 0.7928076982498169, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7928076982498169, "rewards/rollout_reward_func/std": 0.4407152533531189, "sampling/importance_sampling_ratio/max": 1.0054051876068115, "sampling/importance_sampling_ratio/mean": 0.9895444512367249, "sampling/importance_sampling_ratio/min": 0.9760613441467285, "sampling/sampling_logp_difference/max": 0.01641363650560379, "sampling/sampling_logp_difference/mean": 0.002804937306791544, "step": 1861, "step_time": 4.706790655996883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10558298509567976, "epoch": 0.01862, "grad_norm": 0.001448323018848896, "kl": 0.39188677817583084, "learning_rate": 9.998457003987377e-06, "loss": 0.0014, "step": 1862, "step_time": 2.0463128849951318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 186.90625, "completions/mean_terminated_length": 186.90625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.6084565725177526, "epoch": 0.01863, "frac_reward_zero_std": 0.25, "grad_norm": 0.31691431999206543, "kl": 0.5637575462460518, "learning_rate": 9.998455313612183e-06, "loss": -0.0864, "num_tokens": 16300631.0, "reward": 0.49060094356536865, "reward_std": 0.2362968474626541, "rewards/rollout_reward_func/mean": 0.49060094356536865, "rewards/rollout_reward_func/std": 0.41373395919799805, "sampling/importance_sampling_ratio/max": 1.0455217361450195, "sampling/importance_sampling_ratio/mean": 0.835981011390686, "sampling/importance_sampling_ratio/min": 0.006571777164936066, "sampling/sampling_logp_difference/max": 2.2005882263183594, "sampling/sampling_logp_difference/mean": 0.0723842978477478, "step": 1863, "step_time": 4.179432761993667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 0.605969738215208, "epoch": 0.01864, "grad_norm": 0.1900309920310974, "kl": 0.5811241716146469, "learning_rate": 9.998453622311773e-06, "loss": -0.0877, "step": 1864, "step_time": 2.0482046639881446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40380058344453573, "epoch": 0.01865, "frac_reward_zero_std": 1.0, "grad_norm": 0.008143656887114048, "kl": 0.5837723948061466, "learning_rate": 9.998451930086144e-06, "loss": 0.0015, "num_tokens": 16314747.0, "reward": 0.5526922941207886, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5526922941207886, "rewards/rollout_reward_func/std": 0.12331675738096237, "sampling/importance_sampling_ratio/max": 0.9959505796432495, "sampling/importance_sampling_ratio/mean": 0.9593294262886047, "sampling/importance_sampling_ratio/min": 0.00023065971618052572, "sampling/sampling_logp_difference/max": 2.229065418243408, "sampling/sampling_logp_difference/mean": 0.06497113406658173, "step": 1865, "step_time": 4.227921122997941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39424263779073954, "epoch": 0.01866, "grad_norm": 0.00877801701426506, "kl": 0.589888870716095, "learning_rate": 9.998450236935298e-06, "loss": 0.0015, "step": 1866, "step_time": 2.0024139349989127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.47654558904469013, "epoch": 0.01867, "frac_reward_zero_std": 0.5, "grad_norm": 0.08292409032583237, "kl": 0.440631452947855, "learning_rate": 9.998448542859235e-06, "loss": -0.0583, "num_tokens": 16330811.0, "reward": 0.7972740530967712, "reward_std": 0.1304750293493271, "rewards/rollout_reward_func/mean": 0.7972740530967712, "rewards/rollout_reward_func/std": 0.338328093290329, "sampling/importance_sampling_ratio/max": 1.0232096910476685, "sampling/importance_sampling_ratio/mean": 0.9136037826538086, "sampling/importance_sampling_ratio/min": 0.0004740977310575545, "sampling/sampling_logp_difference/max": 2.8129477500915527, "sampling/sampling_logp_difference/mean": 0.08100935816764832, "step": 1867, "step_time": 4.658881109011418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.4764507282525301, "epoch": 0.01868, "grad_norm": 0.06523723900318146, "kl": 0.4546792395412922, "learning_rate": 9.998446847857956e-06, "loss": -0.0586, "step": 1868, "step_time": 2.045526530004281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.316156511195004, "epoch": 0.01869, "frac_reward_zero_std": 0.5, "grad_norm": 0.028288420289754868, "kl": 0.8011792078614235, "learning_rate": 9.99844515193146e-06, "loss": -0.0516, "num_tokens": 16347117.0, "reward": 0.6132100820541382, "reward_std": 0.021380458027124405, "rewards/rollout_reward_func/mean": 0.6132100820541382, "rewards/rollout_reward_func/std": 0.1083410233259201, "sampling/importance_sampling_ratio/max": 0.9982364773750305, "sampling/importance_sampling_ratio/mean": 0.9373095035552979, "sampling/importance_sampling_ratio/min": 0.001142363529652357, "sampling/sampling_logp_difference/max": 2.1203062534332275, "sampling/sampling_logp_difference/mean": 0.046670932322740555, "step": 1869, "step_time": 4.334179445999325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3147274060174823, "epoch": 0.0187, "grad_norm": 0.027367250993847847, "kl": 0.8243935406208038, "learning_rate": 9.998443455079749e-06, "loss": -0.0516, "step": 1870, "step_time": 2.067378429994278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.41400428861379623, "epoch": 0.01871, "frac_reward_zero_std": 0.75, "grad_norm": 0.005128761287778616, "kl": 0.428500272333622, "learning_rate": 9.998441757302823e-06, "loss": -0.037, "num_tokens": 16362577.0, "reward": 0.6777111887931824, "reward_std": 0.02039826102554798, "rewards/rollout_reward_func/mean": 0.6777111887931824, "rewards/rollout_reward_func/std": 0.42848774790763855, "sampling/importance_sampling_ratio/max": 1.000076413154602, "sampling/importance_sampling_ratio/mean": 0.9645425081253052, "sampling/importance_sampling_ratio/min": 2.484063086415976e-25, "sampling/sampling_logp_difference/max": 3.8196959495544434, "sampling/sampling_logp_difference/mean": 0.23311635851860046, "step": 1871, "step_time": 5.092097326007206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41109921550378203, "epoch": 0.01872, "grad_norm": 0.005834635347127914, "kl": 0.43040285259485245, "learning_rate": 9.998440058600684e-06, "loss": -0.037, "step": 1872, "step_time": 2.069613733001461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4793073385953903, "epoch": 0.01873, "frac_reward_zero_std": 0.5, "grad_norm": 0.019112851470708847, "kl": 0.5667727589607239, "learning_rate": 9.998438358973328e-06, "loss": -0.0523, "num_tokens": 16379949.0, "reward": 0.7363317608833313, "reward_std": 0.15624091029167175, "rewards/rollout_reward_func/mean": 0.7363317608833313, "rewards/rollout_reward_func/std": 0.4756901264190674, "sampling/importance_sampling_ratio/max": 1.0027728080749512, "sampling/importance_sampling_ratio/mean": 0.9061826467514038, "sampling/importance_sampling_ratio/min": 8.359139769265056e-12, "sampling/sampling_logp_difference/max": 3.511259078979492, "sampling/sampling_logp_difference/mean": 0.14910171926021576, "step": 1873, "step_time": 4.878908343009243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4761122763156891, "epoch": 0.01874, "grad_norm": 0.016633519902825356, "kl": 0.5814446844160557, "learning_rate": 9.99843665842076e-06, "loss": -0.0523, "step": 1874, "step_time": 2.0606258590050857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.24857933772727847, "epoch": 0.01875, "frac_reward_zero_std": 0.5, "grad_norm": 0.07569360733032227, "kl": 0.5981177128851414, "learning_rate": 9.998434956942979e-06, "loss": 0.0003, "num_tokens": 16395105.0, "reward": 0.7441490292549133, "reward_std": 0.15370053052902222, "rewards/rollout_reward_func/mean": 0.7441490292549133, "rewards/rollout_reward_func/std": 0.40003833174705505, "sampling/importance_sampling_ratio/max": 1.0054899454116821, "sampling/importance_sampling_ratio/mean": 0.940690279006958, "sampling/importance_sampling_ratio/min": 0.0901135578751564, "sampling/sampling_logp_difference/max": 2.3087382316589355, "sampling/sampling_logp_difference/mean": 0.03126627951860428, "step": 1875, "step_time": 4.092415920997155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24422855908051133, "epoch": 0.01876, "grad_norm": 0.08229925483465195, "kl": 0.6069999784231186, "learning_rate": 9.998433254539983e-06, "loss": 0.0001, "step": 1876, "step_time": 2.482957375003025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.7162463632412255, "epoch": 0.01877, "frac_reward_zero_std": 0.5, "grad_norm": 0.03320079669356346, "kl": 0.773718386888504, "learning_rate": 9.998431551211777e-06, "loss": -0.0635, "num_tokens": 16410249.0, "reward": 0.7015865445137024, "reward_std": 0.04834727570414543, "rewards/rollout_reward_func/mean": 0.7015865445137024, "rewards/rollout_reward_func/std": 0.08953527361154556, "sampling/importance_sampling_ratio/max": 0.9976072311401367, "sampling/importance_sampling_ratio/mean": 0.9027891755104065, "sampling/importance_sampling_ratio/min": 4.181065271779971e-09, "sampling/sampling_logp_difference/max": 3.89932918548584, "sampling/sampling_logp_difference/mean": 0.1800299882888794, "step": 1877, "step_time": 4.170638545001566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.720357172191143, "epoch": 0.01878, "grad_norm": 0.03322707489132881, "kl": 0.7942266836762428, "learning_rate": 9.998429846958356e-06, "loss": -0.0635, "step": 1878, "step_time": 1.99255466399336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 301.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 76.6875, "completions/mean_terminated_length": 69.45161437988281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5663700103759766, "epoch": 0.01879, "frac_reward_zero_std": 0.75, "grad_norm": 0.014412383548915386, "kl": 0.7721341624855995, "learning_rate": 9.998428141779723e-06, "loss": -0.027, "num_tokens": 16423559.0, "reward": 0.8870673179626465, "reward_std": 0.04419417306780815, "rewards/rollout_reward_func/mean": 0.8870673179626465, "rewards/rollout_reward_func/std": 0.2363341897726059, "sampling/importance_sampling_ratio/max": 0.9966514110565186, "sampling/importance_sampling_ratio/mean": 0.9595296382904053, "sampling/importance_sampling_ratio/min": 1.511462944396177e-39, "sampling/sampling_logp_difference/max": 14.731066703796387, "sampling/sampling_logp_difference/mean": 0.40281277894973755, "step": 1879, "step_time": 4.671270109989564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5670459680259228, "epoch": 0.0188, "grad_norm": 0.014562172815203667, "kl": 0.7719835788011551, "learning_rate": 9.998426435675879e-06, "loss": -0.027, "step": 1880, "step_time": 2.022275677001744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 196.1875, "completions/mean_terminated_length": 196.1875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.28510500164702535, "epoch": 0.01881, "frac_reward_zero_std": 0.5, "grad_norm": 0.029795676469802856, "kl": 0.5120834074914455, "learning_rate": 9.998424728646825e-06, "loss": -0.0536, "num_tokens": 16440661.0, "reward": 0.9843990802764893, "reward_std": 0.0551135316491127, "rewards/rollout_reward_func/mean": 0.9843990802764893, "rewards/rollout_reward_func/std": 0.37462565302848816, "sampling/importance_sampling_ratio/max": 1.0096142292022705, "sampling/importance_sampling_ratio/mean": 0.939700186252594, "sampling/importance_sampling_ratio/min": 0.033488791435956955, "sampling/sampling_logp_difference/max": 1.6346694231033325, "sampling/sampling_logp_difference/mean": 0.03430335223674774, "step": 1881, "step_time": 4.4200933770043775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2845757850445807, "epoch": 0.01882, "grad_norm": 0.03002302534878254, "kl": 0.5143070258200169, "learning_rate": 9.998423020692561e-06, "loss": -0.0536, "step": 1882, "step_time": 2.5373483009971096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1522873854264617, "epoch": 0.01883, "frac_reward_zero_std": 0.75, "grad_norm": 0.026411758735775948, "kl": 0.7444910854101181, "learning_rate": 9.998421311813087e-06, "loss": -0.0359, "num_tokens": 16456181.0, "reward": 0.7524600625038147, "reward_std": 0.011861721985042095, "rewards/rollout_reward_func/mean": 0.7524600625038147, "rewards/rollout_reward_func/std": 0.0636443942785263, "sampling/importance_sampling_ratio/max": 0.9976778626441956, "sampling/importance_sampling_ratio/mean": 0.9609168171882629, "sampling/importance_sampling_ratio/min": 0.003498293925076723, "sampling/sampling_logp_difference/max": 2.4456112384796143, "sampling/sampling_logp_difference/mean": 0.030706854537129402, "step": 1883, "step_time": 4.260180741002841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.153397835791111, "epoch": 0.01884, "grad_norm": 0.025256358087062836, "kl": 0.7400735169649124, "learning_rate": 9.998419602008402e-06, "loss": -0.0359, "step": 1884, "step_time": 2.049892386996362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 195.59375, "completions/mean_terminated_length": 195.59375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3836237727664411, "epoch": 0.01885, "frac_reward_zero_std": 0.5, "grad_norm": 0.07960289716720581, "kl": 0.5455694571137428, "learning_rate": 9.998417891278507e-06, "loss": -0.0027, "num_tokens": 16473120.0, "reward": 0.5275192260742188, "reward_std": 0.09740868955850601, "rewards/rollout_reward_func/mean": 0.5275192260742188, "rewards/rollout_reward_func/std": 0.27787014842033386, "sampling/importance_sampling_ratio/max": 0.9981346726417542, "sampling/importance_sampling_ratio/mean": 0.9068864583969116, "sampling/importance_sampling_ratio/min": 0.00028389031649567187, "sampling/sampling_logp_difference/max": 3.0693328380584717, "sampling/sampling_logp_difference/mean": 0.08490653336048126, "step": 1885, "step_time": 4.926965296996059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.38452542340382934, "epoch": 0.01886, "grad_norm": 0.033965662121772766, "kl": 0.5489743538200855, "learning_rate": 9.998416179623405e-06, "loss": -0.0028, "step": 1886, "step_time": 2.060001926998666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.048117775935679674, "epoch": 0.01887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003048846556339413, "kl": 0.46790192276239395, "learning_rate": 9.998414467043094e-06, "loss": 0.0013, "num_tokens": 16487400.0, "reward": 0.6584615707397461, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6584615707397461, "rewards/rollout_reward_func/std": 0.185404434800148, "sampling/importance_sampling_ratio/max": 1.000402808189392, "sampling/importance_sampling_ratio/mean": 0.9966391324996948, "sampling/importance_sampling_ratio/min": 0.9911350607872009, "sampling/sampling_logp_difference/max": 0.005426748190075159, "sampling/sampling_logp_difference/mean": 0.0009446372278034687, "step": 1887, "step_time": 3.883698182005901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0487519153393805, "epoch": 0.01888, "grad_norm": 0.0003079542948398739, "kl": 0.46783898398280144, "learning_rate": 9.998412753537574e-06, "loss": 0.0013, "step": 1888, "step_time": 2.4536455560009927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.052180333994328976, "epoch": 0.01889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038379538455046713, "kl": 0.5431366451084614, "learning_rate": 9.99841103910685e-06, "loss": 0.0017, "num_tokens": 16502992.0, "reward": 0.7103846073150635, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7103846073150635, "rewards/rollout_reward_func/std": 0.18078194558620453, "sampling/importance_sampling_ratio/max": 0.9973956942558289, "sampling/importance_sampling_ratio/mean": 0.9954487085342407, "sampling/importance_sampling_ratio/min": 0.9920132756233215, "sampling/sampling_logp_difference/max": 0.0030827485024929047, "sampling/sampling_logp_difference/mean": 0.001055290806107223, "step": 1889, "step_time": 3.961797759991896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05223681079223752, "epoch": 0.0189, "grad_norm": 0.00038702835445292294, "kl": 0.5431329682469368, "learning_rate": 9.998409323750916e-06, "loss": 0.0017, "step": 1890, "step_time": 2.015464926007553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 100.375, "completions/mean_terminated_length": 100.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3347479244694114, "epoch": 0.01891, "frac_reward_zero_std": 0.75, "grad_norm": 0.2217414826154709, "kl": 0.5981850549578667, "learning_rate": 9.998407607469775e-06, "loss": -0.0062, "num_tokens": 16517148.0, "reward": 0.6289903521537781, "reward_std": 0.03521215543150902, "rewards/rollout_reward_func/mean": 0.6289903521537781, "rewards/rollout_reward_func/std": 0.22942006587982178, "sampling/importance_sampling_ratio/max": 1.0033005475997925, "sampling/importance_sampling_ratio/mean": 0.935382604598999, "sampling/importance_sampling_ratio/min": 1.8222564222014626e-06, "sampling/sampling_logp_difference/max": 4.250820159912109, "sampling/sampling_logp_difference/mean": 0.09326712787151337, "step": 1891, "step_time": 4.377329956012545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.3347232509404421, "epoch": 0.01892, "grad_norm": 0.004671974573284388, "kl": 0.6084891483187675, "learning_rate": 9.998405890263428e-06, "loss": -0.0064, "step": 1892, "step_time": 1.9992295689808088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.050030379090458155, "epoch": 0.01893, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041961873648688197, "kl": 0.48673777282238007, "learning_rate": 9.998404172131876e-06, "loss": 0.0017, "num_tokens": 16533292.0, "reward": 0.5234615206718445, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5234615206718445, "rewards/rollout_reward_func/std": 0.08199368417263031, "sampling/importance_sampling_ratio/max": 0.9987377524375916, "sampling/importance_sampling_ratio/mean": 0.9953376054763794, "sampling/importance_sampling_ratio/min": 0.9920007586479187, "sampling/sampling_logp_difference/max": 0.004065744578838348, "sampling/sampling_logp_difference/mean": 0.0010448840912431479, "step": 1893, "step_time": 4.666146478994051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05032204510644078, "epoch": 0.01894, "grad_norm": 0.0004238585534039885, "kl": 0.4867190644145012, "learning_rate": 9.998402453075116e-06, "loss": 0.0017, "step": 1894, "step_time": 2.0099003099821857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 76.8125, "completions/mean_terminated_length": 76.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.298062811139971, "epoch": 0.01895, "frac_reward_zero_std": 0.75, "grad_norm": 0.0055020274594426155, "kl": 0.7651354297995567, "learning_rate": 9.998400733093154e-06, "loss": -0.0172, "num_tokens": 16546574.0, "reward": 0.801682710647583, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.801682710647583, "rewards/rollout_reward_func/std": 0.17300918698310852, "sampling/importance_sampling_ratio/max": 0.9984318017959595, "sampling/importance_sampling_ratio/mean": 0.9584780931472778, "sampling/importance_sampling_ratio/min": 0.01498245820403099, "sampling/sampling_logp_difference/max": 2.028062343597412, "sampling/sampling_logp_difference/mean": 0.03626564145088196, "step": 1895, "step_time": 3.7088248029976967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3031983426772058, "epoch": 0.01896, "grad_norm": 0.005316386930644512, "kl": 0.7650832012295723, "learning_rate": 9.998399012185986e-06, "loss": -0.0172, "step": 1896, "step_time": 2.409508945980633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9122284715995193, "epoch": 0.01897, "frac_reward_zero_std": 0.5, "grad_norm": 0.02326016314327717, "kl": 0.6009248606860638, "learning_rate": 9.998397290353613e-06, "loss": -0.0028, "num_tokens": 16562286.0, "reward": 0.5698077082633972, "reward_std": 0.02029968425631523, "rewards/rollout_reward_func/mean": 0.5698077082633972, "rewards/rollout_reward_func/std": 0.11591754108667374, "sampling/importance_sampling_ratio/max": 1.0021737813949585, "sampling/importance_sampling_ratio/mean": 0.9039459228515625, "sampling/importance_sampling_ratio/min": 3.418455454791571e-19, "sampling/sampling_logp_difference/max": 3.3791956901550293, "sampling/sampling_logp_difference/mean": 0.3327474594116211, "step": 1897, "step_time": 4.224459237011615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9094810783863068, "epoch": 0.01898, "grad_norm": 0.02278478629887104, "kl": 0.5916555784642696, "learning_rate": 9.998395567596038e-06, "loss": -0.0028, "step": 1898, "step_time": 2.033010123996064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 122.8125, "completions/mean_terminated_length": 122.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8258634665980935, "epoch": 0.01899, "frac_reward_zero_std": 0.75, "grad_norm": 0.01706996001303196, "kl": 0.7250147685408592, "learning_rate": 9.99839384391326e-06, "loss": -0.0226, "num_tokens": 16576984.0, "reward": 0.9517788290977478, "reward_std": 0.03230110555887222, "rewards/rollout_reward_func/mean": 0.9517788290977478, "rewards/rollout_reward_func/std": 0.20019103586673737, "sampling/importance_sampling_ratio/max": 1.0011849403381348, "sampling/importance_sampling_ratio/mean": 0.9290322065353394, "sampling/importance_sampling_ratio/min": 1.8778198551672176e-08, "sampling/sampling_logp_difference/max": 4.1256022453308105, "sampling/sampling_logp_difference/mean": 0.2674088776111603, "step": 1899, "step_time": 4.591629416005162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8249654350802302, "epoch": 0.019, "grad_norm": 0.015643026679754257, "kl": 0.7109812498092651, "learning_rate": 9.998392119305277e-06, "loss": -0.0227, "step": 1900, "step_time": 2.001238976008608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 201.96875, "completions/mean_terminated_length": 201.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21665163012221456, "epoch": 0.01901, "frac_reward_zero_std": 0.75, "grad_norm": 0.19964680075645447, "kl": 1.924531303346157, "learning_rate": 9.998390393772093e-06, "loss": -0.0127, "num_tokens": 16594271.0, "reward": 0.9979711771011353, "reward_std": 0.04011470824480057, "rewards/rollout_reward_func/mean": 0.9979711771011353, "rewards/rollout_reward_func/std": 0.3209986090660095, "sampling/importance_sampling_ratio/max": 1.0040056705474854, "sampling/importance_sampling_ratio/mean": 0.9652698040008545, "sampling/importance_sampling_ratio/min": 0.007275307085365057, "sampling/sampling_logp_difference/max": 1.9945718050003052, "sampling/sampling_logp_difference/mean": 0.03188048303127289, "step": 1901, "step_time": 4.296952518998296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2181171728298068, "epoch": 0.01902, "grad_norm": 0.14991076290607452, "kl": 1.5927940011024475, "learning_rate": 9.998388667313706e-06, "loss": -0.0137, "step": 1902, "step_time": 2.4195205320065725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.055546427611261606, "epoch": 0.01903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004424056096468121, "kl": 0.3955010212957859, "learning_rate": 9.99838693993012e-06, "loss": 0.0011, "num_tokens": 16608599.0, "reward": 0.698846161365509, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.698846161365509, "rewards/rollout_reward_func/std": 0.20292457938194275, "sampling/importance_sampling_ratio/max": 1.0022878646850586, "sampling/importance_sampling_ratio/mean": 0.9968251585960388, "sampling/importance_sampling_ratio/min": 0.9931704998016357, "sampling/sampling_logp_difference/max": 0.004990763962268829, "sampling/sampling_logp_difference/mean": 0.0011362950317561626, "step": 1903, "step_time": 3.800205048995849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.058466451708227396, "epoch": 0.01904, "grad_norm": 0.00046949440729804337, "kl": 0.39504722133278847, "learning_rate": 9.99838521162133e-06, "loss": 0.0011, "step": 1904, "step_time": 1.9521004209964303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7560877287760377, "epoch": 0.01905, "frac_reward_zero_std": 0.5, "grad_norm": 0.016339439898729324, "kl": 0.722108282148838, "learning_rate": 9.99838348238734e-06, "loss": -0.0504, "num_tokens": 16622295.0, "reward": 0.609326958656311, "reward_std": 0.0174343753606081, "rewards/rollout_reward_func/mean": 0.609326958656311, "rewards/rollout_reward_func/std": 0.1585754156112671, "sampling/importance_sampling_ratio/max": 0.9968568682670593, "sampling/importance_sampling_ratio/mean": 0.8947858810424805, "sampling/importance_sampling_ratio/min": 0.0021713529713451862, "sampling/sampling_logp_difference/max": 2.6216719150543213, "sampling/sampling_logp_difference/mean": 0.11849705874919891, "step": 1905, "step_time": 4.214570575008111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.762373554520309, "epoch": 0.01906, "grad_norm": 0.018260056152939796, "kl": 0.7005728781223297, "learning_rate": 9.99838175222815e-06, "loss": -0.0504, "step": 1906, "step_time": 2.006642860986176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 175.21875, "completions/mean_terminated_length": 175.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5313192708417773, "epoch": 0.01907, "frac_reward_zero_std": 0.5, "grad_norm": 0.032545056194067, "kl": 0.7077039107680321, "learning_rate": 9.99838002114376e-06, "loss": -0.0168, "num_tokens": 16638614.0, "reward": 0.6989374756813049, "reward_std": 0.06765107810497284, "rewards/rollout_reward_func/mean": 0.6989374756813049, "rewards/rollout_reward_func/std": 0.3107627034187317, "sampling/importance_sampling_ratio/max": 1.0006160736083984, "sampling/importance_sampling_ratio/mean": 0.9017525911331177, "sampling/importance_sampling_ratio/min": 5.204631088417955e-05, "sampling/sampling_logp_difference/max": 2.747938632965088, "sampling/sampling_logp_difference/mean": 0.10473909974098206, "step": 1907, "step_time": 4.196369833989593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5330001963302493, "epoch": 0.01908, "grad_norm": 0.02732776291668415, "kl": 0.6736432835459709, "learning_rate": 9.99837828913417e-06, "loss": -0.017, "step": 1908, "step_time": 2.479484928000602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 148.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2483916925266385, "epoch": 0.01909, "frac_reward_zero_std": 0.75, "grad_norm": 0.008225644938647747, "kl": 0.581019788980484, "learning_rate": 9.99837655619938e-06, "loss": -0.0172, "num_tokens": 16654096.0, "reward": 0.8108845949172974, "reward_std": 0.05167318880558014, "rewards/rollout_reward_func/mean": 0.8108845949172974, "rewards/rollout_reward_func/std": 0.3925485908985138, "sampling/importance_sampling_ratio/max": 0.9954938888549805, "sampling/importance_sampling_ratio/mean": 0.9573559165000916, "sampling/importance_sampling_ratio/min": 0.006925116293132305, "sampling/sampling_logp_difference/max": 2.500253677368164, "sampling/sampling_logp_difference/mean": 0.029168646782636642, "step": 1909, "step_time": 4.239623682995443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2551500899717212, "epoch": 0.0191, "grad_norm": 0.0089925117790699, "kl": 0.5789057314395905, "learning_rate": 9.998374822339394e-06, "loss": -0.0172, "step": 1910, "step_time": 1.9976800540025579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 178.65625, "completions/mean_terminated_length": 178.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6028026957064867, "epoch": 0.01911, "frac_reward_zero_std": 0.5, "grad_norm": 0.027653580531477928, "kl": 0.44333329424262047, "learning_rate": 9.998373087554208e-06, "loss": -0.0092, "num_tokens": 16670493.0, "reward": 0.6318268775939941, "reward_std": 0.03154784440994263, "rewards/rollout_reward_func/mean": 0.6318268775939941, "rewards/rollout_reward_func/std": 0.3392133116722107, "sampling/importance_sampling_ratio/max": 0.998580813407898, "sampling/importance_sampling_ratio/mean": 0.9321887493133545, "sampling/importance_sampling_ratio/min": 6.115696331439766e-17, "sampling/sampling_logp_difference/max": 2.8809926509857178, "sampling/sampling_logp_difference/mean": 0.18955698609352112, "step": 1911, "step_time": 4.806511357004638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.600415819324553, "epoch": 0.01912, "grad_norm": 0.027697337791323662, "kl": 0.4403330162167549, "learning_rate": 9.998371351843823e-06, "loss": -0.0093, "step": 1912, "step_time": 2.06101390299591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 76.8125, "completions/mean_terminated_length": 76.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9875617912039161, "epoch": 0.01913, "frac_reward_zero_std": 0.5, "grad_norm": 0.29530221223831177, "kl": 0.7082887776196003, "learning_rate": 9.998369615208243e-06, "loss": -0.0445, "num_tokens": 16683751.0, "reward": 0.7041346430778503, "reward_std": 0.0848734974861145, "rewards/rollout_reward_func/mean": 0.7041346430778503, "rewards/rollout_reward_func/std": 0.24808652698993683, "sampling/importance_sampling_ratio/max": 1.0709035396575928, "sampling/importance_sampling_ratio/mean": 0.8854023218154907, "sampling/importance_sampling_ratio/min": 4.8318536927951185e-29, "sampling/sampling_logp_difference/max": 8.367422103881836, "sampling/sampling_logp_difference/mean": 0.44222429394721985, "step": 1913, "step_time": 4.037339247013733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.9929179958999157, "epoch": 0.01914, "grad_norm": 0.05247325077652931, "kl": 0.7038109302520752, "learning_rate": 9.998367877647466e-06, "loss": -0.0454, "step": 1914, "step_time": 2.497410817006312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2880946593359113, "epoch": 0.01915, "frac_reward_zero_std": 1.0, "grad_norm": 0.009465397335588932, "kl": 0.5292788743972778, "learning_rate": 9.998366139161492e-06, "loss": 0.0017, "num_tokens": 16699167.0, "reward": 0.6880769729614258, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6880769729614258, "rewards/rollout_reward_func/std": 0.2374294400215149, "sampling/importance_sampling_ratio/max": 0.9978877902030945, "sampling/importance_sampling_ratio/mean": 0.934829592704773, "sampling/importance_sampling_ratio/min": 0.04720264673233032, "sampling/sampling_logp_difference/max": 2.3706915378570557, "sampling/sampling_logp_difference/mean": 0.032789379358291626, "step": 1915, "step_time": 4.115933431989106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28729102946817875, "epoch": 0.01916, "grad_norm": 0.009279423393309116, "kl": 0.5248274505138397, "learning_rate": 9.99836439975032e-06, "loss": 0.0017, "step": 1916, "step_time": 2.465624000004027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08027512207627296, "epoch": 0.01917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006247305427677929, "kl": 0.4649710915982723, "learning_rate": 9.998362659413955e-06, "loss": 0.0013, "num_tokens": 16714087.0, "reward": 0.7311538457870483, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7311538457870483, "rewards/rollout_reward_func/std": 0.18434512615203857, "sampling/importance_sampling_ratio/max": 0.9990270733833313, "sampling/importance_sampling_ratio/mean": 0.9948184490203857, "sampling/importance_sampling_ratio/min": 0.9890108108520508, "sampling/sampling_logp_difference/max": 0.005831722170114517, "sampling/sampling_logp_difference/mean": 0.0014550855848938227, "step": 1917, "step_time": 3.9551534940037527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08176101930439472, "epoch": 0.01918, "grad_norm": 0.0006435749819502234, "kl": 0.46470844000577927, "learning_rate": 9.998360918152395e-06, "loss": 0.0013, "step": 1918, "step_time": 1.970704740990186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 218.5625, "completions/mean_terminated_length": 218.5625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.8755845809355378, "epoch": 0.01919, "frac_reward_zero_std": 0.25, "grad_norm": 0.033989787101745605, "kl": 0.6033379696309566, "learning_rate": 9.998359175965638e-06, "loss": -0.0852, "num_tokens": 16731849.0, "reward": 0.8022716641426086, "reward_std": 0.1635374128818512, "rewards/rollout_reward_func/mean": 0.8022716641426086, "rewards/rollout_reward_func/std": 0.438034325838089, "sampling/importance_sampling_ratio/max": 0.9967402219772339, "sampling/importance_sampling_ratio/mean": 0.8357729911804199, "sampling/importance_sampling_ratio/min": 0.00038103104452602565, "sampling/sampling_logp_difference/max": 2.744607448577881, "sampling/sampling_logp_difference/mean": 0.14608171582221985, "step": 1919, "step_time": 4.84717819900834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8731574481353164, "epoch": 0.0192, "grad_norm": 0.03303276747465134, "kl": 0.6092195063829422, "learning_rate": 9.998357432853687e-06, "loss": -0.0852, "step": 1920, "step_time": 2.5072443619865226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 122.71875, "completions/mean_terminated_length": 122.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8198976991698146, "epoch": 0.01921, "frac_reward_zero_std": 0.75, "grad_norm": 0.01968321017920971, "kl": 0.5249665267765522, "learning_rate": 9.998355688816543e-06, "loss": 0.0205, "num_tokens": 16746632.0, "reward": 0.5651153922080994, "reward_std": 0.010878569446504116, "rewards/rollout_reward_func/mean": 0.5651153922080994, "rewards/rollout_reward_func/std": 0.16924549639225006, "sampling/importance_sampling_ratio/max": 0.9955906271934509, "sampling/importance_sampling_ratio/mean": 0.9239176511764526, "sampling/importance_sampling_ratio/min": 2.4645929730823205e-16, "sampling/sampling_logp_difference/max": 3.888559341430664, "sampling/sampling_logp_difference/mean": 0.28711649775505066, "step": 1921, "step_time": 4.210482528011198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8188031194731593, "epoch": 0.01922, "grad_norm": 0.02021697536110878, "kl": 0.5257032476365566, "learning_rate": 9.998353943854206e-06, "loss": 0.0205, "step": 1922, "step_time": 2.5059886049930356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 189.40625, "completions/mean_terminated_length": 189.40625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.4917270205914974, "epoch": 0.01923, "frac_reward_zero_std": 0.5, "grad_norm": 0.06327025592327118, "kl": 0.5585332736372948, "learning_rate": 9.998352197966675e-06, "loss": -0.0101, "num_tokens": 16763317.0, "reward": 0.7652404308319092, "reward_std": 0.02505568042397499, "rewards/rollout_reward_func/mean": 0.7652404308319092, "rewards/rollout_reward_func/std": 0.260672003030777, "sampling/importance_sampling_ratio/max": 0.9971455931663513, "sampling/importance_sampling_ratio/mean": 0.9003954529762268, "sampling/importance_sampling_ratio/min": 0.0008568920311518013, "sampling/sampling_logp_difference/max": 2.0678162574768066, "sampling/sampling_logp_difference/mean": 0.06703925877809525, "step": 1923, "step_time": 4.099180229990452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4830962289124727, "epoch": 0.01924, "grad_norm": 0.06087779626250267, "kl": 0.5567931607365608, "learning_rate": 9.998350451153953e-06, "loss": -0.0103, "step": 1924, "step_time": 1.9966472969972529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5315965483896434, "epoch": 0.01925, "frac_reward_zero_std": 0.25, "grad_norm": 0.04218496009707451, "kl": 0.6115763187408447, "learning_rate": 9.998348703416036e-06, "loss": -0.0137, "num_tokens": 16779851.0, "reward": 0.8067981004714966, "reward_std": 0.07030272483825684, "rewards/rollout_reward_func/mean": 0.8067981004714966, "rewards/rollout_reward_func/std": 0.42367997765541077, "sampling/importance_sampling_ratio/max": 0.9965217113494873, "sampling/importance_sampling_ratio/mean": 0.9011270999908447, "sampling/importance_sampling_ratio/min": 1.501059069793434e-13, "sampling/sampling_logp_difference/max": 17.498933792114258, "sampling/sampling_logp_difference/mean": 0.29636430740356445, "step": 1925, "step_time": 4.109699485990859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5301405778154731, "epoch": 0.01926, "grad_norm": 0.038908086717128754, "kl": 0.5901721678674221, "learning_rate": 9.99834695475293e-06, "loss": -0.0138, "step": 1926, "step_time": 2.458153683008277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.08556054718792439, "epoch": 0.01927, "frac_reward_zero_std": 1.0, "grad_norm": 0.003598650684580207, "kl": 0.3984118923544884, "learning_rate": 9.998345205164631e-06, "loss": 0.0015, "num_tokens": 16796755.0, "reward": 0.8228076696395874, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8228076696395874, "rewards/rollout_reward_func/std": 0.36288589239120483, "sampling/importance_sampling_ratio/max": 1.0552663803100586, "sampling/importance_sampling_ratio/mean": 0.9946959614753723, "sampling/importance_sampling_ratio/min": 0.88526850938797, "sampling/sampling_logp_difference/max": 0.11632446944713593, "sampling/sampling_logp_difference/mean": 0.003392208134755492, "step": 1927, "step_time": 4.170101644995157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0877597788348794, "epoch": 0.01928, "grad_norm": 0.004276360385119915, "kl": 0.3978145718574524, "learning_rate": 9.998343454651143e-06, "loss": 0.0015, "step": 1928, "step_time": 2.510306696014595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 103.15625, "completions/mean_terminated_length": 103.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8082148712128401, "epoch": 0.01929, "frac_reward_zero_std": 0.5, "grad_norm": 0.017528455704450607, "kl": 0.5490240268409252, "learning_rate": 9.998341703212461e-06, "loss": -0.0368, "num_tokens": 16810880.0, "reward": 0.5433173179626465, "reward_std": 0.02651650458574295, "rewards/rollout_reward_func/mean": 0.5433173179626465, "rewards/rollout_reward_func/std": 0.13507527112960815, "sampling/importance_sampling_ratio/max": 1.0012428760528564, "sampling/importance_sampling_ratio/mean": 0.9265203475952148, "sampling/importance_sampling_ratio/min": 3.220424294718974e-16, "sampling/sampling_logp_difference/max": 14.79322624206543, "sampling/sampling_logp_difference/mean": 0.4176532030105591, "step": 1929, "step_time": 4.029150408998248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8079172670841217, "epoch": 0.0193, "grad_norm": 0.01730240136384964, "kl": 0.548872098326683, "learning_rate": 9.99833995084859e-06, "loss": -0.0368, "step": 1930, "step_time": 2.0270724430083646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 119.59375, "completions/mean_terminated_length": 119.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7992043551057577, "epoch": 0.01931, "frac_reward_zero_std": 0.5, "grad_norm": 0.03673059493303299, "kl": 0.6717460639774799, "learning_rate": 9.998338197559531e-06, "loss": -0.0489, "num_tokens": 16825419.0, "reward": 0.7373217940330505, "reward_std": 0.08268281072378159, "rewards/rollout_reward_func/mean": 0.7373217940330505, "rewards/rollout_reward_func/std": 0.2870490550994873, "sampling/importance_sampling_ratio/max": 0.996334433555603, "sampling/importance_sampling_ratio/mean": 0.8684073686599731, "sampling/importance_sampling_ratio/min": 1.0397757321811696e-08, "sampling/sampling_logp_difference/max": 10.387458801269531, "sampling/sampling_logp_difference/mean": 0.18118922412395477, "step": 1931, "step_time": 4.161693563997687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7882408890873194, "epoch": 0.01932, "grad_norm": 0.03636092692613602, "kl": 0.6696816757321358, "learning_rate": 9.998336443345282e-06, "loss": -0.0489, "step": 1932, "step_time": 2.452223884014529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.06094183726236224, "epoch": 0.01933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007565679843537509, "kl": 0.3755461350083351, "learning_rate": 9.998334688205847e-06, "loss": 0.0018, "num_tokens": 16844483.0, "reward": 1.0772693157196045, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0772693157196045, "rewards/rollout_reward_func/std": 0.25397413969039917, "sampling/importance_sampling_ratio/max": 0.9964151382446289, "sampling/importance_sampling_ratio/mean": 0.990678608417511, "sampling/importance_sampling_ratio/min": 0.9854394197463989, "sampling/sampling_logp_difference/max": 0.009383393451571465, "sampling/sampling_logp_difference/mean": 0.0014047491131350398, "step": 1933, "step_time": 4.5098886209889315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060199557803571224, "epoch": 0.01934, "grad_norm": 0.0007426891825161874, "kl": 0.3756655268371105, "learning_rate": 9.99833293214122e-06, "loss": 0.0018, "step": 1934, "step_time": 2.4871879020065535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23178009409457445, "epoch": 0.01935, "frac_reward_zero_std": 0.5, "grad_norm": 0.05127311870455742, "kl": 0.5461634956300259, "learning_rate": 9.998331175151406e-06, "loss": -0.0497, "num_tokens": 16861397.0, "reward": 0.7090576887130737, "reward_std": 0.02474873885512352, "rewards/rollout_reward_func/mean": 0.7090576887130737, "rewards/rollout_reward_func/std": 0.0748790055513382, "sampling/importance_sampling_ratio/max": 1.0236353874206543, "sampling/importance_sampling_ratio/mean": 0.9411534667015076, "sampling/importance_sampling_ratio/min": 0.03451697155833244, "sampling/sampling_logp_difference/max": 2.505504846572876, "sampling/sampling_logp_difference/mean": 0.0291619710624218, "step": 1935, "step_time": 4.178641844002414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.228414804674685, "epoch": 0.01936, "grad_norm": 0.05062760412693024, "kl": 0.5505558326840401, "learning_rate": 9.998329417236405e-06, "loss": -0.0499, "step": 1936, "step_time": 2.025831562008534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06062857108190656, "epoch": 0.01937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005835483316332102, "kl": 0.36334504187107086, "learning_rate": 9.998327658396217e-06, "loss": 0.0015, "num_tokens": 16879069.0, "reward": 0.7140769362449646, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7140769362449646, "rewards/rollout_reward_func/std": 0.32552090287208557, "sampling/importance_sampling_ratio/max": 0.9977447986602783, "sampling/importance_sampling_ratio/mean": 0.9925062656402588, "sampling/importance_sampling_ratio/min": 0.9892478585243225, "sampling/sampling_logp_difference/max": 0.006444314494729042, "sampling/sampling_logp_difference/mean": 0.0012560765026137233, "step": 1937, "step_time": 4.421933177989558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05867744330316782, "epoch": 0.01938, "grad_norm": 0.0005515728262253106, "kl": 0.3636765442788601, "learning_rate": 9.998325898630842e-06, "loss": 0.0015, "step": 1938, "step_time": 2.4702022150086123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07446461636573076, "epoch": 0.01939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005606294143944979, "kl": 0.43971938267350197, "learning_rate": 9.998324137940282e-06, "loss": 0.0015, "num_tokens": 16895373.0, "reward": 0.7851153612136841, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7851153612136841, "rewards/rollout_reward_func/std": 0.2486337274312973, "sampling/importance_sampling_ratio/max": 0.9955120086669922, "sampling/importance_sampling_ratio/mean": 0.9914456009864807, "sampling/importance_sampling_ratio/min": 0.9829297661781311, "sampling/sampling_logp_difference/max": 0.009144067764282227, "sampling/sampling_logp_difference/mean": 0.0017264019697904587, "step": 1939, "step_time": 4.24154285799159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07281825225800276, "epoch": 0.0194, "grad_norm": 0.0005314575973898172, "kl": 0.44002513214945793, "learning_rate": 9.998322376324535e-06, "loss": 0.0015, "step": 1940, "step_time": 2.497779449004156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 119.5625, "completions/mean_terminated_length": 119.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.26768921967595816, "epoch": 0.01941, "frac_reward_zero_std": 0.75, "grad_norm": 0.08183490484952927, "kl": 0.6031887419521809, "learning_rate": 9.998320613783604e-06, "loss": 0.0281, "num_tokens": 16909967.0, "reward": 0.7572596073150635, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 0.7572596073150635, "rewards/rollout_reward_func/std": 0.2240203320980072, "sampling/importance_sampling_ratio/max": 0.9976627826690674, "sampling/importance_sampling_ratio/mean": 0.9595493078231812, "sampling/importance_sampling_ratio/min": 0.0917583480477333, "sampling/sampling_logp_difference/max": 0.8238482475280762, "sampling/sampling_logp_difference/mean": 0.016102727502584457, "step": 1941, "step_time": 4.165700880002987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26065885927528143, "epoch": 0.01942, "grad_norm": 0.08354318141937256, "kl": 0.6022570170462132, "learning_rate": 9.998318850317487e-06, "loss": 0.028, "step": 1942, "step_time": 1.988595623006404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14416463486850262, "epoch": 0.01943, "frac_reward_zero_std": 0.75, "grad_norm": 0.014383349567651749, "kl": 0.5016929320991039, "learning_rate": 9.998317085926187e-06, "loss": -0.0253, "num_tokens": 16925717.0, "reward": 0.7342548370361328, "reward_std": 0.05106126517057419, "rewards/rollout_reward_func/mean": 0.7342548370361328, "rewards/rollout_reward_func/std": 0.37409067153930664, "sampling/importance_sampling_ratio/max": 1.0004465579986572, "sampling/importance_sampling_ratio/mean": 0.9638197422027588, "sampling/importance_sampling_ratio/min": 0.10400578379631042, "sampling/sampling_logp_difference/max": 2.1901354789733887, "sampling/sampling_logp_difference/mean": 0.014012726955115795, "step": 1943, "step_time": 4.34987563300092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.1446883650496602, "epoch": 0.01944, "grad_norm": 0.01630898006260395, "kl": 0.5073573477566242, "learning_rate": 9.998315320609702e-06, "loss": -0.0253, "step": 1944, "step_time": 1.9297722409974085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 80.625, "completions/mean_terminated_length": 80.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6963820196688175, "epoch": 0.01945, "frac_reward_zero_std": 0.25, "grad_norm": 0.04097757115960121, "kl": 1.0086322501301765, "learning_rate": 9.998313554368034e-06, "loss": -0.0418, "num_tokens": 16939241.0, "reward": 0.7492788434028625, "reward_std": 0.13693395256996155, "rewards/rollout_reward_func/mean": 0.7492788434028625, "rewards/rollout_reward_func/std": 0.23545336723327637, "sampling/importance_sampling_ratio/max": 0.9986224174499512, "sampling/importance_sampling_ratio/mean": 0.9027401208877563, "sampling/importance_sampling_ratio/min": 0.01797393523156643, "sampling/sampling_logp_difference/max": 2.3597376346588135, "sampling/sampling_logp_difference/mean": 0.11516322195529938, "step": 1945, "step_time": 3.9714320770071936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6893877927213907, "epoch": 0.01946, "grad_norm": 0.03957675024867058, "kl": 1.0348745062947273, "learning_rate": 9.998311787201181e-06, "loss": -0.0418, "step": 1946, "step_time": 2.4863677190005546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 147.1875, "completions/mean_terminated_length": 147.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2602288508787751, "epoch": 0.01947, "frac_reward_zero_std": 0.75, "grad_norm": 0.010292931459844112, "kl": 0.46473217383027077, "learning_rate": 9.998310019109147e-06, "loss": -0.0258, "num_tokens": 16954663.0, "reward": 0.671668291091919, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.671668291091919, "rewards/rollout_reward_func/std": 0.40671879053115845, "sampling/importance_sampling_ratio/max": 0.9984300136566162, "sampling/importance_sampling_ratio/mean": 0.9416199922561646, "sampling/importance_sampling_ratio/min": 0.037894222885370255, "sampling/sampling_logp_difference/max": 1.7066510915756226, "sampling/sampling_logp_difference/mean": 0.03161393105983734, "step": 1947, "step_time": 3.975891013004002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26292769936844707, "epoch": 0.01948, "grad_norm": 0.00955088622868061, "kl": 0.4677176885306835, "learning_rate": 9.99830825009193e-06, "loss": -0.0258, "step": 1948, "step_time": 2.038867116010806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.59375, "completions/mean_terminated_length": 189.59375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.38924034778028727, "epoch": 0.01949, "frac_reward_zero_std": 0.5, "grad_norm": 0.02395815961062908, "kl": 0.47494248300790787, "learning_rate": 9.998306480149534e-06, "loss": 0.0137, "num_tokens": 16971498.0, "reward": 0.9224230647087097, "reward_std": 0.024476777762174606, "rewards/rollout_reward_func/mean": 0.9224230647087097, "rewards/rollout_reward_func/std": 0.3072836697101593, "sampling/importance_sampling_ratio/max": 0.9979978799819946, "sampling/importance_sampling_ratio/mean": 0.9363964796066284, "sampling/importance_sampling_ratio/min": 2.5787244561925604e-10, "sampling/sampling_logp_difference/max": 3.6643757820129395, "sampling/sampling_logp_difference/mean": 0.1070675402879715, "step": 1949, "step_time": 4.767322543004411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38933259155601263, "epoch": 0.0195, "grad_norm": 0.022740107029676437, "kl": 0.47988636791706085, "learning_rate": 9.998304709281952e-06, "loss": 0.0137, "step": 1950, "step_time": 2.014459468991845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.3125, "completions/mean_terminated_length": 122.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.46509452629834414, "epoch": 0.01951, "frac_reward_zero_std": 0.5, "grad_norm": 0.28790631890296936, "kl": 0.7835483066737652, "learning_rate": 9.998302937489192e-06, "loss": -0.0078, "num_tokens": 16986180.0, "reward": 0.5947644114494324, "reward_std": 0.07306315749883652, "rewards/rollout_reward_func/mean": 0.5947644114494324, "rewards/rollout_reward_func/std": 0.37581366300582886, "sampling/importance_sampling_ratio/max": 1.0498541593551636, "sampling/importance_sampling_ratio/mean": 0.9284765124320984, "sampling/importance_sampling_ratio/min": 0.00610563438385725, "sampling/sampling_logp_difference/max": 2.77675724029541, "sampling/sampling_logp_difference/mean": 0.06792394071817398, "step": 1951, "step_time": 3.6008620989887277 }, { "clip_ratio/high_max": 0.03750000149011612, "clip_ratio/high_mean": 0.01875000074505806, "clip_ratio/low_mean": 0.04843750037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0671875006519258, "entropy": 0.5115638449788094, "epoch": 0.01952, "grad_norm": 0.3825843036174774, "kl": 0.7546615973114967, "learning_rate": 9.998301164771252e-06, "loss": -0.0093, "step": 1952, "step_time": 2.4251610879873624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 106.8125, "completions/mean_terminated_length": 106.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4964587949216366, "epoch": 0.01953, "frac_reward_zero_std": 0.75, "grad_norm": 0.01092950813472271, "kl": 0.5658315233886242, "learning_rate": 9.998299391128132e-06, "loss": -0.0075, "num_tokens": 17000366.0, "reward": 0.7619711756706238, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.7619711756706238, "rewards/rollout_reward_func/std": 0.26876094937324524, "sampling/importance_sampling_ratio/max": 0.9970987439155579, "sampling/importance_sampling_ratio/mean": 0.9310363531112671, "sampling/importance_sampling_ratio/min": 0.009615878574550152, "sampling/sampling_logp_difference/max": 1.5242595672607422, "sampling/sampling_logp_difference/mean": 0.08336267620325089, "step": 1953, "step_time": 3.914967629003513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4993294030427933, "epoch": 0.01954, "grad_norm": 0.01053064875304699, "kl": 0.5658372528851032, "learning_rate": 9.998297616559831e-06, "loss": -0.0075, "step": 1954, "step_time": 2.010139534009795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.25054808240383863, "epoch": 0.01955, "frac_reward_zero_std": 0.5, "grad_norm": 0.034663282334804535, "kl": 0.6552061252295971, "learning_rate": 9.99829584106635e-06, "loss": -0.0501, "num_tokens": 17015890.0, "reward": 0.7496153712272644, "reward_std": 0.03807498514652252, "rewards/rollout_reward_func/mean": 0.7496153712272644, "rewards/rollout_reward_func/std": 0.2307298630475998, "sampling/importance_sampling_ratio/max": 0.9974820017814636, "sampling/importance_sampling_ratio/mean": 0.9356355667114258, "sampling/importance_sampling_ratio/min": 0.05659661069512367, "sampling/sampling_logp_difference/max": 2.0595626831054688, "sampling/sampling_logp_difference/mean": 0.028744852170348167, "step": 1955, "step_time": 4.126009694999084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24824599362909794, "epoch": 0.01956, "grad_norm": 0.03158736228942871, "kl": 0.6562101021409035, "learning_rate": 9.998294064647693e-06, "loss": -0.0502, "step": 1956, "step_time": 2.4389604680181947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 171.96875, "completions/mean_terminated_length": 171.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2661920776590705, "epoch": 0.01957, "frac_reward_zero_std": 0.75, "grad_norm": 0.008885967545211315, "kl": 0.5221461169421673, "learning_rate": 9.998292287303857e-06, "loss": -0.0269, "num_tokens": 17032161.0, "reward": 0.9441298246383667, "reward_std": 0.010946566238999367, "rewards/rollout_reward_func/mean": 0.9441298246383667, "rewards/rollout_reward_func/std": 0.32017895579338074, "sampling/importance_sampling_ratio/max": 1.01192045211792, "sampling/importance_sampling_ratio/mean": 0.9655469059944153, "sampling/importance_sampling_ratio/min": 8.694689313415438e-05, "sampling/sampling_logp_difference/max": 3.214167833328247, "sampling/sampling_logp_difference/mean": 0.05752367526292801, "step": 1957, "step_time": 4.638165339994885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2681763730943203, "epoch": 0.01958, "grad_norm": 0.009779232554137707, "kl": 0.5276719778776169, "learning_rate": 9.99829050903484e-06, "loss": -0.0269, "step": 1958, "step_time": 2.011344693004503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07242080383002758, "epoch": 0.01959, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004971857415512204, "kl": 0.4133491553366184, "learning_rate": 9.998288729840651e-06, "loss": 0.0013, "num_tokens": 17047561.0, "reward": 0.6978076696395874, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6978076696395874, "rewards/rollout_reward_func/std": 0.2752751111984253, "sampling/importance_sampling_ratio/max": 0.9979498386383057, "sampling/importance_sampling_ratio/mean": 0.9934554696083069, "sampling/importance_sampling_ratio/min": 0.9891226887702942, "sampling/sampling_logp_difference/max": 0.008700132369995117, "sampling/sampling_logp_difference/mean": 0.0014869996812194586, "step": 1959, "step_time": 4.047763193004357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07283637579530478, "epoch": 0.0196, "grad_norm": 0.0004941393854096532, "kl": 0.4132792577147484, "learning_rate": 9.998286949721281e-06, "loss": 0.0013, "step": 1960, "step_time": 2.0040695189964026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.11366900242865086, "epoch": 0.01961, "frac_reward_zero_std": 0.75, "grad_norm": 0.009315533563494682, "kl": 0.5418121963739395, "learning_rate": 9.998285168676736e-06, "loss": -0.0329, "num_tokens": 17065141.0, "reward": 1.0075912475585938, "reward_std": 0.010946566238999367, "rewards/rollout_reward_func/mean": 1.0075912475585938, "rewards/rollout_reward_func/std": 0.2824409604072571, "sampling/importance_sampling_ratio/max": 0.9970142841339111, "sampling/importance_sampling_ratio/mean": 0.9633313417434692, "sampling/importance_sampling_ratio/min": 0.08302631229162216, "sampling/sampling_logp_difference/max": 2.2797038555145264, "sampling/sampling_logp_difference/mean": 0.01244304422289133, "step": 1961, "step_time": 4.478416373996879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11470554210245609, "epoch": 0.01962, "grad_norm": 0.008505609817802906, "kl": 0.5446711592376232, "learning_rate": 9.998283386707014e-06, "loss": -0.0329, "step": 1962, "step_time": 2.482226626998454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 80.09375, "completions/mean_terminated_length": 80.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19360649678856134, "epoch": 0.01963, "frac_reward_zero_std": 0.75, "grad_norm": 0.05487649887800217, "kl": 0.6332036182284355, "learning_rate": 9.998281603812117e-06, "loss": 0.0153, "num_tokens": 17078528.0, "reward": 0.7363942265510559, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 0.7363942265510559, "rewards/rollout_reward_func/std": 0.13764333724975586, "sampling/importance_sampling_ratio/max": 0.9963550567626953, "sampling/importance_sampling_ratio/mean": 0.9639598727226257, "sampling/importance_sampling_ratio/min": 0.17660103738307953, "sampling/sampling_logp_difference/max": 1.377039909362793, "sampling/sampling_logp_difference/mean": 0.01713281124830246, "step": 1963, "step_time": 4.2179899780021515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19461738504469395, "epoch": 0.01964, "grad_norm": 0.05501183494925499, "kl": 0.6335643976926804, "learning_rate": 9.998279819992045e-06, "loss": 0.0153, "step": 1964, "step_time": 1.996016958997643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 117.09375, "completions/mean_terminated_length": 117.09375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.4624760979786515, "epoch": 0.01965, "frac_reward_zero_std": 0.5, "grad_norm": 0.00636576022952795, "kl": 0.5738932304084301, "learning_rate": 9.998278035246798e-06, "loss": -0.0462, "num_tokens": 17093131.0, "reward": 0.8589423298835754, "reward_std": 0.06282372027635574, "rewards/rollout_reward_func/mean": 0.8589423298835754, "rewards/rollout_reward_func/std": 0.28961288928985596, "sampling/importance_sampling_ratio/max": 0.9990653395652771, "sampling/importance_sampling_ratio/mean": 0.9327444434165955, "sampling/importance_sampling_ratio/min": 1.58050705856283e-09, "sampling/sampling_logp_difference/max": 15.676072120666504, "sampling/sampling_logp_difference/mean": 0.1744762361049652, "step": 1965, "step_time": 3.9152677599922754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4645443167537451, "epoch": 0.01966, "grad_norm": 0.006190860643982887, "kl": 0.5696958303451538, "learning_rate": 9.998276249576377e-06, "loss": -0.0462, "step": 1966, "step_time": 1.988909302999673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.06512486282736063, "epoch": 0.01967, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043598079355433583, "kl": 0.4154646396636963, "learning_rate": 9.998274462980782e-06, "loss": 0.0013, "num_tokens": 17108603.0, "reward": 0.7073076963424683, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7073076963424683, "rewards/rollout_reward_func/std": 0.29354342818260193, "sampling/importance_sampling_ratio/max": 0.9981032609939575, "sampling/importance_sampling_ratio/mean": 0.9917911291122437, "sampling/importance_sampling_ratio/min": 0.9816240668296814, "sampling/sampling_logp_difference/max": 0.010899640619754791, "sampling/sampling_logp_difference/mean": 0.001690130913630128, "step": 1967, "step_time": 4.384639457006415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06552700884640217, "epoch": 0.01968, "grad_norm": 0.0004432684800121933, "kl": 0.4153778851032257, "learning_rate": 9.998272675460014e-06, "loss": 0.0013, "step": 1968, "step_time": 2.0370030639969627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 85.90625, "completions/mean_terminated_length": 85.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7062176670879126, "epoch": 0.01969, "frac_reward_zero_std": 0.25, "grad_norm": 0.10085167735815048, "kl": 0.730421245098114, "learning_rate": 9.998270887014071e-06, "loss": 0.0212, "num_tokens": 17122176.0, "reward": 0.48423075675964355, "reward_std": 0.05194514989852905, "rewards/rollout_reward_func/mean": 0.48423075675964355, "rewards/rollout_reward_func/std": 0.4037240147590637, "sampling/importance_sampling_ratio/max": 0.9950609803199768, "sampling/importance_sampling_ratio/mean": 0.9071742296218872, "sampling/importance_sampling_ratio/min": 0.00042518723057582974, "sampling/sampling_logp_difference/max": 1.881084680557251, "sampling/sampling_logp_difference/mean": 0.11493711173534393, "step": 1969, "step_time": 4.2447926300010295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.689830326475203, "epoch": 0.0197, "grad_norm": 0.09813563525676727, "kl": 0.7172835618257523, "learning_rate": 9.998269097642959e-06, "loss": 0.0208, "step": 1970, "step_time": 2.0357465079941903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.0668472102843225, "epoch": 0.01971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005310673150233924, "kl": 0.39984671398997307, "learning_rate": 9.998267307346673e-06, "loss": 0.0014, "num_tokens": 17138360.0, "reward": 0.8497307300567627, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8497307300567627, "rewards/rollout_reward_func/std": 0.33162015676498413, "sampling/importance_sampling_ratio/max": 0.9976913928985596, "sampling/importance_sampling_ratio/mean": 0.9932599067687988, "sampling/importance_sampling_ratio/min": 0.988815426826477, "sampling/sampling_logp_difference/max": 0.007646389305591583, "sampling/sampling_logp_difference/mean": 0.001363134477287531, "step": 1971, "step_time": 4.259356539987493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0693562994711101, "epoch": 0.01972, "grad_norm": 0.0005529270856641233, "kl": 0.3993835113942623, "learning_rate": 9.998265516125213e-06, "loss": 0.0014, "step": 1972, "step_time": 2.0376676879968727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 155.84375, "completions/mean_terminated_length": 155.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5493725771084428, "epoch": 0.01973, "frac_reward_zero_std": 0.5, "grad_norm": 0.013776956126093864, "kl": 0.5034861713647842, "learning_rate": 9.998263723978585e-06, "loss": -0.036, "num_tokens": 17154115.0, "reward": 0.9177548289299011, "reward_std": 0.04170570895075798, "rewards/rollout_reward_func/mean": 0.9177548289299011, "rewards/rollout_reward_func/std": 0.24006158113479614, "sampling/importance_sampling_ratio/max": 0.9994460940361023, "sampling/importance_sampling_ratio/mean": 0.9307718276977539, "sampling/importance_sampling_ratio/min": 1.727838935039472e-05, "sampling/sampling_logp_difference/max": 1.7558839321136475, "sampling/sampling_logp_difference/mean": 0.08840354532003403, "step": 1973, "step_time": 4.658357954976964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5382881034165621, "epoch": 0.01974, "grad_norm": 0.01473134383559227, "kl": 0.49523259699344635, "learning_rate": 9.998261930906784e-06, "loss": -0.036, "step": 1974, "step_time": 2.5168722749949666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 179.3125, "completions/mean_terminated_length": 179.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20046882797032595, "epoch": 0.01975, "frac_reward_zero_std": 0.75, "grad_norm": 0.0975799635052681, "kl": 0.41226566955447197, "learning_rate": 9.998260136909814e-06, "loss": -0.026, "num_tokens": 17170477.0, "reward": 0.6142500042915344, "reward_std": 0.0030267168767750263, "rewards/rollout_reward_func/mean": 0.6142500042915344, "rewards/rollout_reward_func/std": 0.33184221386909485, "sampling/importance_sampling_ratio/max": 0.9956982731819153, "sampling/importance_sampling_ratio/mean": 0.9533510804176331, "sampling/importance_sampling_ratio/min": 0.36428916454315186, "sampling/sampling_logp_difference/max": 0.8164523243904114, "sampling/sampling_logp_difference/mean": 0.01199693325906992, "step": 1975, "step_time": 4.250303098997392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20177250169217587, "epoch": 0.01976, "grad_norm": 0.10197248309850693, "kl": 0.41386203467845917, "learning_rate": 9.998258341987673e-06, "loss": -0.0261, "step": 1976, "step_time": 2.0102775880150148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 145.34375, "completions/mean_terminated_length": 145.34375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3002017252147198, "epoch": 0.01977, "frac_reward_zero_std": 0.5, "grad_norm": 0.03914771229028702, "kl": 0.5325799249112606, "learning_rate": 9.998256546140363e-06, "loss": 0.004, "num_tokens": 17185920.0, "reward": 0.6477547883987427, "reward_std": 0.17069830000400543, "rewards/rollout_reward_func/mean": 0.6477547883987427, "rewards/rollout_reward_func/std": 0.41141611337661743, "sampling/importance_sampling_ratio/max": 0.9983295798301697, "sampling/importance_sampling_ratio/mean": 0.9342338442802429, "sampling/importance_sampling_ratio/min": 0.00010719238343881443, "sampling/sampling_logp_difference/max": 3.0047311782836914, "sampling/sampling_logp_difference/mean": 0.0589144341647625, "step": 1977, "step_time": 4.2837695839989465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.2988880453631282, "epoch": 0.01978, "grad_norm": 0.017122061923146248, "kl": 0.5414213761687279, "learning_rate": 9.998254749367885e-06, "loss": 0.0039, "step": 1978, "step_time": 2.0320210790087003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.059807170648127794, "epoch": 0.01979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005546481115743518, "kl": 0.410179540514946, "learning_rate": 9.998252951670238e-06, "loss": 0.0014, "num_tokens": 17202008.0, "reward": 0.8774230480194092, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8774230480194092, "rewards/rollout_reward_func/std": 0.3987748920917511, "sampling/importance_sampling_ratio/max": 0.9980103969573975, "sampling/importance_sampling_ratio/mean": 0.9924474954605103, "sampling/importance_sampling_ratio/min": 0.984167218208313, "sampling/sampling_logp_difference/max": 0.011194178834557533, "sampling/sampling_logp_difference/mean": 0.0014592689694836736, "step": 1979, "step_time": 4.674184068993782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05932261934503913, "epoch": 0.0198, "grad_norm": 0.0005525150918401778, "kl": 0.410274013876915, "learning_rate": 9.998251153047422e-06, "loss": 0.0014, "step": 1980, "step_time": 2.4683249019799405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24855411984026432, "epoch": 0.01981, "frac_reward_zero_std": 0.75, "grad_norm": 0.014296848326921463, "kl": 0.5342022776603699, "learning_rate": 9.99824935349944e-06, "loss": 0.0107, "num_tokens": 17217654.0, "reward": 0.9349230527877808, "reward_std": 0.013598205521702766, "rewards/rollout_reward_func/mean": 0.9349230527877808, "rewards/rollout_reward_func/std": 0.3353347182273865, "sampling/importance_sampling_ratio/max": 0.9976401925086975, "sampling/importance_sampling_ratio/mean": 0.9612524509429932, "sampling/importance_sampling_ratio/min": 0.020312055945396423, "sampling/sampling_logp_difference/max": 1.9296882152557373, "sampling/sampling_logp_difference/mean": 0.03153928741812706, "step": 1981, "step_time": 4.049744780015317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2493234956637025, "epoch": 0.01982, "grad_norm": 0.01397644355893135, "kl": 0.5318287201225758, "learning_rate": 9.998247553026289e-06, "loss": 0.0107, "step": 1982, "step_time": 2.0165722359888605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 97.6875, "completions/mean_terminated_length": 97.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5342803532257676, "epoch": 0.01983, "frac_reward_zero_std": 0.5, "grad_norm": 0.009268886409699917, "kl": 0.7184050604701042, "learning_rate": 9.998245751627972e-06, "loss": -0.0354, "num_tokens": 17231604.0, "reward": 0.7384134531021118, "reward_std": 0.22532229125499725, "rewards/rollout_reward_func/mean": 0.7384134531021118, "rewards/rollout_reward_func/std": 0.4154207110404968, "sampling/importance_sampling_ratio/max": 0.9980008602142334, "sampling/importance_sampling_ratio/mean": 0.9320051670074463, "sampling/importance_sampling_ratio/min": 1.5447879420094068e-08, "sampling/sampling_logp_difference/max": 4.1761274337768555, "sampling/sampling_logp_difference/mean": 0.1829378604888916, "step": 1983, "step_time": 3.8435602719982853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5347264725714922, "epoch": 0.01984, "grad_norm": 0.009410797618329525, "kl": 0.7190547361969948, "learning_rate": 9.99824394930449e-06, "loss": -0.0354, "step": 1984, "step_time": 2.4519071620088653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 126.25, "completions/mean_terminated_length": 126.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.0738941514864564, "epoch": 0.01985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004878097679466009, "kl": 0.42198164761066437, "learning_rate": 9.99824214605584e-06, "loss": 0.0012, "num_tokens": 17246468.0, "reward": 0.6778076887130737, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6778076887130737, "rewards/rollout_reward_func/std": 0.24186331033706665, "sampling/importance_sampling_ratio/max": 0.997938334941864, "sampling/importance_sampling_ratio/mean": 0.9918030500411987, "sampling/importance_sampling_ratio/min": 0.9847722053527832, "sampling/sampling_logp_difference/max": 0.012366004288196564, "sampling/sampling_logp_difference/mean": 0.0018876888789236546, "step": 1985, "step_time": 3.8252603679939057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07370955077931285, "epoch": 0.01986, "grad_norm": 0.000485797761939466, "kl": 0.42202699556946754, "learning_rate": 9.998240341882025e-06, "loss": 0.0012, "step": 1986, "step_time": 2.4473140240079374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7301807552576065, "epoch": 0.01987, "frac_reward_zero_std": 0.25, "grad_norm": 0.06826220452785492, "kl": 0.6062050387263298, "learning_rate": 9.998238536783045e-06, "loss": 0.0563, "num_tokens": 17263860.0, "reward": 0.7405865788459778, "reward_std": 0.02773917093873024, "rewards/rollout_reward_func/mean": 0.7405865788459778, "rewards/rollout_reward_func/std": 0.42019981145858765, "sampling/importance_sampling_ratio/max": 0.997133731842041, "sampling/importance_sampling_ratio/mean": 0.8748296499252319, "sampling/importance_sampling_ratio/min": 8.349036472078556e-16, "sampling/sampling_logp_difference/max": 2.695267677307129, "sampling/sampling_logp_difference/mean": 0.19276374578475952, "step": 1987, "step_time": 4.310180632994161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.724446717184037, "epoch": 0.01988, "grad_norm": 0.06950357556343079, "kl": 0.5943744368851185, "learning_rate": 9.998236730758902e-06, "loss": 0.0561, "step": 1988, "step_time": 2.0497441240004264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 196.03125, "completions/mean_terminated_length": 196.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5742259556427598, "epoch": 0.01989, "frac_reward_zero_std": 0.25, "grad_norm": 0.05676422268152237, "kl": 0.5017736069858074, "learning_rate": 9.998234923809592e-06, "loss": -0.0465, "num_tokens": 17280845.0, "reward": 0.6980192065238953, "reward_std": 0.036293670535087585, "rewards/rollout_reward_func/mean": 0.6980192065238953, "rewards/rollout_reward_func/std": 0.41865772008895874, "sampling/importance_sampling_ratio/max": 0.9984753727912903, "sampling/importance_sampling_ratio/mean": 0.8624382615089417, "sampling/importance_sampling_ratio/min": 2.6865753170568496e-05, "sampling/sampling_logp_difference/max": 2.3012917041778564, "sampling/sampling_logp_difference/mean": 0.0918322205543518, "step": 1989, "step_time": 4.272199109007488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5621412838809192, "epoch": 0.0199, "grad_norm": 0.05535884201526642, "kl": 0.4935809224843979, "learning_rate": 9.99823311593512e-06, "loss": -0.0466, "step": 1990, "step_time": 2.5312924080135417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2380963684991002, "epoch": 0.01991, "frac_reward_zero_std": 0.5, "grad_norm": 0.11034736782312393, "kl": 0.4628691300749779, "learning_rate": 9.998231307135483e-06, "loss": -0.0564, "num_tokens": 17299201.0, "reward": 1.0566587448120117, "reward_std": 0.05742524191737175, "rewards/rollout_reward_func/mean": 1.0566587448120117, "rewards/rollout_reward_func/std": 0.33376166224479675, "sampling/importance_sampling_ratio/max": 1.0087404251098633, "sampling/importance_sampling_ratio/mean": 0.9435611963272095, "sampling/importance_sampling_ratio/min": 0.13685880601406097, "sampling/sampling_logp_difference/max": 1.4683599472045898, "sampling/sampling_logp_difference/mean": 0.0182797871530056, "step": 1991, "step_time": 4.2954270579939475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23891395796090364, "epoch": 0.01992, "grad_norm": 0.11022135615348816, "kl": 0.4616767019033432, "learning_rate": 9.998229497410686e-06, "loss": -0.0563, "step": 1992, "step_time": 2.0180184919954627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3821779889985919, "epoch": 0.01993, "frac_reward_zero_std": 0.75, "grad_norm": 0.0052868821658194065, "kl": 0.7123829275369644, "learning_rate": 9.998227686760726e-06, "loss": -0.0074, "num_tokens": 17314233.0, "reward": 0.7680288553237915, "reward_std": 0.02243703417479992, "rewards/rollout_reward_func/mean": 0.7680288553237915, "rewards/rollout_reward_func/std": 0.21696855127811432, "sampling/importance_sampling_ratio/max": 0.9980087280273438, "sampling/importance_sampling_ratio/mean": 0.9590555429458618, "sampling/importance_sampling_ratio/min": 0.015520691871643066, "sampling/sampling_logp_difference/max": 1.7254770994186401, "sampling/sampling_logp_difference/mean": 0.03571934252977371, "step": 1993, "step_time": 3.904566129014711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3840827299281955, "epoch": 0.01994, "grad_norm": 0.004520401358604431, "kl": 0.6958221085369587, "learning_rate": 9.998225875185604e-06, "loss": -0.0074, "step": 1994, "step_time": 2.0209311399958096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.96875, "completions/mean_terminated_length": 169.51612854003906, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7433792240917683, "epoch": 0.01995, "frac_reward_zero_std": 0.25, "grad_norm": 0.4802533984184265, "kl": 0.4864877462387085, "learning_rate": 9.99822406268532e-06, "loss": -0.0767, "num_tokens": 17330384.0, "reward": 0.7444999814033508, "reward_std": 0.1306515783071518, "rewards/rollout_reward_func/mean": 0.7444999814033508, "rewards/rollout_reward_func/std": 0.41516414284706116, "sampling/importance_sampling_ratio/max": 1.011826992034912, "sampling/importance_sampling_ratio/mean": 0.880294919013977, "sampling/importance_sampling_ratio/min": 2.235061907687047e-13, "sampling/sampling_logp_difference/max": 3.6274609565734863, "sampling/sampling_logp_difference/mean": 0.22211411595344543, "step": 1995, "step_time": 4.350337772986677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.7501991409808397, "epoch": 0.01996, "grad_norm": 0.018429933115839958, "kl": 0.5190138071775436, "learning_rate": 9.998222249259874e-06, "loss": -0.0771, "step": 1996, "step_time": 2.482230942005117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6868816036731005, "epoch": 0.01997, "frac_reward_zero_std": 0.25, "grad_norm": 0.0321163684129715, "kl": 0.686522476375103, "learning_rate": 9.99822043490927e-06, "loss": -0.044, "num_tokens": 17345412.0, "reward": 0.867892861366272, "reward_std": 0.05197642371058464, "rewards/rollout_reward_func/mean": 0.867892861366272, "rewards/rollout_reward_func/std": 0.28818655014038086, "sampling/importance_sampling_ratio/max": 0.9953957200050354, "sampling/importance_sampling_ratio/mean": 0.9007372260093689, "sampling/importance_sampling_ratio/min": 0.00020600289280992, "sampling/sampling_logp_difference/max": 1.8950116634368896, "sampling/sampling_logp_difference/mean": 0.11053943634033203, "step": 1997, "step_time": 4.748455538014241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6836779061704874, "epoch": 0.01998, "grad_norm": 0.029896089807152748, "kl": 0.702553778886795, "learning_rate": 9.998218619633505e-06, "loss": -0.044, "step": 1998, "step_time": 2.0462819629974547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 149.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.22063126508146524, "epoch": 0.01999, "frac_reward_zero_std": 0.75, "grad_norm": 0.01623011939227581, "kl": 0.509683471173048, "learning_rate": 9.99821680343258e-06, "loss": 0.0203, "num_tokens": 17361082.0, "reward": 0.6358797550201416, "reward_std": 0.002488473430275917, "rewards/rollout_reward_func/mean": 0.6358797550201416, "rewards/rollout_reward_func/std": 0.23243136703968048, "sampling/importance_sampling_ratio/max": 1.0127586126327515, "sampling/importance_sampling_ratio/mean": 0.9673901200294495, "sampling/importance_sampling_ratio/min": 0.013594591990113258, "sampling/sampling_logp_difference/max": 1.9537522792816162, "sampling/sampling_logp_difference/mean": 0.028699448332190514, "step": 1999, "step_time": 3.933015873990371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2199585447087884, "epoch": 0.02, "grad_norm": 0.016027648001909256, "kl": 0.5080794394016266, "learning_rate": 9.998214986306494e-06, "loss": 0.0203, "step": 2000, "step_time": 1.9984878289978951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.17369648767635226, "epoch": 0.02001, "frac_reward_zero_std": 0.75, "grad_norm": 0.012439731508493423, "kl": 0.40797457098960876, "learning_rate": 9.998213168255252e-06, "loss": -0.0172, "num_tokens": 17377290.0, "reward": 0.9414903521537781, "reward_std": 0.008838835172355175, "rewards/rollout_reward_func/mean": 0.9414903521537781, "rewards/rollout_reward_func/std": 0.2671399712562561, "sampling/importance_sampling_ratio/max": 0.9976761937141418, "sampling/importance_sampling_ratio/mean": 0.9626553058624268, "sampling/importance_sampling_ratio/min": 0.016402993351221085, "sampling/sampling_logp_difference/max": 2.5957791805267334, "sampling/sampling_logp_difference/mean": 0.027117159217596054, "step": 2001, "step_time": 4.117125471006148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1726176766678691, "epoch": 0.02002, "grad_norm": 0.011776519939303398, "kl": 0.4094548486173153, "learning_rate": 9.99821134927885e-06, "loss": -0.0172, "step": 2002, "step_time": 2.4997636139960377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 98.5625, "completions/mean_terminated_length": 98.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4001256115734577, "epoch": 0.02003, "frac_reward_zero_std": 0.5, "grad_norm": 0.02919500693678856, "kl": 0.5703718513250351, "learning_rate": 9.998209529377293e-06, "loss": -0.0092, "num_tokens": 17391300.0, "reward": 0.6165754795074463, "reward_std": 0.040417950600385666, "rewards/rollout_reward_func/mean": 0.6165754795074463, "rewards/rollout_reward_func/std": 0.1163865402340889, "sampling/importance_sampling_ratio/max": 0.9982806444168091, "sampling/importance_sampling_ratio/mean": 0.9328080415725708, "sampling/importance_sampling_ratio/min": 0.0021343373227864504, "sampling/sampling_logp_difference/max": 2.0863053798675537, "sampling/sampling_logp_difference/mean": 0.07011888921260834, "step": 2003, "step_time": 4.394378691009479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.395312356762588, "epoch": 0.02004, "grad_norm": 0.02920965850353241, "kl": 0.5743094757199287, "learning_rate": 9.998207708550573e-06, "loss": -0.0092, "step": 2004, "step_time": 2.048094902005687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 208.28125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.14803016372025013, "epoch": 0.02005, "frac_reward_zero_std": 0.75, "grad_norm": 0.027909228578209877, "kl": 0.7740930207073689, "learning_rate": 9.9982058867987e-06, "loss": -0.0294, "num_tokens": 17408589.0, "reward": 0.6149759292602539, "reward_std": 0.1576842963695526, "rewards/rollout_reward_func/mean": 0.6149759292602539, "rewards/rollout_reward_func/std": 0.4085465669631958, "sampling/importance_sampling_ratio/max": 0.9949964880943298, "sampling/importance_sampling_ratio/mean": 0.9344276785850525, "sampling/importance_sampling_ratio/min": 0.05461690574884415, "sampling/sampling_logp_difference/max": 2.7778687477111816, "sampling/sampling_logp_difference/mean": 0.02189299836754799, "step": 2005, "step_time": 4.738181398999586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1472862046211958, "epoch": 0.02006, "grad_norm": 0.030881362035870552, "kl": 0.79154197499156, "learning_rate": 9.998204064121671e-06, "loss": -0.0293, "step": 2006, "step_time": 2.0320797560125357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.615766734816134, "epoch": 0.02007, "frac_reward_zero_std": 0.5, "grad_norm": 0.02325107716023922, "kl": 0.5155012235045433, "learning_rate": 9.998202240519484e-06, "loss": 0.0009, "num_tokens": 17424295.0, "reward": 0.542471170425415, "reward_std": 0.01835758239030838, "rewards/rollout_reward_func/mean": 0.542471170425415, "rewards/rollout_reward_func/std": 0.20373791456222534, "sampling/importance_sampling_ratio/max": 0.9979544878005981, "sampling/importance_sampling_ratio/mean": 0.9331952333450317, "sampling/importance_sampling_ratio/min": 4.232967796263349e-19, "sampling/sampling_logp_difference/max": 4.334904670715332, "sampling/sampling_logp_difference/mean": 0.2743903696537018, "step": 2007, "step_time": 4.482087879987375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.61460514459759, "epoch": 0.02008, "grad_norm": 0.023335032165050507, "kl": 0.504012081772089, "learning_rate": 9.998200415992141e-06, "loss": 0.0009, "step": 2008, "step_time": 3.0221198400104186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23108482686802745, "epoch": 0.02009, "frac_reward_zero_std": 0.75, "grad_norm": 0.07473227381706238, "kl": 0.4799247831106186, "learning_rate": 9.998198590539644e-06, "loss": 0.0323, "num_tokens": 17440601.0, "reward": 0.6694711446762085, "reward_std": 0.018747860565781593, "rewards/rollout_reward_func/mean": 0.6694711446762085, "rewards/rollout_reward_func/std": 0.1790381222963333, "sampling/importance_sampling_ratio/max": 0.999460756778717, "sampling/importance_sampling_ratio/mean": 0.9425047039985657, "sampling/importance_sampling_ratio/min": 0.1363554447889328, "sampling/sampling_logp_difference/max": 1.3746980428695679, "sampling/sampling_logp_difference/mean": 0.024934418499469757, "step": 2009, "step_time": 4.266781713005912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2172650252468884, "epoch": 0.0201, "grad_norm": 0.06882211565971375, "kl": 0.48295804113149643, "learning_rate": 9.998196764161994e-06, "loss": 0.0322, "step": 2010, "step_time": 2.0554310400038958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 122.65625, "completions/mean_terminated_length": 122.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.37746181804686785, "epoch": 0.02011, "frac_reward_zero_std": 0.5, "grad_norm": 0.014804188162088394, "kl": 0.6487613096833229, "learning_rate": 9.998194936859188e-06, "loss": -0.0078, "num_tokens": 17455238.0, "reward": 0.5865865349769592, "reward_std": 0.02651650831103325, "rewards/rollout_reward_func/mean": 0.5865865349769592, "rewards/rollout_reward_func/std": 0.12827855348587036, "sampling/importance_sampling_ratio/max": 0.9956144094467163, "sampling/importance_sampling_ratio/mean": 0.9271233081817627, "sampling/importance_sampling_ratio/min": 0.0069270082749426365, "sampling/sampling_logp_difference/max": 1.7085922956466675, "sampling/sampling_logp_difference/mean": 0.06638062745332718, "step": 2011, "step_time": 3.9808183740024106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3763347305357456, "epoch": 0.02012, "grad_norm": 0.01483008824288845, "kl": 0.6501968502998352, "learning_rate": 9.99819310863123e-06, "loss": -0.0078, "step": 2012, "step_time": 2.045162573995185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09194738697260618, "epoch": 0.02013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005303929792717099, "kl": 0.496636476367712, "learning_rate": 9.998191279478115e-06, "loss": 0.0011, "num_tokens": 17469318.0, "reward": 0.7982692122459412, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7982692122459412, "rewards/rollout_reward_func/std": 0.2223958969116211, "sampling/importance_sampling_ratio/max": 0.9969207048416138, "sampling/importance_sampling_ratio/mean": 0.9910677671432495, "sampling/importance_sampling_ratio/min": 0.9852586388587952, "sampling/sampling_logp_difference/max": 0.010646224021911621, "sampling/sampling_logp_difference/mean": 0.002285701222717762, "step": 2013, "step_time": 3.739123310006107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09389202855527401, "epoch": 0.02014, "grad_norm": 0.0005508078611455858, "kl": 0.4962836876511574, "learning_rate": 9.998189449399849e-06, "loss": 0.0011, "step": 2014, "step_time": 2.501128124000388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 153.61289978027344, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5821972209960222, "epoch": 0.02015, "frac_reward_zero_std": 0.25, "grad_norm": 0.021881740540266037, "kl": 0.5830247886478901, "learning_rate": 9.99818761839643e-06, "loss": -0.07, "num_tokens": 17484880.0, "reward": 0.885937511920929, "reward_std": 0.046301908791065216, "rewards/rollout_reward_func/mean": 0.885937511920929, "rewards/rollout_reward_func/std": 0.31986650824546814, "sampling/importance_sampling_ratio/max": 0.9991850852966309, "sampling/importance_sampling_ratio/mean": 0.9044314622879028, "sampling/importance_sampling_ratio/min": 1.090416599109625e-14, "sampling/sampling_logp_difference/max": 2.9173994064331055, "sampling/sampling_logp_difference/mean": 0.20513835549354553, "step": 2015, "step_time": 4.301637919015775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5824386663734913, "epoch": 0.02016, "grad_norm": 0.02195592038333416, "kl": 0.5826634801924229, "learning_rate": 9.99818578646786e-06, "loss": -0.07, "step": 2016, "step_time": 2.042974634001439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 201.0625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.972468982450664, "epoch": 0.02017, "frac_reward_zero_std": 0.0, "grad_norm": 0.10868015140295029, "kl": 1.132193423807621, "learning_rate": 9.998183953614138e-06, "loss": -0.1238, "num_tokens": 17501994.0, "reward": 0.9043701887130737, "reward_std": 0.29359954595565796, "rewards/rollout_reward_func/mean": 0.9043701887130737, "rewards/rollout_reward_func/std": 0.44543686509132385, "sampling/importance_sampling_ratio/max": 0.9947247505187988, "sampling/importance_sampling_ratio/mean": 0.7912356853485107, "sampling/importance_sampling_ratio/min": 6.823187848059575e-14, "sampling/sampling_logp_difference/max": 3.279271125793457, "sampling/sampling_logp_difference/mean": 0.19617946445941925, "step": 2017, "step_time": 4.815772204005043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9739311644807458, "epoch": 0.02018, "grad_norm": 0.0945805013179779, "kl": 1.0537594482302666, "learning_rate": 9.998182119835265e-06, "loss": -0.1242, "step": 2018, "step_time": 2.0207068259915104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 142.28125, "completions/mean_terminated_length": 142.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18149559292942286, "epoch": 0.02019, "frac_reward_zero_std": 0.75, "grad_norm": 0.05777789652347565, "kl": 0.45757006108760834, "learning_rate": 9.998180285131242e-06, "loss": -0.0235, "num_tokens": 17517227.0, "reward": 0.6809134483337402, "reward_std": 0.002311693038791418, "rewards/rollout_reward_func/mean": 0.6809134483337402, "rewards/rollout_reward_func/std": 0.30411943793296814, "sampling/importance_sampling_ratio/max": 0.9945199489593506, "sampling/importance_sampling_ratio/mean": 0.9646580815315247, "sampling/importance_sampling_ratio/min": 0.18601730465888977, "sampling/sampling_logp_difference/max": 1.0821110010147095, "sampling/sampling_logp_difference/mean": 0.012996208854019642, "step": 2019, "step_time": 4.652377065009205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.186161189340055, "epoch": 0.0202, "grad_norm": 0.05731310695409775, "kl": 0.45871224254369736, "learning_rate": 9.998178449502067e-06, "loss": -0.0236, "step": 2020, "step_time": 2.4265310920018237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 208.40625, "completions/mean_terminated_length": 208.40625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.13790826592594385, "epoch": 0.02021, "frac_reward_zero_std": 0.75, "grad_norm": 0.07722116261720657, "kl": 0.4179328605532646, "learning_rate": 9.998176612947743e-06, "loss": -0.0221, "num_tokens": 17534576.0, "reward": 0.6844903826713562, "reward_std": 0.002311693038791418, "rewards/rollout_reward_func/mean": 0.6844903826713562, "rewards/rollout_reward_func/std": 0.443318635225296, "sampling/importance_sampling_ratio/max": 0.9971197247505188, "sampling/importance_sampling_ratio/mean": 0.9692606925964355, "sampling/importance_sampling_ratio/min": 0.24016603827476501, "sampling/sampling_logp_difference/max": 0.8927474617958069, "sampling/sampling_logp_difference/mean": 0.00778828002512455, "step": 2021, "step_time": 4.434676619996026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14798871893435717, "epoch": 0.02022, "grad_norm": 0.08044838905334473, "kl": 0.42192526161670685, "learning_rate": 9.99817477546827e-06, "loss": -0.0226, "step": 2022, "step_time": 2.050764596009685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 148.03125, "completions/mean_terminated_length": 148.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6331143910065293, "epoch": 0.02023, "frac_reward_zero_std": 0.5, "grad_norm": 0.007240065839141607, "kl": 0.4763074517250061, "learning_rate": 9.998172937063648e-06, "loss": -0.0592, "num_tokens": 17550161.0, "reward": 0.49764904379844666, "reward_std": 0.019169151782989502, "rewards/rollout_reward_func/mean": 0.49764904379844666, "rewards/rollout_reward_func/std": 0.24049897491931915, "sampling/importance_sampling_ratio/max": 1.0060334205627441, "sampling/importance_sampling_ratio/mean": 0.905142605304718, "sampling/importance_sampling_ratio/min": 0.00037545961095020175, "sampling/sampling_logp_difference/max": 3.115236759185791, "sampling/sampling_logp_difference/mean": 0.12271785736083984, "step": 2023, "step_time": 4.1613527619992965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6504950216040015, "epoch": 0.02024, "grad_norm": 0.007337147369980812, "kl": 0.48802217468619347, "learning_rate": 9.99817109773388e-06, "loss": -0.0592, "step": 2024, "step_time": 2.0362034870049683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 206.96875, "completions/mean_terminated_length": 206.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5133201293647289, "epoch": 0.02025, "frac_reward_zero_std": 0.25, "grad_norm": 0.027949176728725433, "kl": 0.629235677421093, "learning_rate": 9.99816925747896e-06, "loss": -0.0955, "num_tokens": 17567408.0, "reward": 0.900913417339325, "reward_std": 0.09249623119831085, "rewards/rollout_reward_func/mean": 0.900913417339325, "rewards/rollout_reward_func/std": 0.3837510943412781, "sampling/importance_sampling_ratio/max": 1.0004079341888428, "sampling/importance_sampling_ratio/mean": 0.8746727705001831, "sampling/importance_sampling_ratio/min": 0.0029899757355451584, "sampling/sampling_logp_difference/max": 2.2160940170288086, "sampling/sampling_logp_difference/mean": 0.0823529064655304, "step": 2025, "step_time": 4.801791611993394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5195673797279596, "epoch": 0.02026, "grad_norm": 0.02254806086421013, "kl": 0.6571843810379505, "learning_rate": 9.998167416298896e-06, "loss": -0.0956, "step": 2026, "step_time": 2.517221658010385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 144.71875, "completions/mean_terminated_length": 144.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2826526681892574, "epoch": 0.02027, "frac_reward_zero_std": 0.75, "grad_norm": 0.0068184188567101955, "kl": 0.5368297658860683, "learning_rate": 9.998165574193682e-06, "loss": -0.0175, "num_tokens": 17582719.0, "reward": 0.8149038553237915, "reward_std": 0.020397311076521873, "rewards/rollout_reward_func/mean": 0.8149038553237915, "rewards/rollout_reward_func/std": 0.37589630484580994, "sampling/importance_sampling_ratio/max": 0.9984867572784424, "sampling/importance_sampling_ratio/mean": 0.9635436534881592, "sampling/importance_sampling_ratio/min": 0.0004200295079499483, "sampling/sampling_logp_difference/max": 2.398420810699463, "sampling/sampling_logp_difference/mean": 0.04915240779519081, "step": 2027, "step_time": 3.874190560003626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2813766351900995, "epoch": 0.02028, "grad_norm": 0.006796194240450859, "kl": 0.5385187789797783, "learning_rate": 9.998163731163323e-06, "loss": -0.0175, "step": 2028, "step_time": 2.000243036003667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.14093206357210875, "epoch": 0.02029, "frac_reward_zero_std": 1.0, "grad_norm": 0.012782647274434566, "kl": 0.47223350033164024, "learning_rate": 9.998161887207818e-06, "loss": 0.0013, "num_tokens": 17597559.0, "reward": 0.6830769777297974, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6830769777297974, "rewards/rollout_reward_func/std": 0.13377340137958527, "sampling/importance_sampling_ratio/max": 1.0226260423660278, "sampling/importance_sampling_ratio/mean": 0.9783673286437988, "sampling/importance_sampling_ratio/min": 0.2931842505931854, "sampling/sampling_logp_difference/max": 1.2224372625350952, "sampling/sampling_logp_difference/mean": 0.00975461769849062, "step": 2029, "step_time": 3.8943237270068494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13935894053429365, "epoch": 0.0203, "grad_norm": 0.012076778337359428, "kl": 0.47036853060126305, "learning_rate": 9.998160042327167e-06, "loss": 0.0013, "step": 2030, "step_time": 2.014285136021499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.13483707001432776, "epoch": 0.02031, "frac_reward_zero_std": 0.75, "grad_norm": 0.02384658344089985, "kl": 0.5776313729584217, "learning_rate": 9.998158196521371e-06, "loss": -0.0146, "num_tokens": 17612263.0, "reward": 0.6512500047683716, "reward_std": 0.031275875866413116, "rewards/rollout_reward_func/mean": 0.6512500047683716, "rewards/rollout_reward_func/std": 0.26251763105392456, "sampling/importance_sampling_ratio/max": 0.9985843300819397, "sampling/importance_sampling_ratio/mean": 0.9686335325241089, "sampling/importance_sampling_ratio/min": 0.15651535987854004, "sampling/sampling_logp_difference/max": 2.033393144607544, "sampling/sampling_logp_difference/mean": 0.01859898492693901, "step": 2031, "step_time": 4.369056594005087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13326881313696504, "epoch": 0.02032, "grad_norm": 0.022639790549874306, "kl": 0.5853445641696453, "learning_rate": 9.998156349790431e-06, "loss": -0.0146, "step": 2032, "step_time": 2.471588193984644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.20056115556508303, "epoch": 0.02033, "frac_reward_zero_std": 0.75, "grad_norm": 0.0070771342143416405, "kl": 0.5238875523209572, "learning_rate": 9.998154502134346e-06, "loss": -0.0274, "num_tokens": 17627504.0, "reward": 0.8654615879058838, "reward_std": 0.07897838950157166, "rewards/rollout_reward_func/mean": 0.8654615879058838, "rewards/rollout_reward_func/std": 0.3645413815975189, "sampling/importance_sampling_ratio/max": 0.999961793422699, "sampling/importance_sampling_ratio/mean": 0.9645168781280518, "sampling/importance_sampling_ratio/min": 0.003616258269175887, "sampling/sampling_logp_difference/max": 2.840589761734009, "sampling/sampling_logp_difference/mean": 0.03134828433394432, "step": 2033, "step_time": 4.1189394449975225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19976369384676218, "epoch": 0.02034, "grad_norm": 0.007031391374766827, "kl": 0.5239321812987328, "learning_rate": 9.998152653553116e-06, "loss": -0.0274, "step": 2034, "step_time": 1.988238325000566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05322943674400449, "epoch": 0.02035, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038561763358302414, "kl": 0.3936745934188366, "learning_rate": 9.998150804046745e-06, "loss": 0.0012, "num_tokens": 17643224.0, "reward": 0.9318461418151855, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9318461418151855, "rewards/rollout_reward_func/std": 0.3326389789581299, "sampling/importance_sampling_ratio/max": 0.9976648688316345, "sampling/importance_sampling_ratio/mean": 0.99583899974823, "sampling/importance_sampling_ratio/min": 0.9926496744155884, "sampling/sampling_logp_difference/max": 0.0043019745498895645, "sampling/sampling_logp_difference/mean": 0.0009236233308911324, "step": 2035, "step_time": 4.093184604003909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0524728000164032, "epoch": 0.02036, "grad_norm": 0.0003772661730181426, "kl": 0.39379819855093956, "learning_rate": 9.998148953615228e-06, "loss": 0.0012, "step": 2036, "step_time": 1.9951872370002093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 146.03125, "completions/mean_terminated_length": 146.03125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.40761316753923893, "epoch": 0.02037, "frac_reward_zero_std": 0.5, "grad_norm": 0.017238423228263855, "kl": 0.5925531424582005, "learning_rate": 9.998147102258571e-06, "loss": -0.0171, "num_tokens": 17658633.0, "reward": 0.732519268989563, "reward_std": 0.03154784440994263, "rewards/rollout_reward_func/mean": 0.732519268989563, "rewards/rollout_reward_func/std": 0.40533289313316345, "sampling/importance_sampling_ratio/max": 0.9998846054077148, "sampling/importance_sampling_ratio/mean": 0.9337413311004639, "sampling/importance_sampling_ratio/min": 0.005030445288866758, "sampling/sampling_logp_difference/max": 2.491468906402588, "sampling/sampling_logp_difference/mean": 0.06650814414024353, "step": 2037, "step_time": 4.597415793010441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40831797011196613, "epoch": 0.02038, "grad_norm": 0.01735653169453144, "kl": 0.5923091471195221, "learning_rate": 9.998145249976771e-06, "loss": -0.0171, "step": 2038, "step_time": 2.4927629069861723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.436592856887728, "epoch": 0.02039, "frac_reward_zero_std": 0.75, "grad_norm": 0.012022609822452068, "kl": 0.44128769263625145, "learning_rate": 9.99814339676983e-06, "loss": 0.0208, "num_tokens": 17675649.0, "reward": 1.093644142150879, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 1.093644142150879, "rewards/rollout_reward_func/std": 0.26661449670791626, "sampling/importance_sampling_ratio/max": 0.997032642364502, "sampling/importance_sampling_ratio/mean": 0.9638156890869141, "sampling/importance_sampling_ratio/min": 7.899718960253758e-20, "sampling/sampling_logp_difference/max": 4.083401203155518, "sampling/sampling_logp_difference/mean": 0.2007385939359665, "step": 2039, "step_time": 4.58206480199442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.43715898878872395, "epoch": 0.0204, "grad_norm": 0.011977985501289368, "kl": 0.44117065519094467, "learning_rate": 9.998141542637748e-06, "loss": 0.0208, "step": 2040, "step_time": 2.039923415002704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 311.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 217.45159912109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7410781187936664, "epoch": 0.02041, "frac_reward_zero_std": 0.5, "grad_norm": 0.194931760430336, "kl": 0.5573194846510887, "learning_rate": 9.998139687580526e-06, "loss": 0.0038, "num_tokens": 17693381.0, "reward": 0.7462018728256226, "reward_std": 0.03403817489743233, "rewards/rollout_reward_func/mean": 0.7462018728256226, "rewards/rollout_reward_func/std": 0.1662268340587616, "sampling/importance_sampling_ratio/max": 1.0675156116485596, "sampling/importance_sampling_ratio/mean": 0.9080618619918823, "sampling/importance_sampling_ratio/min": 7.845234576632936e-22, "sampling/sampling_logp_difference/max": 4.4139604568481445, "sampling/sampling_logp_difference/mean": 0.2619973421096802, "step": 2041, "step_time": 4.7989164480241016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7398465601727366, "epoch": 0.02042, "grad_norm": 0.07995791733264923, "kl": 0.575774896889925, "learning_rate": 9.998137831598162e-06, "loss": 0.0035, "step": 2042, "step_time": 2.0446543040088727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04959560604766011, "epoch": 0.02043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003865838225465268, "kl": 0.3836282677948475, "learning_rate": 9.998135974690659e-06, "loss": 0.0012, "num_tokens": 17708501.0, "reward": 0.7476922869682312, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7476922869682312, "rewards/rollout_reward_func/std": 0.2243630290031433, "sampling/importance_sampling_ratio/max": 0.9981999397277832, "sampling/importance_sampling_ratio/mean": 0.995896577835083, "sampling/importance_sampling_ratio/min": 0.9921403527259827, "sampling/sampling_logp_difference/max": 0.00515708327293396, "sampling/sampling_logp_difference/mean": 0.0009099952294491231, "step": 2043, "step_time": 5.1263662520068465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.049337619449943304, "epoch": 0.02044, "grad_norm": 0.0003842447476927191, "kl": 0.3836599141359329, "learning_rate": 9.998134116858017e-06, "loss": 0.0012, "step": 2044, "step_time": 1.9495915649968083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.04011329589411616, "epoch": 0.02045, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035810709232464433, "kl": 0.38928331062197685, "learning_rate": 9.998132258100235e-06, "loss": 0.0014, "num_tokens": 17725637.0, "reward": 0.8961923122406006, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8961923122406006, "rewards/rollout_reward_func/std": 0.39695224165916443, "sampling/importance_sampling_ratio/max": 0.9987621903419495, "sampling/importance_sampling_ratio/mean": 0.9965192079544067, "sampling/importance_sampling_ratio/min": 0.9940021634101868, "sampling/sampling_logp_difference/max": 0.003505355678498745, "sampling/sampling_logp_difference/mean": 0.0008932724595069885, "step": 2045, "step_time": 4.717246460997558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04030669294297695, "epoch": 0.02046, "grad_norm": 0.00035729544470086694, "kl": 0.38925936073064804, "learning_rate": 9.998130398417316e-06, "loss": 0.0014, "step": 2046, "step_time": 2.022896230992046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 170.28125, "completions/mean_terminated_length": 170.28125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2292669708840549, "epoch": 0.02047, "frac_reward_zero_std": 0.75, "grad_norm": 0.011542909778654575, "kl": 0.5177556052803993, "learning_rate": 9.99812853780926e-06, "loss": 0.0206, "num_tokens": 17741822.0, "reward": 0.9677596092224121, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 0.9677596092224121, "rewards/rollout_reward_func/std": 0.2749616205692291, "sampling/importance_sampling_ratio/max": 0.997532308101654, "sampling/importance_sampling_ratio/mean": 0.9625721573829651, "sampling/importance_sampling_ratio/min": 0.003920628223568201, "sampling/sampling_logp_difference/max": 2.6313822269439697, "sampling/sampling_logp_difference/mean": 0.03580692410469055, "step": 2047, "step_time": 4.366107019981428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23077542427927256, "epoch": 0.02048, "grad_norm": 0.011871932074427605, "kl": 0.5190769471228123, "learning_rate": 9.998126676276065e-06, "loss": 0.0206, "step": 2048, "step_time": 2.04938607999793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.052285308949649334, "epoch": 0.02049, "frac_reward_zero_std": 1.0, "grad_norm": 0.00036804613773711026, "kl": 0.44819488376379013, "learning_rate": 9.998124813817733e-06, "loss": 0.0015, "num_tokens": 17758046.0, "reward": 1.0903077125549316, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0903077125549316, "rewards/rollout_reward_func/std": 0.2816295623779297, "sampling/importance_sampling_ratio/max": 0.9988006353378296, "sampling/importance_sampling_ratio/mean": 0.9953986406326294, "sampling/importance_sampling_ratio/min": 0.9899348616600037, "sampling/sampling_logp_difference/max": 0.005612097680568695, "sampling/sampling_logp_difference/mean": 0.0010416469303891063, "step": 2049, "step_time": 5.416567819993361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05308806337416172, "epoch": 0.0205, "grad_norm": 0.0003699580847751349, "kl": 0.4480934180319309, "learning_rate": 9.998122950434266e-06, "loss": 0.0015, "step": 2050, "step_time": 1.9993834269916988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.38112441543489695, "epoch": 0.02051, "frac_reward_zero_std": 0.75, "grad_norm": 0.033731892704963684, "kl": 0.8133393712341785, "learning_rate": 9.998121086125662e-06, "loss": -0.0315, "num_tokens": 17773458.0, "reward": 0.6681919693946838, "reward_std": 0.02099182829260826, "rewards/rollout_reward_func/mean": 0.6681919693946838, "rewards/rollout_reward_func/std": 0.1455044448375702, "sampling/importance_sampling_ratio/max": 0.9986445903778076, "sampling/importance_sampling_ratio/mean": 0.9352009296417236, "sampling/importance_sampling_ratio/min": 2.9409082458187186e-07, "sampling/sampling_logp_difference/max": 2.554673671722412, "sampling/sampling_logp_difference/mean": 0.08831329643726349, "step": 2051, "step_time": 4.639267953993112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3819619561545551, "epoch": 0.02052, "grad_norm": 0.033152949064970016, "kl": 0.8000829666852951, "learning_rate": 9.998119220891924e-06, "loss": -0.0316, "step": 2052, "step_time": 2.0795964829958393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4974652510136366, "epoch": 0.02053, "frac_reward_zero_std": 0.5, "grad_norm": 0.01209619827568531, "kl": 0.5209768190979958, "learning_rate": 9.998117354733048e-06, "loss": -0.0536, "num_tokens": 17789738.0, "reward": 0.6160625219345093, "reward_std": 0.06803182512521744, "rewards/rollout_reward_func/mean": 0.6160625219345093, "rewards/rollout_reward_func/std": 0.41132476925849915, "sampling/importance_sampling_ratio/max": 0.9984483122825623, "sampling/importance_sampling_ratio/mean": 0.9329957962036133, "sampling/importance_sampling_ratio/min": 5.045704857180127e-16, "sampling/sampling_logp_difference/max": 2.896362066268921, "sampling/sampling_logp_difference/mean": 0.17591044306755066, "step": 2053, "step_time": 4.535626532997412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4925378831103444, "epoch": 0.02054, "grad_norm": 0.013378226198256016, "kl": 0.509503960609436, "learning_rate": 9.998115487649039e-06, "loss": -0.0535, "step": 2054, "step_time": 2.4781411590010975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 179.3125, "completions/mean_terminated_length": 179.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.47796698240563273, "epoch": 0.02055, "frac_reward_zero_std": 0.5, "grad_norm": 0.022289926186203957, "kl": 0.646091852337122, "learning_rate": 9.998113619639896e-06, "loss": -0.0074, "num_tokens": 17806300.0, "reward": 0.761269211769104, "reward_std": 0.024476777762174606, "rewards/rollout_reward_func/mean": 0.761269211769104, "rewards/rollout_reward_func/std": 0.24487155675888062, "sampling/importance_sampling_ratio/max": 0.9981555342674255, "sampling/importance_sampling_ratio/mean": 0.930317759513855, "sampling/importance_sampling_ratio/min": 1.854448601079639e-05, "sampling/sampling_logp_difference/max": 2.795414924621582, "sampling/sampling_logp_difference/mean": 0.0942513719201088, "step": 2055, "step_time": 4.795362825992925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4765139017254114, "epoch": 0.02056, "grad_norm": 0.02086765132844448, "kl": 0.6286142058670521, "learning_rate": 9.99811175070562e-06, "loss": -0.0074, "step": 2056, "step_time": 1.9835547459806548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 163.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5705367820337415, "epoch": 0.02057, "frac_reward_zero_std": 0.5, "grad_norm": 0.022523954510688782, "kl": 0.8625187203288078, "learning_rate": 9.998109880846207e-06, "loss": -0.0358, "num_tokens": 17822242.0, "reward": 0.7810769081115723, "reward_std": 0.03154783323407173, "rewards/rollout_reward_func/mean": 0.7810769081115723, "rewards/rollout_reward_func/std": 0.39819425344467163, "sampling/importance_sampling_ratio/max": 0.9970858097076416, "sampling/importance_sampling_ratio/mean": 0.9304708242416382, "sampling/importance_sampling_ratio/min": 7.341747277678223e-06, "sampling/sampling_logp_difference/max": 4.436173915863037, "sampling/sampling_logp_difference/mean": 0.13595786690711975, "step": 2057, "step_time": 4.495370592005202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.571235335431993, "epoch": 0.02058, "grad_norm": 0.019204353913664818, "kl": 0.8063707426190376, "learning_rate": 9.998108010061664e-06, "loss": -0.0358, "step": 2058, "step_time": 2.020544418002828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 71.5, "completions/mean_terminated_length": 71.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10454994905740023, "epoch": 0.02059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006236638291738927, "kl": 0.6247969195246696, "learning_rate": 9.998106138351988e-06, "loss": 0.0014, "num_tokens": 17835330.0, "reward": 0.7450000047683716, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7450000047683716, "rewards/rollout_reward_func/std": 0.19249796867370605, "sampling/importance_sampling_ratio/max": 0.9970296025276184, "sampling/importance_sampling_ratio/mean": 0.9917866587638855, "sampling/importance_sampling_ratio/min": 0.9874348044395447, "sampling/sampling_logp_difference/max": 0.008302232250571251, "sampling/sampling_logp_difference/mean": 0.002439289353787899, "step": 2059, "step_time": 3.7779976859965245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10672359727323055, "epoch": 0.0206, "grad_norm": 0.0006400766433216631, "kl": 0.6243889555335045, "learning_rate": 9.99810426571718e-06, "loss": 0.0014, "step": 2060, "step_time": 2.871480377005355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.08837037719786167, "epoch": 0.02061, "frac_reward_zero_std": 1.0, "grad_norm": 0.000734448607545346, "kl": 0.4400871768593788, "learning_rate": 9.998102392157243e-06, "loss": 0.0012, "num_tokens": 17850098.0, "reward": 0.8528846502304077, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8528846502304077, "rewards/rollout_reward_func/std": 0.25040704011917114, "sampling/importance_sampling_ratio/max": 1.0002731084823608, "sampling/importance_sampling_ratio/mean": 0.9936667680740356, "sampling/importance_sampling_ratio/min": 0.9860177636146545, "sampling/sampling_logp_difference/max": 0.009554766118526459, "sampling/sampling_logp_difference/mean": 0.001573447254486382, "step": 2061, "step_time": 4.022183237000718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08924141619354486, "epoch": 0.02062, "grad_norm": 0.0007583750993944705, "kl": 0.43989111483097076, "learning_rate": 9.998100517672172e-06, "loss": 0.0012, "step": 2062, "step_time": 2.0078960089813336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 172.29031372070312, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.004697410389781, "epoch": 0.02063, "frac_reward_zero_std": 0.25, "grad_norm": 0.021562205627560616, "kl": 0.5700930505990982, "learning_rate": 9.998098642261971e-06, "loss": -0.0839, "num_tokens": 17866278.0, "reward": 0.7636057734489441, "reward_std": 0.0675830990076065, "rewards/rollout_reward_func/mean": 0.7636057734489441, "rewards/rollout_reward_func/std": 0.2396579533815384, "sampling/importance_sampling_ratio/max": 1.0010402202606201, "sampling/importance_sampling_ratio/mean": 0.89968341588974, "sampling/importance_sampling_ratio/min": 3.656447259443171e-18, "sampling/sampling_logp_difference/max": 13.816587448120117, "sampling/sampling_logp_difference/mean": 0.3722294569015503, "step": 2063, "step_time": 4.608460197996465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.004976350814104, "epoch": 0.02064, "grad_norm": 0.021394575014710426, "kl": 0.5625350028276443, "learning_rate": 9.998096765926641e-06, "loss": -0.084, "step": 2064, "step_time": 2.0631880039873067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.29962029308080673, "epoch": 0.02065, "frac_reward_zero_std": 0.5, "grad_norm": 0.01824527233839035, "kl": 0.6589275822043419, "learning_rate": 9.99809488866618e-06, "loss": -0.0427, "num_tokens": 17882068.0, "reward": 0.7980961799621582, "reward_std": 0.04487408697605133, "rewards/rollout_reward_func/mean": 0.7980961799621582, "rewards/rollout_reward_func/std": 0.24156031012535095, "sampling/importance_sampling_ratio/max": 0.9977760314941406, "sampling/importance_sampling_ratio/mean": 0.9342567920684814, "sampling/importance_sampling_ratio/min": 0.003370226826518774, "sampling/sampling_logp_difference/max": 2.4004323482513428, "sampling/sampling_logp_difference/mean": 0.05957920104265213, "step": 2065, "step_time": 4.176227755997388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2970843445509672, "epoch": 0.02066, "grad_norm": 0.01762242428958416, "kl": 0.6400690898299217, "learning_rate": 9.998093010480592e-06, "loss": -0.0428, "step": 2066, "step_time": 2.95292381800391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5285808136686683, "epoch": 0.02067, "frac_reward_zero_std": 0.25, "grad_norm": 0.013808299787342548, "kl": 0.5806933492422104, "learning_rate": 9.998091131369874e-06, "loss": -0.0745, "num_tokens": 17897804.0, "reward": 0.56641685962677, "reward_std": 0.04744822531938553, "rewards/rollout_reward_func/mean": 0.56641685962677, "rewards/rollout_reward_func/std": 0.17655764520168304, "sampling/importance_sampling_ratio/max": 0.999790370464325, "sampling/importance_sampling_ratio/mean": 0.9027185440063477, "sampling/importance_sampling_ratio/min": 0.002358619589358568, "sampling/sampling_logp_difference/max": 2.4033749103546143, "sampling/sampling_logp_difference/mean": 0.09328452497720718, "step": 2067, "step_time": 4.324893519988109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5245036771520972, "epoch": 0.02068, "grad_norm": 0.013659550808370113, "kl": 0.5753176547586918, "learning_rate": 9.998089251334028e-06, "loss": -0.0746, "step": 2068, "step_time": 2.0673771030051284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06253395648673177, "epoch": 0.02069, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005081359413452446, "kl": 0.47622155770659447, "learning_rate": 9.998087370373053e-06, "loss": 0.0015, "num_tokens": 17913412.0, "reward": 0.5600000023841858, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5600000023841858, "rewards/rollout_reward_func/std": 0.17384284734725952, "sampling/importance_sampling_ratio/max": 0.9998313784599304, "sampling/importance_sampling_ratio/mean": 0.9961425065994263, "sampling/importance_sampling_ratio/min": 0.9908889532089233, "sampling/sampling_logp_difference/max": 0.005163948982954025, "sampling/sampling_logp_difference/mean": 0.0009609215776436031, "step": 2069, "step_time": 3.95539173000725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06083063222467899, "epoch": 0.0207, "grad_norm": 0.0004959927755407989, "kl": 0.4764731712639332, "learning_rate": 9.998085488486952e-06, "loss": 0.0015, "step": 2070, "step_time": 2.0163730530039174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.11211074655875564, "epoch": 0.02071, "frac_reward_zero_std": 0.75, "grad_norm": 0.026142267510294914, "kl": 0.5216513052582741, "learning_rate": 9.998083605675725e-06, "loss": -0.0226, "num_tokens": 17930348.0, "reward": 0.9464423060417175, "reward_std": 0.031275875866413116, "rewards/rollout_reward_func/mean": 0.9464423060417175, "rewards/rollout_reward_func/std": 0.23438765108585358, "sampling/importance_sampling_ratio/max": 0.9988266229629517, "sampling/importance_sampling_ratio/mean": 0.9681910872459412, "sampling/importance_sampling_ratio/min": 0.14267078042030334, "sampling/sampling_logp_difference/max": 2.0035929679870605, "sampling/sampling_logp_difference/mean": 0.0117725171148777, "step": 2071, "step_time": 4.110524570991402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11142881261184812, "epoch": 0.02072, "grad_norm": 0.023735156282782555, "kl": 0.5294286794960499, "learning_rate": 9.998081721939372e-06, "loss": -0.0227, "step": 2072, "step_time": 2.9220916789927287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12682384392246604, "epoch": 0.02073, "frac_reward_zero_std": 0.75, "grad_norm": 0.009782589972019196, "kl": 0.6291946060955524, "learning_rate": 9.998079837277892e-06, "loss": -0.0339, "num_tokens": 17946402.0, "reward": 0.7331297993659973, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.7331297993659973, "rewards/rollout_reward_func/std": 0.3434097468852997, "sampling/importance_sampling_ratio/max": 0.9973335266113281, "sampling/importance_sampling_ratio/mean": 0.9661292433738708, "sampling/importance_sampling_ratio/min": 0.08754010498523712, "sampling/sampling_logp_difference/max": 2.432210922241211, "sampling/sampling_logp_difference/mean": 0.014041068963706493, "step": 2073, "step_time": 4.153775965998648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12424401473253965, "epoch": 0.02074, "grad_norm": 0.00695059634745121, "kl": 0.645664606243372, "learning_rate": 9.998077951691287e-06, "loss": -0.0339, "step": 2074, "step_time": 2.0212477869936265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 100.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06638488359749317, "epoch": 0.02075, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042653069249354303, "kl": 0.5067601799964905, "learning_rate": 9.998076065179556e-06, "loss": 0.0013, "num_tokens": 17960338.0, "reward": 0.6557692289352417, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6557692289352417, "rewards/rollout_reward_func/std": 0.16865454614162445, "sampling/importance_sampling_ratio/max": 1.0017402172088623, "sampling/importance_sampling_ratio/mean": 0.9955376386642456, "sampling/importance_sampling_ratio/min": 0.9888383746147156, "sampling/sampling_logp_difference/max": 0.006466001272201538, "sampling/sampling_logp_difference/mean": 0.001299805473536253, "step": 2075, "step_time": 3.612352723997901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06511298567056656, "epoch": 0.02076, "grad_norm": 0.00041256178519688547, "kl": 0.5069545991718769, "learning_rate": 9.9980741777427e-06, "loss": 0.0013, "step": 2076, "step_time": 1.9715489820155199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5600271774455905, "epoch": 0.02077, "frac_reward_zero_std": 0.5, "grad_norm": 0.008071601390838623, "kl": 0.5681700892746449, "learning_rate": 9.998072289380722e-06, "loss": -0.0463, "num_tokens": 17975244.0, "reward": 0.7425961494445801, "reward_std": 0.03399552404880524, "rewards/rollout_reward_func/mean": 0.7425961494445801, "rewards/rollout_reward_func/std": 0.31214162707328796, "sampling/importance_sampling_ratio/max": 0.998857319355011, "sampling/importance_sampling_ratio/mean": 0.9334147572517395, "sampling/importance_sampling_ratio/min": 8.907964001991786e-06, "sampling/sampling_logp_difference/max": 4.065715789794922, "sampling/sampling_logp_difference/mean": 0.14572380483150482, "step": 2077, "step_time": 4.572432305991242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5572231360711157, "epoch": 0.02078, "grad_norm": 0.008558719418942928, "kl": 0.5764067582786083, "learning_rate": 9.998070400093619e-06, "loss": -0.0463, "step": 2078, "step_time": 2.487539408975863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 1.0399211263284087, "epoch": 0.02079, "frac_reward_zero_std": 0.25, "grad_norm": 0.021036680787801743, "kl": 0.5942639335989952, "learning_rate": 9.998068509881394e-06, "loss": -0.0309, "num_tokens": 17991370.0, "reward": 0.4073110520839691, "reward_std": 0.07901912927627563, "rewards/rollout_reward_func/mean": 0.4073110520839691, "rewards/rollout_reward_func/std": 0.12910491228103638, "sampling/importance_sampling_ratio/max": 0.9995120763778687, "sampling/importance_sampling_ratio/mean": 0.8407565355300903, "sampling/importance_sampling_ratio/min": 8.529401384294033e-06, "sampling/sampling_logp_difference/max": 3.5778188705444336, "sampling/sampling_logp_difference/mean": 0.20726078748703003, "step": 2079, "step_time": 4.491245940000226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.042010935023427, "epoch": 0.0208, "grad_norm": 0.022524533793330193, "kl": 0.601774625480175, "learning_rate": 9.998066618744046e-06, "loss": -0.0308, "step": 2080, "step_time": 2.0471037270035595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1858837422914803, "epoch": 0.02081, "frac_reward_zero_std": 0.75, "grad_norm": 0.009556370787322521, "kl": 0.5076614990830421, "learning_rate": 9.998064726681576e-06, "loss": -0.027, "num_tokens": 18007060.0, "reward": 0.757514476776123, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.757514476776123, "rewards/rollout_reward_func/std": 0.2790406048297882, "sampling/importance_sampling_ratio/max": 1.00094735622406, "sampling/importance_sampling_ratio/mean": 0.9676240682601929, "sampling/importance_sampling_ratio/min": 0.008148975670337677, "sampling/sampling_logp_difference/max": 1.8105074167251587, "sampling/sampling_logp_difference/mean": 0.03006618842482567, "step": 2081, "step_time": 4.146775508001156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18453888315707445, "epoch": 0.02082, "grad_norm": 0.01007822621613741, "kl": 0.5115626305341721, "learning_rate": 9.998062833693984e-06, "loss": -0.027, "step": 2082, "step_time": 2.0257213579970994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 204.32257080078125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.717309219762683, "epoch": 0.02083, "frac_reward_zero_std": 0.0, "grad_norm": 0.02880498208105564, "kl": 0.6551084741950035, "learning_rate": 9.998060939781272e-06, "loss": -0.0832, "num_tokens": 18024276.0, "reward": 0.7767689228057861, "reward_std": 0.28151223063468933, "rewards/rollout_reward_func/mean": 0.7767689228057861, "rewards/rollout_reward_func/std": 0.47341999411582947, "sampling/importance_sampling_ratio/max": 0.9996844530105591, "sampling/importance_sampling_ratio/mean": 0.7795206308364868, "sampling/importance_sampling_ratio/min": 1.123308273317291e-18, "sampling/sampling_logp_difference/max": 2.848123073577881, "sampling/sampling_logp_difference/mean": 0.44592201709747314, "step": 2083, "step_time": 5.429034234992287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.705790746025741, "epoch": 0.02084, "grad_norm": 0.028936276212334633, "kl": 0.6484981514513493, "learning_rate": 9.998059044943437e-06, "loss": -0.0832, "step": 2084, "step_time": 2.54767687900312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05420458223670721, "epoch": 0.02085, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042633796692825854, "kl": 0.449591476470232, "learning_rate": 9.998057149180485e-06, "loss": 0.0014, "num_tokens": 18039764.0, "reward": 0.8657692670822144, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8657692670822144, "rewards/rollout_reward_func/std": 0.30993402004241943, "sampling/importance_sampling_ratio/max": 0.9992330074310303, "sampling/importance_sampling_ratio/mean": 0.9968668818473816, "sampling/importance_sampling_ratio/min": 0.9940512776374817, "sampling/sampling_logp_difference/max": 0.005357261747121811, "sampling/sampling_logp_difference/mean": 0.0009150863625109196, "step": 2085, "step_time": 4.211528643994825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.055422271601855755, "epoch": 0.02086, "grad_norm": 0.00044068112038075924, "kl": 0.4493585675954819, "learning_rate": 9.99805525249241e-06, "loss": 0.0014, "step": 2086, "step_time": 1.975507536000805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 151.46875, "completions/mean_terminated_length": 151.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.45798971224576235, "epoch": 0.02087, "frac_reward_zero_std": 0.5, "grad_norm": 0.011531059630215168, "kl": 0.6975345015525818, "learning_rate": 9.998053354879217e-06, "loss": -0.0458, "num_tokens": 18055379.0, "reward": 0.9428220987319946, "reward_std": 0.03966597467660904, "rewards/rollout_reward_func/mean": 0.9428220987319946, "rewards/rollout_reward_func/std": 0.297292023897171, "sampling/importance_sampling_ratio/max": 1.0012860298156738, "sampling/importance_sampling_ratio/mean": 0.9327247142791748, "sampling/importance_sampling_ratio/min": 0.0003177695907652378, "sampling/sampling_logp_difference/max": 2.806884527206421, "sampling/sampling_logp_difference/mean": 0.09131258726119995, "step": 2087, "step_time": 4.398340224011918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45975648425519466, "epoch": 0.02088, "grad_norm": 0.011829213239252567, "kl": 0.6930651888251305, "learning_rate": 9.998051456340905e-06, "loss": -0.0459, "step": 2088, "step_time": 2.0463858939911006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 147.40625, "completions/mean_terminated_length": 147.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6630811151117086, "epoch": 0.02089, "frac_reward_zero_std": 0.25, "grad_norm": 0.018137607723474503, "kl": 0.5673931650817394, "learning_rate": 9.998049556877473e-06, "loss": -0.0087, "num_tokens": 18070776.0, "reward": 0.5935240387916565, "reward_std": 0.027645152062177658, "rewards/rollout_reward_func/mean": 0.5935240387916565, "rewards/rollout_reward_func/std": 0.28813067078590393, "sampling/importance_sampling_ratio/max": 0.9988685846328735, "sampling/importance_sampling_ratio/mean": 0.9031571745872498, "sampling/importance_sampling_ratio/min": 1.705640184646029e-09, "sampling/sampling_logp_difference/max": 12.179722785949707, "sampling/sampling_logp_difference/mean": 0.21587593853473663, "step": 2089, "step_time": 4.928383925995149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6604466969147325, "epoch": 0.0209, "grad_norm": 0.017441796138882637, "kl": 0.5647479295730591, "learning_rate": 9.998047656488927e-06, "loss": -0.0087, "step": 2090, "step_time": 2.0534518329877756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 76.9375, "completions/mean_terminated_length": 76.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.29612221755087376, "epoch": 0.02091, "frac_reward_zero_std": 0.75, "grad_norm": 0.015481125563383102, "kl": 0.717082642018795, "learning_rate": 9.998045755175261e-06, "loss": 0.0107, "num_tokens": 18084150.0, "reward": 0.6709614992141724, "reward_std": 0.013598205521702766, "rewards/rollout_reward_func/mean": 0.6709614992141724, "rewards/rollout_reward_func/std": 0.13128681480884552, "sampling/importance_sampling_ratio/max": 0.9944159984588623, "sampling/importance_sampling_ratio/mean": 0.9604079723358154, "sampling/importance_sampling_ratio/min": 0.02097366563975811, "sampling/sampling_logp_difference/max": 1.9376211166381836, "sampling/sampling_logp_difference/mean": 0.043606821447610855, "step": 2091, "step_time": 3.862767802005692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29463633336126804, "epoch": 0.02092, "grad_norm": 0.015255448408424854, "kl": 0.7097436040639877, "learning_rate": 9.998043852936478e-06, "loss": 0.0106, "step": 2092, "step_time": 1.984546035004314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 118.1875, "completions/mean_terminated_length": 118.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7320263129658997, "epoch": 0.02093, "frac_reward_zero_std": 0.5, "grad_norm": 0.015257859602570534, "kl": 0.7325224727392197, "learning_rate": 9.998041949772579e-06, "loss": -0.0513, "num_tokens": 18098700.0, "reward": 0.7365384697914124, "reward_std": 0.06836910545825958, "rewards/rollout_reward_func/mean": 0.7365384697914124, "rewards/rollout_reward_func/std": 0.13192743062973022, "sampling/importance_sampling_ratio/max": 1.0044349431991577, "sampling/importance_sampling_ratio/mean": 0.9043188095092773, "sampling/importance_sampling_ratio/min": 5.995357423671521e-05, "sampling/sampling_logp_difference/max": 2.674020767211914, "sampling/sampling_logp_difference/mean": 0.11887489259243011, "step": 2093, "step_time": 3.985480772993469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.729431732557714, "epoch": 0.02094, "grad_norm": 0.014950561337172985, "kl": 0.7278607785701752, "learning_rate": 9.998040045683564e-06, "loss": -0.0513, "step": 2094, "step_time": 2.0155566889879992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 72.875, "completions/mean_terminated_length": 72.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3996029431000352, "epoch": 0.02095, "frac_reward_zero_std": 0.75, "grad_norm": 0.009493602439761162, "kl": 0.7653137594461441, "learning_rate": 9.998038140669432e-06, "loss": -0.0175, "num_tokens": 18111800.0, "reward": 0.6649038195610046, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.6649038195610046, "rewards/rollout_reward_func/std": 0.12902702391147614, "sampling/importance_sampling_ratio/max": 0.9985924363136292, "sampling/importance_sampling_ratio/mean": 0.9616349339485168, "sampling/importance_sampling_ratio/min": 0.0001412124838680029, "sampling/sampling_logp_difference/max": 2.3951196670532227, "sampling/sampling_logp_difference/mean": 0.07136157900094986, "step": 2095, "step_time": 4.130537857992749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3965750755742192, "epoch": 0.02096, "grad_norm": 0.008585656061768532, "kl": 0.7611921653151512, "learning_rate": 9.998036234730187e-06, "loss": -0.0175, "step": 2096, "step_time": 1.9799554029959836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23286421177908778, "epoch": 0.02097, "frac_reward_zero_std": 0.75, "grad_norm": 0.007156286854296923, "kl": 0.4441790208220482, "learning_rate": 9.998034327865826e-06, "loss": -0.0274, "num_tokens": 18127056.0, "reward": 0.7275000214576721, "reward_std": 0.035355344414711, "rewards/rollout_reward_func/mean": 0.7275000214576721, "rewards/rollout_reward_func/std": 0.31469443440437317, "sampling/importance_sampling_ratio/max": 0.999819815158844, "sampling/importance_sampling_ratio/mean": 0.966215968132019, "sampling/importance_sampling_ratio/min": 0.0008771215216256678, "sampling/sampling_logp_difference/max": 2.263015031814575, "sampling/sampling_logp_difference/mean": 0.03665246069431305, "step": 2097, "step_time": 4.165631651987496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2338147098198533, "epoch": 0.02098, "grad_norm": 0.007179343141615391, "kl": 0.4427422843873501, "learning_rate": 9.998032420076352e-06, "loss": -0.0274, "step": 2098, "step_time": 2.0342950540143647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 189.1875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.1786296684294939, "epoch": 0.02099, "frac_reward_zero_std": 0.75, "grad_norm": 0.0047045545652508736, "kl": 0.43950508534908295, "learning_rate": 9.998030511361763e-06, "loss": -0.027, "num_tokens": 18143790.0, "reward": 0.6609278917312622, "reward_std": 0.059383369982242584, "rewards/rollout_reward_func/mean": 0.6609278917312622, "rewards/rollout_reward_func/std": 0.28517672419548035, "sampling/importance_sampling_ratio/max": 1.0016125440597534, "sampling/importance_sampling_ratio/mean": 0.9659485220909119, "sampling/importance_sampling_ratio/min": 0.0064080581068992615, "sampling/sampling_logp_difference/max": 2.324134349822998, "sampling/sampling_logp_difference/mean": 0.02652641572058201, "step": 2099, "step_time": 4.474830136983655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17864755494520068, "epoch": 0.021, "grad_norm": 0.004838585387915373, "kl": 0.4336489140987396, "learning_rate": 9.998028601722061e-06, "loss": -0.027, "step": 2100, "step_time": 2.0311311049954384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3681870959699154, "epoch": 0.02101, "frac_reward_zero_std": 0.75, "grad_norm": 0.0037685397546738386, "kl": 0.4548536650836468, "learning_rate": 9.998026691157246e-06, "loss": -0.0273, "num_tokens": 18159980.0, "reward": 0.794335126876831, "reward_std": 0.020700551569461823, "rewards/rollout_reward_func/mean": 0.794335126876831, "rewards/rollout_reward_func/std": 0.21195350587368011, "sampling/importance_sampling_ratio/max": 0.9979271292686462, "sampling/importance_sampling_ratio/mean": 0.9636431336402893, "sampling/importance_sampling_ratio/min": 1.469518002292572e-13, "sampling/sampling_logp_difference/max": 2.791722297668457, "sampling/sampling_logp_difference/mean": 0.11747792363166809, "step": 2101, "step_time": 5.57758878498862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36774356942623854, "epoch": 0.02102, "grad_norm": 0.0036627312656491995, "kl": 0.4547470211982727, "learning_rate": 9.99802477966732e-06, "loss": -0.0273, "step": 2102, "step_time": 2.0569906200107653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9023124910891056, "epoch": 0.02103, "frac_reward_zero_std": 0.25, "grad_norm": 0.0180094912648201, "kl": 0.806548248976469, "learning_rate": 9.998022867252281e-06, "loss": -0.0474, "num_tokens": 18174792.0, "reward": 0.9149807691574097, "reward_std": 0.06507229804992676, "rewards/rollout_reward_func/mean": 0.9149807691574097, "rewards/rollout_reward_func/std": 0.35825350880622864, "sampling/importance_sampling_ratio/max": 0.9998131990432739, "sampling/importance_sampling_ratio/mean": 0.8441684246063232, "sampling/importance_sampling_ratio/min": 0.01611434295773506, "sampling/sampling_logp_difference/max": 1.9372941255569458, "sampling/sampling_logp_difference/mean": 0.12242502719163895, "step": 2103, "step_time": 4.121532651995949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9034425765275955, "epoch": 0.02104, "grad_norm": 0.017896950244903564, "kl": 0.7819539196789265, "learning_rate": 9.998020953912132e-06, "loss": -0.0474, "step": 2104, "step_time": 2.0101370600023074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 198.8125, "completions/mean_terminated_length": 198.8125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8397965938784182, "epoch": 0.02105, "frac_reward_zero_std": 0.75, "grad_norm": 0.005793415475636721, "kl": 0.5015184655785561, "learning_rate": 9.99801903964687e-06, "loss": -0.022, "num_tokens": 18191890.0, "reward": 0.6535803079605103, "reward_std": 0.029973091557621956, "rewards/rollout_reward_func/mean": 0.6535803079605103, "rewards/rollout_reward_func/std": 0.29748085141181946, "sampling/importance_sampling_ratio/max": 1.000502109527588, "sampling/importance_sampling_ratio/mean": 0.9011030197143555, "sampling/importance_sampling_ratio/min": 1.2187461563767984e-15, "sampling/sampling_logp_difference/max": 3.975900173187256, "sampling/sampling_logp_difference/mean": 0.25049519538879395, "step": 2105, "step_time": 4.5925352230187855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8425291581079364, "epoch": 0.02106, "grad_norm": 0.005903972312808037, "kl": 0.5017192512750626, "learning_rate": 9.998017124456499e-06, "loss": -0.022, "step": 2106, "step_time": 2.1101620079934946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7287995973601937, "epoch": 0.02107, "frac_reward_zero_std": 0.5, "grad_norm": 0.016222290694713593, "kl": 0.6314798668026924, "learning_rate": 9.998015208341019e-06, "loss": -0.0492, "num_tokens": 18207576.0, "reward": 0.8088798522949219, "reward_std": 0.19545793533325195, "rewards/rollout_reward_func/mean": 0.8088798522949219, "rewards/rollout_reward_func/std": 0.5026455521583557, "sampling/importance_sampling_ratio/max": 1.0003936290740967, "sampling/importance_sampling_ratio/mean": 0.9038979411125183, "sampling/importance_sampling_ratio/min": 1.1853182435572762e-08, "sampling/sampling_logp_difference/max": 3.8340210914611816, "sampling/sampling_logp_difference/mean": 0.17413698136806488, "step": 2107, "step_time": 5.0811209619860165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7317663584835827, "epoch": 0.02108, "grad_norm": 0.016119951382279396, "kl": 0.6334175653755665, "learning_rate": 9.998013291300428e-06, "loss": -0.0492, "step": 2108, "step_time": 2.0438269239966758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.65625, "completions/mean_terminated_length": 165.65625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3145845653489232, "epoch": 0.02109, "frac_reward_zero_std": 0.5, "grad_norm": 0.018379803746938705, "kl": 0.5125892348587513, "learning_rate": 9.998011373334728e-06, "loss": -0.0, "num_tokens": 18223589.0, "reward": 0.6631875038146973, "reward_std": 0.13738267123699188, "rewards/rollout_reward_func/mean": 0.6631875038146973, "rewards/rollout_reward_func/std": 0.3989023268222809, "sampling/importance_sampling_ratio/max": 1.000088095664978, "sampling/importance_sampling_ratio/mean": 0.9359854459762573, "sampling/importance_sampling_ratio/min": 0.01665833219885826, "sampling/sampling_logp_difference/max": 2.2801737785339355, "sampling/sampling_logp_difference/mean": 0.04433947056531906, "step": 2109, "step_time": 4.19090254400362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3125025602057576, "epoch": 0.0211, "grad_norm": 0.01757660321891308, "kl": 0.5164447985589504, "learning_rate": 9.99800945444392e-06, "loss": 0.0, "step": 2110, "step_time": 2.0247023709962377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.28125, "completions/mean_terminated_length": 148.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.416870123706758, "epoch": 0.02111, "frac_reward_zero_std": 0.75, "grad_norm": 0.025612208992242813, "kl": 0.505827285349369, "learning_rate": 9.998007534628005e-06, "loss": 0.026, "num_tokens": 18239046.0, "reward": 0.8564615249633789, "reward_std": 0.005341271869838238, "rewards/rollout_reward_func/mean": 0.8564615249633789, "rewards/rollout_reward_func/std": 0.31665530800819397, "sampling/importance_sampling_ratio/max": 0.9995669722557068, "sampling/importance_sampling_ratio/mean": 0.9348701238632202, "sampling/importance_sampling_ratio/min": 0.002869111020117998, "sampling/sampling_logp_difference/max": 2.166830539703369, "sampling/sampling_logp_difference/mean": 0.06814413517713547, "step": 2111, "step_time": 4.104144627999631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41933539882302284, "epoch": 0.02112, "grad_norm": 0.02627578377723694, "kl": 0.503666315227747, "learning_rate": 9.998005613886981e-06, "loss": 0.0259, "step": 2112, "step_time": 2.930759611997928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 95.0625, "completions/mean_terminated_length": 95.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.47232748847454786, "epoch": 0.02113, "frac_reward_zero_std": 0.75, "grad_norm": 0.007888191379606724, "kl": 0.731624461710453, "learning_rate": 9.99800369222085e-06, "loss": -0.0352, "num_tokens": 18252856.0, "reward": 0.811346173286438, "reward_std": 0.005341271869838238, "rewards/rollout_reward_func/mean": 0.811346173286438, "rewards/rollout_reward_func/std": 0.1524130403995514, "sampling/importance_sampling_ratio/max": 0.9962849020957947, "sampling/importance_sampling_ratio/mean": 0.9268316030502319, "sampling/importance_sampling_ratio/min": 0.01599862612783909, "sampling/sampling_logp_difference/max": 1.7577829360961914, "sampling/sampling_logp_difference/mean": 0.06018412858247757, "step": 2113, "step_time": 3.863142513015191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4829005980864167, "epoch": 0.02114, "grad_norm": 0.007867773994803429, "kl": 0.7301993519067764, "learning_rate": 9.998001769629614e-06, "loss": -0.0352, "step": 2114, "step_time": 1.9918167919895495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 240.8125, "completions/mean_terminated_length": 240.8125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2624386940151453, "epoch": 0.02115, "frac_reward_zero_std": 0.5, "grad_norm": 0.027893640100955963, "kl": 0.40996691957116127, "learning_rate": 9.99799984611327e-06, "loss": -0.0075, "num_tokens": 18271242.0, "reward": 0.7653413414955139, "reward_std": 0.016086677089333534, "rewards/rollout_reward_func/mean": 0.7653413414955139, "rewards/rollout_reward_func/std": 0.3975871503353119, "sampling/importance_sampling_ratio/max": 0.9979316592216492, "sampling/importance_sampling_ratio/mean": 0.9331713914871216, "sampling/importance_sampling_ratio/min": 0.060933977365493774, "sampling/sampling_logp_difference/max": 1.3692376613616943, "sampling/sampling_logp_difference/mean": 0.02852814458310604, "step": 2115, "step_time": 4.605125299007341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26162221748381853, "epoch": 0.02116, "grad_norm": 0.014272088184952736, "kl": 0.40709784254431725, "learning_rate": 9.997997921671822e-06, "loss": -0.0077, "step": 2116, "step_time": 2.036013343014929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 123.03125, "completions/mean_terminated_length": 123.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9142884351313114, "epoch": 0.02117, "frac_reward_zero_std": 0.25, "grad_norm": 0.013273150660097599, "kl": 0.6057617664337158, "learning_rate": 9.997995996305268e-06, "loss": -0.0651, "num_tokens": 18285891.0, "reward": 0.7750529050827026, "reward_std": 0.03879568725824356, "rewards/rollout_reward_func/mean": 0.7750529050827026, "rewards/rollout_reward_func/std": 0.39185860753059387, "sampling/importance_sampling_ratio/max": 0.9987285137176514, "sampling/importance_sampling_ratio/mean": 0.9009724259376526, "sampling/importance_sampling_ratio/min": 5.745149495028912e-11, "sampling/sampling_logp_difference/max": 14.549850463867188, "sampling/sampling_logp_difference/mean": 0.297865629196167, "step": 2117, "step_time": 4.30390646700107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9133615987375379, "epoch": 0.02118, "grad_norm": 0.013834621757268906, "kl": 0.6049893349409103, "learning_rate": 9.997994070013607e-06, "loss": -0.0651, "step": 2118, "step_time": 2.9739674080046825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.17233351059257984, "epoch": 0.02119, "frac_reward_zero_std": 0.75, "grad_norm": 0.023324385285377502, "kl": 0.41994521394371986, "learning_rate": 9.997992142796844e-06, "loss": -0.0253, "num_tokens": 18304919.0, "reward": 0.6895769238471985, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.6895769238471985, "rewards/rollout_reward_func/std": 0.4536972641944885, "sampling/importance_sampling_ratio/max": 0.9967824816703796, "sampling/importance_sampling_ratio/mean": 0.9604089260101318, "sampling/importance_sampling_ratio/min": 0.0387306734919548, "sampling/sampling_logp_difference/max": 1.3801804780960083, "sampling/sampling_logp_difference/mean": 0.015890900045633316, "step": 2119, "step_time": 4.525212831009412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17241576313972473, "epoch": 0.0212, "grad_norm": 0.024690186604857445, "kl": 0.41902996972203255, "learning_rate": 9.997990214654979e-06, "loss": -0.0253, "step": 2120, "step_time": 2.0572428189989296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.8910796288400888, "epoch": 0.02121, "frac_reward_zero_std": 0.25, "grad_norm": 1.1103852987289429, "kl": 0.4929877631366253, "learning_rate": 9.997988285588009e-06, "loss": -0.0453, "num_tokens": 18320521.0, "reward": 0.7149759531021118, "reward_std": 0.08014816045761108, "rewards/rollout_reward_func/mean": 0.7149759531021118, "rewards/rollout_reward_func/std": 0.27257755398750305, "sampling/importance_sampling_ratio/max": 1.1633400917053223, "sampling/importance_sampling_ratio/mean": 0.8083305358886719, "sampling/importance_sampling_ratio/min": 0.0001424151996616274, "sampling/sampling_logp_difference/max": 1.9355652332305908, "sampling/sampling_logp_difference/mean": 0.16203439235687256, "step": 2121, "step_time": 4.2231346710104845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0416666679084301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.9386666771024466, "epoch": 0.02122, "grad_norm": 0.02378648892045021, "kl": 0.635967992246151, "learning_rate": 9.997986355595936e-06, "loss": -0.0468, "step": 2122, "step_time": 2.0306167920134612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 155.15625, "completions/mean_terminated_length": 155.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8623375948518515, "epoch": 0.02123, "frac_reward_zero_std": 0.5, "grad_norm": 0.016300123184919357, "kl": 0.6178883388638496, "learning_rate": 9.99798442467876e-06, "loss": -0.0454, "num_tokens": 18336166.0, "reward": 0.7243269085884094, "reward_std": 0.09110798686742783, "rewards/rollout_reward_func/mean": 0.7243269085884094, "rewards/rollout_reward_func/std": 0.1614760160446167, "sampling/importance_sampling_ratio/max": 0.996417760848999, "sampling/importance_sampling_ratio/mean": 0.9254234433174133, "sampling/importance_sampling_ratio/min": 2.225979106144837e-24, "sampling/sampling_logp_difference/max": 3.233153820037842, "sampling/sampling_logp_difference/mean": 0.3123527467250824, "step": 2123, "step_time": 4.404735977004748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8597658751532435, "epoch": 0.02124, "grad_norm": 0.015924135223031044, "kl": 0.6197264119982719, "learning_rate": 9.997982492836483e-06, "loss": -0.0454, "step": 2124, "step_time": 2.9648951049966854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 227.03125, "completions/mean_terminated_length": 227.03125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.695730356965214, "epoch": 0.02125, "frac_reward_zero_std": 0.25, "grad_norm": 0.04675353690981865, "kl": 0.5803769156336784, "learning_rate": 9.997980560069105e-06, "loss": -0.0839, "num_tokens": 18354223.0, "reward": 0.9244213700294495, "reward_std": 0.07330766320228577, "rewards/rollout_reward_func/mean": 0.9244213700294495, "rewards/rollout_reward_func/std": 0.33160582184791565, "sampling/importance_sampling_ratio/max": 0.9991254806518555, "sampling/importance_sampling_ratio/mean": 0.8569343090057373, "sampling/importance_sampling_ratio/min": 3.1599790872860467e-07, "sampling/sampling_logp_difference/max": 3.0633544921875, "sampling/sampling_logp_difference/mean": 0.15457496047019958, "step": 2125, "step_time": 4.684789471990371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6959936642087996, "epoch": 0.02126, "grad_norm": 0.04207571968436241, "kl": 0.6054247580468655, "learning_rate": 9.997978626376626e-06, "loss": -0.084, "step": 2126, "step_time": 2.1034672880050493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 171.59375, "completions/mean_terminated_length": 171.59375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4721325123682618, "epoch": 0.02127, "frac_reward_zero_std": 0.75, "grad_norm": 0.013980885036289692, "kl": 0.4947870299220085, "learning_rate": 9.997976691759046e-06, "loss": -0.036, "num_tokens": 18370506.0, "reward": 0.526846170425415, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.526846170425415, "rewards/rollout_reward_func/std": 0.14144474267959595, "sampling/importance_sampling_ratio/max": 1.0021038055419922, "sampling/importance_sampling_ratio/mean": 0.9572482109069824, "sampling/importance_sampling_ratio/min": 5.301885687281617e-18, "sampling/sampling_logp_difference/max": 3.997081756591797, "sampling/sampling_logp_difference/mean": 0.1846034824848175, "step": 2127, "step_time": 4.323106274983729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.468731552362442, "epoch": 0.02128, "grad_norm": 0.012989168986678123, "kl": 0.49665237963199615, "learning_rate": 9.997974756216367e-06, "loss": -0.036, "step": 2128, "step_time": 2.0506259010144277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 104.59375, "completions/mean_terminated_length": 104.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5308155603706837, "epoch": 0.02129, "frac_reward_zero_std": 0.5, "grad_norm": 0.018335826694965363, "kl": 0.9064496085047722, "learning_rate": 9.997972819748588e-06, "loss": -0.0381, "num_tokens": 18384653.0, "reward": 0.7611187696456909, "reward_std": 0.46112358570098877, "rewards/rollout_reward_func/mean": 0.7611187696456909, "rewards/rollout_reward_func/std": 0.6577132940292358, "sampling/importance_sampling_ratio/max": 0.9984188675880432, "sampling/importance_sampling_ratio/mean": 0.9001962542533875, "sampling/importance_sampling_ratio/min": 2.750744076962519e-11, "sampling/sampling_logp_difference/max": 12.667986869812012, "sampling/sampling_logp_difference/mean": 0.26923027634620667, "step": 2129, "step_time": 5.018968702985148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5259637264534831, "epoch": 0.0213, "grad_norm": 0.018867896869778633, "kl": 0.9145990088582039, "learning_rate": 9.997970882355708e-06, "loss": -0.0381, "step": 2130, "step_time": 2.0515759589980007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 99.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.48937705252319574, "epoch": 0.02131, "frac_reward_zero_std": 0.75, "grad_norm": 0.009431547485291958, "kl": 0.7822433933615685, "learning_rate": 9.997968944037731e-06, "loss": -0.0174, "num_tokens": 18398645.0, "reward": 0.6915096044540405, "reward_std": 0.04283435270190239, "rewards/rollout_reward_func/mean": 0.6915096044540405, "rewards/rollout_reward_func/std": 0.19180673360824585, "sampling/importance_sampling_ratio/max": 0.9976825714111328, "sampling/importance_sampling_ratio/mean": 0.9625705480575562, "sampling/importance_sampling_ratio/min": 1.2581634345476544e-10, "sampling/sampling_logp_difference/max": 3.603635549545288, "sampling/sampling_logp_difference/mean": 0.11239704489707947, "step": 2131, "step_time": 4.1725892549875425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4900996321812272, "epoch": 0.02132, "grad_norm": 0.009553170762956142, "kl": 0.7892937287688255, "learning_rate": 9.997967004794658e-06, "loss": -0.0174, "step": 2132, "step_time": 1.9897769809904275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30715654604136944, "epoch": 0.02133, "frac_reward_zero_std": 0.5, "grad_norm": 0.04965599998831749, "kl": 0.5697341412305832, "learning_rate": 9.997965064626485e-06, "loss": -0.0013, "num_tokens": 18415335.0, "reward": 0.6541249752044678, "reward_std": 0.025549007579684258, "rewards/rollout_reward_func/mean": 0.6541249752044678, "rewards/rollout_reward_func/std": 0.1885710209608078, "sampling/importance_sampling_ratio/max": 0.9987753629684448, "sampling/importance_sampling_ratio/mean": 0.9076824188232422, "sampling/importance_sampling_ratio/min": 0.011305044405162334, "sampling/sampling_logp_difference/max": 2.1735289096832275, "sampling/sampling_logp_difference/mean": 0.05024303123354912, "step": 2133, "step_time": 4.442254595989652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30827145325019956, "epoch": 0.02134, "grad_norm": 0.04960895702242851, "kl": 0.5685723349452019, "learning_rate": 9.997963123533217e-06, "loss": -0.0013, "step": 2134, "step_time": 2.0280770570025197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 122.84375, "completions/mean_terminated_length": 122.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24371469486504793, "epoch": 0.02135, "frac_reward_zero_std": 0.75, "grad_norm": 0.008115808479487896, "kl": 0.4901970326900482, "learning_rate": 9.99796118151485e-06, "loss": -0.0177, "num_tokens": 18430002.0, "reward": 0.7226442098617554, "reward_std": 0.0088388342410326, "rewards/rollout_reward_func/mean": 0.7226442098617554, "rewards/rollout_reward_func/std": 0.181561678647995, "sampling/importance_sampling_ratio/max": 0.9988165497779846, "sampling/importance_sampling_ratio/mean": 0.9651327133178711, "sampling/importance_sampling_ratio/min": 7.333740281723067e-10, "sampling/sampling_logp_difference/max": 14.813167572021484, "sampling/sampling_logp_difference/mean": 0.1589536815881729, "step": 2135, "step_time": 4.698095072002616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24432497844099998, "epoch": 0.02136, "grad_norm": 0.007493454031646252, "kl": 0.49091361463069916, "learning_rate": 9.997959238571388e-06, "loss": -0.0178, "step": 2136, "step_time": 1.9611208050118876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 149.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.33812881307676435, "epoch": 0.02137, "frac_reward_zero_std": 0.75, "grad_norm": 0.0649198368191719, "kl": 0.9372542835772038, "learning_rate": 9.99795729470283e-06, "loss": -0.0164, "num_tokens": 18445592.0, "reward": 0.7532884478569031, "reward_std": 0.039434801787137985, "rewards/rollout_reward_func/mean": 0.7532884478569031, "rewards/rollout_reward_func/std": 0.4329347610473633, "sampling/importance_sampling_ratio/max": 1.0001786947250366, "sampling/importance_sampling_ratio/mean": 0.9651963710784912, "sampling/importance_sampling_ratio/min": 0.0002888580202125013, "sampling/sampling_logp_difference/max": 1.9726943969726562, "sampling/sampling_logp_difference/mean": 0.06251990050077438, "step": 2137, "step_time": 4.074989255997934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.335285859182477, "epoch": 0.02138, "grad_norm": 0.060066040605306625, "kl": 0.9072459787130356, "learning_rate": 9.997955349909178e-06, "loss": -0.0165, "step": 2138, "step_time": 2.013464430012391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.49303422775119543, "epoch": 0.02139, "frac_reward_zero_std": 0.5, "grad_norm": 0.026780180633068085, "kl": 0.5530777275562286, "learning_rate": 9.99795340419043e-06, "loss": 0.0015, "num_tokens": 18461990.0, "reward": 0.6972788572311401, "reward_std": 0.016807381063699722, "rewards/rollout_reward_func/mean": 0.6972788572311401, "rewards/rollout_reward_func/std": 0.35309305787086487, "sampling/importance_sampling_ratio/max": 1.0020827054977417, "sampling/importance_sampling_ratio/mean": 0.9317350387573242, "sampling/importance_sampling_ratio/min": 0.0007285245810635388, "sampling/sampling_logp_difference/max": 1.930769681930542, "sampling/sampling_logp_difference/mean": 0.06878668069839478, "step": 2139, "step_time": 4.270368869998492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4947118950076401, "epoch": 0.0214, "grad_norm": 0.031488530337810516, "kl": 0.5365692265331745, "learning_rate": 9.997951457546589e-06, "loss": 0.0014, "step": 2140, "step_time": 2.0323754609926254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 95.25, "completions/mean_terminated_length": 95.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23951810505241156, "epoch": 0.02141, "frac_reward_zero_std": 0.75, "grad_norm": 0.004290909040719271, "kl": 0.6933132335543633, "learning_rate": 9.997949509977653e-06, "loss": -0.0269, "num_tokens": 18475806.0, "reward": 0.7929567098617554, "reward_std": 0.03134387731552124, "rewards/rollout_reward_func/mean": 0.7929567098617554, "rewards/rollout_reward_func/std": 0.24264183640480042, "sampling/importance_sampling_ratio/max": 0.9974516034126282, "sampling/importance_sampling_ratio/mean": 0.962732195854187, "sampling/importance_sampling_ratio/min": 0.016167564317584038, "sampling/sampling_logp_difference/max": 1.7624964714050293, "sampling/sampling_logp_difference/mean": 0.029159219935536385, "step": 2141, "step_time": 4.302660736007965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24033996649086475, "epoch": 0.02142, "grad_norm": 0.004521651659160852, "kl": 0.6917627900838852, "learning_rate": 9.997947561483624e-06, "loss": -0.0269, "step": 2142, "step_time": 2.0029429879868985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07476417068392038, "epoch": 0.02143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006027087802067399, "kl": 0.6306965574622154, "learning_rate": 9.997945612064503e-06, "loss": 0.0018, "num_tokens": 18490870.0, "reward": 0.656653881072998, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.656653881072998, "rewards/rollout_reward_func/std": 0.18457230925559998, "sampling/importance_sampling_ratio/max": 0.9973080158233643, "sampling/importance_sampling_ratio/mean": 0.9934629201889038, "sampling/importance_sampling_ratio/min": 0.9877778887748718, "sampling/sampling_logp_difference/max": 0.006262913346290588, "sampling/sampling_logp_difference/mean": 0.0015253927558660507, "step": 2143, "step_time": 4.002398255986918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07635694276541471, "epoch": 0.02144, "grad_norm": 0.0006131899426691234, "kl": 0.6304228082299232, "learning_rate": 9.99794366172029e-06, "loss": 0.0018, "step": 2144, "step_time": 1.9665566209951066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.29378744028508663, "epoch": 0.02145, "frac_reward_zero_std": 0.5, "grad_norm": 0.12561358511447906, "kl": 0.4747798629105091, "learning_rate": 9.997941710450985e-06, "loss": 0.0641, "num_tokens": 18506928.0, "reward": 0.22046634554862976, "reward_std": 0.1674347072839737, "rewards/rollout_reward_func/mean": 0.22046634554862976, "rewards/rollout_reward_func/std": 0.6642486453056335, "sampling/importance_sampling_ratio/max": 1.0051052570343018, "sampling/importance_sampling_ratio/mean": 0.9324842691421509, "sampling/importance_sampling_ratio/min": 0.03774654120206833, "sampling/sampling_logp_difference/max": 1.3500312566757202, "sampling/sampling_logp_difference/mean": 0.03675181418657303, "step": 2145, "step_time": 4.2124861080083065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30850822757929564, "epoch": 0.02146, "grad_norm": 0.15820007026195526, "kl": 0.4684906341135502, "learning_rate": 9.997939758256587e-06, "loss": 0.0639, "step": 2146, "step_time": 2.063940864005417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06591013632714748, "epoch": 0.02147, "frac_reward_zero_std": 1.0, "grad_norm": 0.00211162818595767, "kl": 0.37491941824555397, "learning_rate": 9.9979378051371e-06, "loss": 0.0012, "num_tokens": 18522520.0, "reward": 0.7343461513519287, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7343461513519287, "rewards/rollout_reward_func/std": 0.2680402100086212, "sampling/importance_sampling_ratio/max": 1.0101597309112549, "sampling/importance_sampling_ratio/mean": 0.9938451051712036, "sampling/importance_sampling_ratio/min": 0.9791121482849121, "sampling/sampling_logp_difference/max": 0.017171408981084824, "sampling/sampling_logp_difference/mean": 0.001616191235370934, "step": 2147, "step_time": 4.989833900013764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06767601193860173, "epoch": 0.02148, "grad_norm": 0.0008037083898670971, "kl": 0.37477413564920425, "learning_rate": 9.997935851092522e-06, "loss": 0.0012, "step": 2148, "step_time": 2.04189909100387 }, { "clip_ratio/high_max": 0.032366071827709675, "clip_ratio/high_mean": 0.016183035913854837, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025111607741564512, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6830636551603675, "epoch": 0.02149, "frac_reward_zero_std": 0.5, "grad_norm": 0.8410930037498474, "kl": 0.4150110185146332, "learning_rate": 9.997933896122855e-06, "loss": -0.0194, "num_tokens": 18538628.0, "reward": 0.8776153922080994, "reward_std": 0.2127804309129715, "rewards/rollout_reward_func/mean": 0.8776153922080994, "rewards/rollout_reward_func/std": 0.4886697232723236, "sampling/importance_sampling_ratio/max": 1.7221860885620117, "sampling/importance_sampling_ratio/mean": 0.8999637365341187, "sampling/importance_sampling_ratio/min": 8.462739060632885e-05, "sampling/sampling_logp_difference/max": 2.0489449501037598, "sampling/sampling_logp_difference/mean": 0.11641515791416168, "step": 2149, "step_time": 4.390397662005853 }, { "clip_ratio/high_max": 0.05691964365541935, "clip_ratio/high_mean": 0.028459821827709675, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03738839365541935, "entropy": 0.6731843370944262, "epoch": 0.0215, "grad_norm": 0.03220629319548607, "kl": 0.4738313816487789, "learning_rate": 9.997931940228099e-06, "loss": -0.0199, "step": 2150, "step_time": 2.069108256997424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 171.15625, "completions/mean_terminated_length": 171.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5350386090576649, "epoch": 0.02151, "frac_reward_zero_std": 0.5, "grad_norm": 0.020493565127253532, "kl": 0.45663607120513916, "learning_rate": 9.997929983408253e-06, "loss": -0.0622, "num_tokens": 18554785.0, "reward": 0.8617451190948486, "reward_std": 0.051426783204078674, "rewards/rollout_reward_func/mean": 0.8617451190948486, "rewards/rollout_reward_func/std": 0.2726392447948456, "sampling/importance_sampling_ratio/max": 0.9988201260566711, "sampling/importance_sampling_ratio/mean": 0.9006462097167969, "sampling/importance_sampling_ratio/min": 0.001951449317857623, "sampling/sampling_logp_difference/max": 1.9547544717788696, "sampling/sampling_logp_difference/mean": 0.08392572402954102, "step": 2151, "step_time": 4.164396251988364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5383190447464585, "epoch": 0.02152, "grad_norm": 0.02110281214118004, "kl": 0.4577166959643364, "learning_rate": 9.997928025663317e-06, "loss": -0.0622, "step": 2152, "step_time": 2.0325501930201426 }, { "clip_ratio/high_max": 0.017361111473292112, "clip_ratio/high_mean": 0.008680555736646056, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "completions/clipped_ratio": 0.03125, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 234.59375, "completions/mean_terminated_length": 232.61289978027344, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 1.2370631340891123, "epoch": 0.02153, "frac_reward_zero_std": 0.5, "grad_norm": 0.935941219329834, "kl": 0.47884154692292213, "learning_rate": 9.997926066993295e-06, "loss": 0.0799, "num_tokens": 18573028.0, "reward": 0.5069268941879272, "reward_std": 0.04189957678318024, "rewards/rollout_reward_func/mean": 0.5069268941879272, "rewards/rollout_reward_func/std": 0.2428198903799057, "sampling/importance_sampling_ratio/max": 1.1182224750518799, "sampling/importance_sampling_ratio/mean": 0.8306789398193359, "sampling/importance_sampling_ratio/min": 1.7269206812585944e-17, "sampling/sampling_logp_difference/max": 4.3629560470581055, "sampling/sampling_logp_difference/mean": 0.404634952545166, "step": 2153, "step_time": 5.8879156460025115 }, { "clip_ratio/high_max": 0.05753968423232436, "clip_ratio/high_mean": 0.02876984211616218, "clip_ratio/low_mean": 0.05277777882292867, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0815476207062602, "entropy": 1.4802257977426052, "epoch": 0.02154, "grad_norm": 0.06382620334625244, "kl": 0.419119194149971, "learning_rate": 9.997924107398184e-06, "loss": 0.0753, "step": 2154, "step_time": 2.0587364499879186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.441295868717134, "epoch": 0.02155, "frac_reward_zero_std": 0.5, "grad_norm": 0.0434948094189167, "kl": 0.49327436462044716, "learning_rate": 9.997922146877988e-06, "loss": -0.0467, "num_tokens": 18589180.0, "reward": 0.886798083782196, "reward_std": 0.06112106889486313, "rewards/rollout_reward_func/mean": 0.886798083782196, "rewards/rollout_reward_func/std": 0.1911461353302002, "sampling/importance_sampling_ratio/max": 1.0034725666046143, "sampling/importance_sampling_ratio/mean": 0.907742977142334, "sampling/importance_sampling_ratio/min": 0.007591044995933771, "sampling/sampling_logp_difference/max": 1.7464208602905273, "sampling/sampling_logp_difference/mean": 0.06063636392354965, "step": 2155, "step_time": 4.1931925660028355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44316999707370996, "epoch": 0.02156, "grad_norm": 0.04369041323661804, "kl": 0.4938007481396198, "learning_rate": 9.997920185432706e-06, "loss": -0.0467, "step": 2156, "step_time": 2.025238634989364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24766928888857365, "epoch": 0.02157, "frac_reward_zero_std": 0.5, "grad_norm": 0.04018288850784302, "kl": 0.6840798482298851, "learning_rate": 9.997918223062336e-06, "loss": -0.0516, "num_tokens": 18605466.0, "reward": 1.0464422702789307, "reward_std": 0.11025427281856537, "rewards/rollout_reward_func/mean": 1.0464422702789307, "rewards/rollout_reward_func/std": 0.30185553431510925, "sampling/importance_sampling_ratio/max": 0.9977784752845764, "sampling/importance_sampling_ratio/mean": 0.9364506006240845, "sampling/importance_sampling_ratio/min": 0.005139641463756561, "sampling/sampling_logp_difference/max": 2.7576470375061035, "sampling/sampling_logp_difference/mean": 0.0394476093351841, "step": 2157, "step_time": 4.155306132997794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2481841817498207, "epoch": 0.02158, "grad_norm": 0.03966199606657028, "kl": 0.6716327294707298, "learning_rate": 9.997916259766882e-06, "loss": -0.0516, "step": 2158, "step_time": 2.0539158579995274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 102.34375, "completions/mean_terminated_length": 102.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6105226771906018, "epoch": 0.02159, "frac_reward_zero_std": 0.5, "grad_norm": 0.05869205668568611, "kl": 0.6001306809484959, "learning_rate": 9.997914295546342e-06, "loss": -0.0469, "num_tokens": 18619621.0, "reward": 0.5711057782173157, "reward_std": 0.08390093594789505, "rewards/rollout_reward_func/mean": 0.5711057782173157, "rewards/rollout_reward_func/std": 0.18966732919216156, "sampling/importance_sampling_ratio/max": 1.0222638845443726, "sampling/importance_sampling_ratio/mean": 0.9340566396713257, "sampling/importance_sampling_ratio/min": 3.0358699495275365e-12, "sampling/sampling_logp_difference/max": 17.570877075195312, "sampling/sampling_logp_difference/mean": 0.23841089010238647, "step": 2159, "step_time": 4.7487931519935955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6076934076845646, "epoch": 0.0216, "grad_norm": 0.052301276475191116, "kl": 0.6026075184345245, "learning_rate": 9.997912330400718e-06, "loss": -0.0469, "step": 2160, "step_time": 2.024451063007291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 144.28125, "completions/mean_terminated_length": 144.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7079078108072281, "epoch": 0.02161, "frac_reward_zero_std": 0.5, "grad_norm": 0.013464194722473621, "kl": 0.5931146517395973, "learning_rate": 9.99791036433001e-06, "loss": -0.0462, "num_tokens": 18635038.0, "reward": 0.7434327006340027, "reward_std": 0.21580354869365692, "rewards/rollout_reward_func/mean": 0.7434327006340027, "rewards/rollout_reward_func/std": 0.3731875419616699, "sampling/importance_sampling_ratio/max": 1.000072956085205, "sampling/importance_sampling_ratio/mean": 0.9294583797454834, "sampling/importance_sampling_ratio/min": 8.594225618666962e-19, "sampling/sampling_logp_difference/max": 2.9958319664001465, "sampling/sampling_logp_difference/mean": 0.30790290236473083, "step": 2161, "step_time": 4.215265790000558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.706215113401413, "epoch": 0.02162, "grad_norm": 0.013100079260766506, "kl": 0.5951892882585526, "learning_rate": 9.997908397334217e-06, "loss": -0.0462, "step": 2162, "step_time": 2.0344237800090923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 115.15625, "completions/mean_terminated_length": 115.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5540314139798284, "epoch": 0.02163, "frac_reward_zero_std": 0.5, "grad_norm": 0.11130953580141068, "kl": 0.7321433275938034, "learning_rate": 9.997906429413344e-06, "loss": 0.016, "num_tokens": 18649435.0, "reward": 0.6174519062042236, "reward_std": 0.030415449291467667, "rewards/rollout_reward_func/mean": 0.6174519062042236, "rewards/rollout_reward_func/std": 0.19936205446720123, "sampling/importance_sampling_ratio/max": 0.997331440448761, "sampling/importance_sampling_ratio/mean": 0.9095829725265503, "sampling/importance_sampling_ratio/min": 0.00051178561989218, "sampling/sampling_logp_difference/max": 1.760692834854126, "sampling/sampling_logp_difference/mean": 0.09707902371883392, "step": 2163, "step_time": 3.975065110003925 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5555534912273288, "epoch": 0.02164, "grad_norm": 0.09693819284439087, "kl": 0.7134453654289246, "learning_rate": 9.997904460567386e-06, "loss": 0.0157, "step": 2164, "step_time": 2.4017747170073562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 196.71875, "completions/mean_terminated_length": 196.71875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3738248022273183, "epoch": 0.02165, "frac_reward_zero_std": 0.5, "grad_norm": 0.027356956154108047, "kl": 0.5724681280553341, "learning_rate": 9.997902490796346e-06, "loss": -0.0585, "num_tokens": 18666522.0, "reward": 0.9369962215423584, "reward_std": 0.4195130169391632, "rewards/rollout_reward_func/mean": 0.9369962215423584, "rewards/rollout_reward_func/std": 0.646065354347229, "sampling/importance_sampling_ratio/max": 0.9991722702980042, "sampling/importance_sampling_ratio/mean": 0.901347279548645, "sampling/importance_sampling_ratio/min": 0.002597649348899722, "sampling/sampling_logp_difference/max": 2.4561965465545654, "sampling/sampling_logp_difference/mean": 0.06292963027954102, "step": 2165, "step_time": 4.914203154017741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37552123237401247, "epoch": 0.02166, "grad_norm": 0.042207278311252594, "kl": 0.5591920875012875, "learning_rate": 9.997900520100225e-06, "loss": -0.0584, "step": 2166, "step_time": 2.028299474994128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 216.29031372070312, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5098713478073478, "epoch": 0.02167, "frac_reward_zero_std": 0.5, "grad_norm": 0.02368519827723503, "kl": 0.4926907941699028, "learning_rate": 9.997898548479022e-06, "loss": -0.0532, "num_tokens": 18684010.0, "reward": 1.0548028945922852, "reward_std": 0.11184525489807129, "rewards/rollout_reward_func/mean": 1.0548028945922852, "rewards/rollout_reward_func/std": 0.3131660223007202, "sampling/importance_sampling_ratio/max": 0.9998674392700195, "sampling/importance_sampling_ratio/mean": 0.933838963508606, "sampling/importance_sampling_ratio/min": 8.425651379707483e-19, "sampling/sampling_logp_difference/max": 3.3328068256378174, "sampling/sampling_logp_difference/mean": 0.2087131291627884, "step": 2167, "step_time": 4.320069923989649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5106141688302159, "epoch": 0.02168, "grad_norm": 0.025819936767220497, "kl": 0.48539961501955986, "learning_rate": 9.99789657593274e-06, "loss": -0.0532, "step": 2168, "step_time": 2.04526008200628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 197.03125, "completions/mean_terminated_length": 197.03125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.304803061299026, "epoch": 0.02169, "frac_reward_zero_std": 0.75, "grad_norm": 0.00521484762430191, "kl": 0.4142812192440033, "learning_rate": 9.997894602461376e-06, "loss": -0.0175, "num_tokens": 18701139.0, "reward": 1.1078269481658936, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 1.1078269481658936, "rewards/rollout_reward_func/std": 0.2746107280254364, "sampling/importance_sampling_ratio/max": 0.9993510246276855, "sampling/importance_sampling_ratio/mean": 0.9622795581817627, "sampling/importance_sampling_ratio/min": 0.0026934368070214987, "sampling/sampling_logp_difference/max": 1.66495943069458, "sampling/sampling_logp_difference/mean": 0.031397588551044464, "step": 2169, "step_time": 4.246347421001701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3029925851151347, "epoch": 0.0217, "grad_norm": 0.004907118156552315, "kl": 0.4140806496143341, "learning_rate": 9.997892628064933e-06, "loss": -0.0175, "step": 2170, "step_time": 2.9676222860216512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5679268287494779, "epoch": 0.02171, "frac_reward_zero_std": 0.5, "grad_norm": 0.05313602834939957, "kl": 0.648182712495327, "learning_rate": 9.99789065274341e-06, "loss": -0.0273, "num_tokens": 18717611.0, "reward": 0.6163173317909241, "reward_std": 0.08056218177080154, "rewards/rollout_reward_func/mean": 0.6163173317909241, "rewards/rollout_reward_func/std": 0.1769939512014389, "sampling/importance_sampling_ratio/max": 0.9997811913490295, "sampling/importance_sampling_ratio/mean": 0.9124812483787537, "sampling/importance_sampling_ratio/min": 0.00018271202861797065, "sampling/sampling_logp_difference/max": 1.4171860218048096, "sampling/sampling_logp_difference/mean": 0.08862250298261642, "step": 2171, "step_time": 4.404414189019008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5698388768360019, "epoch": 0.02172, "grad_norm": 0.04754510894417763, "kl": 0.6773395538330078, "learning_rate": 9.997888676496809e-06, "loss": -0.0275, "step": 2172, "step_time": 2.0709821060008835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7803858807310462, "epoch": 0.02173, "frac_reward_zero_std": 0.5, "grad_norm": 0.01842605136334896, "kl": 0.5199745856225491, "learning_rate": 9.99788669932513e-06, "loss": -0.0605, "num_tokens": 18733297.0, "reward": 0.6239471435546875, "reward_std": 0.03316355496644974, "rewards/rollout_reward_func/mean": 0.6239471435546875, "rewards/rollout_reward_func/std": 0.11358509212732315, "sampling/importance_sampling_ratio/max": 0.9980875849723816, "sampling/importance_sampling_ratio/mean": 0.9001208543777466, "sampling/importance_sampling_ratio/min": 4.292373705538921e-05, "sampling/sampling_logp_difference/max": 1.8714802265167236, "sampling/sampling_logp_difference/mean": 0.12774917483329773, "step": 2173, "step_time": 4.299619600998994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7781965350732207, "epoch": 0.02174, "grad_norm": 0.017709139734506607, "kl": 0.520308144390583, "learning_rate": 9.997884721228374e-06, "loss": -0.0606, "step": 2174, "step_time": 2.0380934749919106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.510227226652205, "epoch": 0.02175, "frac_reward_zero_std": 0.5, "grad_norm": 0.6159825325012207, "kl": 0.41197729855775833, "learning_rate": 9.997882742206542e-06, "loss": -0.0328, "num_tokens": 18748725.0, "reward": 0.6876922845840454, "reward_std": 0.03247956186532974, "rewards/rollout_reward_func/mean": 0.6876922845840454, "rewards/rollout_reward_func/std": 0.18962998688220978, "sampling/importance_sampling_ratio/max": 1.0256508588790894, "sampling/importance_sampling_ratio/mean": 0.8753008842468262, "sampling/importance_sampling_ratio/min": 0.08753906190395355, "sampling/sampling_logp_difference/max": 1.28790283203125, "sampling/sampling_logp_difference/mean": 0.06452873349189758, "step": 2175, "step_time": 4.157097038005304 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.03437500074505806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044791667722165585, "entropy": 0.5280160056427121, "epoch": 0.02176, "grad_norm": 0.0369502454996109, "kl": 0.4918517842888832, "learning_rate": 9.99788076225963e-06, "loss": -0.0339, "step": 2176, "step_time": 2.973915037000552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4795310478657484, "epoch": 0.02177, "frac_reward_zero_std": 0.25, "grad_norm": 0.08260238170623779, "kl": 0.513121135532856, "learning_rate": 9.997878781387643e-06, "loss": -0.0749, "num_tokens": 18765988.0, "reward": 0.47213461995124817, "reward_std": 0.2919137477874756, "rewards/rollout_reward_func/mean": 0.47213461995124817, "rewards/rollout_reward_func/std": 0.44529515504837036, "sampling/importance_sampling_ratio/max": 0.998814582824707, "sampling/importance_sampling_ratio/mean": 0.8802996873855591, "sampling/importance_sampling_ratio/min": 0.0068071200512349606, "sampling/sampling_logp_difference/max": 2.1560001373291016, "sampling/sampling_logp_difference/mean": 0.06775546073913574, "step": 2177, "step_time": 4.599277484994673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4733777940273285, "epoch": 0.02178, "grad_norm": 0.07777491956949234, "kl": 0.5219933539628983, "learning_rate": 9.99787679959058e-06, "loss": -0.075, "step": 2178, "step_time": 2.062337003000721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06057431735098362, "epoch": 0.02179, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048803805839270353, "kl": 0.5197743400931358, "learning_rate": 9.997874816868442e-06, "loss": 0.0016, "num_tokens": 18781292.0, "reward": 0.7400000095367432, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7400000095367432, "rewards/rollout_reward_func/std": 0.22417236864566803, "sampling/importance_sampling_ratio/max": 0.9994819164276123, "sampling/importance_sampling_ratio/mean": 0.9957799911499023, "sampling/importance_sampling_ratio/min": 0.991288423538208, "sampling/sampling_logp_difference/max": 0.0039620716124773026, "sampling/sampling_logp_difference/mean": 0.0011217821156606078, "step": 2179, "step_time": 4.034965746017406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057319674640893936, "epoch": 0.0218, "grad_norm": 0.000449582003057003, "kl": 0.5203205645084381, "learning_rate": 9.99787283322123e-06, "loss": 0.0016, "step": 2180, "step_time": 2.0118256920031854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 173.78125, "completions/mean_terminated_length": 173.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.726385266520083, "epoch": 0.02181, "frac_reward_zero_std": 0.5, "grad_norm": 0.01355728693306446, "kl": 0.587707657366991, "learning_rate": 9.997870848648943e-06, "loss": -0.0587, "num_tokens": 18797533.0, "reward": 0.59900963306427, "reward_std": 0.031749177724123, "rewards/rollout_reward_func/mean": 0.59900963306427, "rewards/rollout_reward_func/std": 0.1769610494375229, "sampling/importance_sampling_ratio/max": 0.9999406933784485, "sampling/importance_sampling_ratio/mean": 0.9036847352981567, "sampling/importance_sampling_ratio/min": 0.0001580086100148037, "sampling/sampling_logp_difference/max": 3.1608710289001465, "sampling/sampling_logp_difference/mean": 0.13374032080173492, "step": 2181, "step_time": 4.958101546006219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7289654342457652, "epoch": 0.02182, "grad_norm": 0.013636650517582893, "kl": 0.5856464505195618, "learning_rate": 9.997868863151581e-06, "loss": -0.0587, "step": 2182, "step_time": 2.5136022599981516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06764364428818226, "epoch": 0.02183, "frac_reward_zero_std": 1.0, "grad_norm": 0.00031926206429488957, "kl": 0.6362173222005367, "learning_rate": 9.997866876729148e-06, "loss": 0.0014, "num_tokens": 18811477.0, "reward": 0.7162692546844482, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7162692546844482, "rewards/rollout_reward_func/std": 0.13117749989032745, "sampling/importance_sampling_ratio/max": 1.0006046295166016, "sampling/importance_sampling_ratio/mean": 0.9953267574310303, "sampling/importance_sampling_ratio/min": 0.9917575716972351, "sampling/sampling_logp_difference/max": 0.004537524655461311, "sampling/sampling_logp_difference/mean": 0.0014156957622617483, "step": 2183, "step_time": 3.9929164119967027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06549865147098899, "epoch": 0.02184, "grad_norm": 0.00030606251675635576, "kl": 0.6365372315049171, "learning_rate": 9.99786488938164e-06, "loss": 0.0014, "step": 2184, "step_time": 2.052457980986219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.04982416890561581, "epoch": 0.02185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008555646054446697, "kl": 0.44016632065176964, "learning_rate": 9.997862901109061e-06, "loss": 0.0015, "num_tokens": 18827741.0, "reward": 0.8208845853805542, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8208845853805542, "rewards/rollout_reward_func/std": 0.28756198287010193, "sampling/importance_sampling_ratio/max": 1.0191278457641602, "sampling/importance_sampling_ratio/mean": 1.0025795698165894, "sampling/importance_sampling_ratio/min": 0.9958406686782837, "sampling/sampling_logp_difference/max": 0.021311912685632706, "sampling/sampling_logp_difference/mean": 0.0014427623245865107, "step": 2185, "step_time": 4.043664844000887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04970923578366637, "epoch": 0.02186, "grad_norm": 0.001037198700942099, "kl": 0.4400707818567753, "learning_rate": 9.99786091191141e-06, "loss": 0.0015, "step": 2186, "step_time": 2.008449053981167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.043491574469953775, "epoch": 0.02187, "frac_reward_zero_std": 1.0, "grad_norm": 0.00033463200088590384, "kl": 0.40340762957930565, "learning_rate": 9.997858921788687e-06, "loss": 0.0014, "num_tokens": 18843973.0, "reward": 0.6049230694770813, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6049230694770813, "rewards/rollout_reward_func/std": 0.30566835403442383, "sampling/importance_sampling_ratio/max": 0.9991534948348999, "sampling/importance_sampling_ratio/mean": 0.9971563816070557, "sampling/importance_sampling_ratio/min": 0.9954184293746948, "sampling/sampling_logp_difference/max": 0.0024926308542490005, "sampling/sampling_logp_difference/mean": 0.0006848014891147614, "step": 2187, "step_time": 4.855864336997911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04260836262255907, "epoch": 0.02188, "grad_norm": 0.0003287111467216164, "kl": 0.40353601053357124, "learning_rate": 9.997856930740893e-06, "loss": 0.0014, "step": 2188, "step_time": 2.442254749017593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 97.0625, "completions/mean_terminated_length": 97.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.214102684520185, "epoch": 0.02189, "frac_reward_zero_std": 0.75, "grad_norm": 0.018410930410027504, "kl": 0.6121641285717487, "learning_rate": 9.99785493876803e-06, "loss": -0.0163, "num_tokens": 18857815.0, "reward": 0.59375, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.59375, "rewards/rollout_reward_func/std": 0.12281978875398636, "sampling/importance_sampling_ratio/max": 1.0000263452529907, "sampling/importance_sampling_ratio/mean": 0.9672701358795166, "sampling/importance_sampling_ratio/min": 0.06344637274742126, "sampling/sampling_logp_difference/max": 1.1950056552886963, "sampling/sampling_logp_difference/mean": 0.02265322022140026, "step": 2189, "step_time": 3.634011182002723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21966718276962638, "epoch": 0.0219, "grad_norm": 0.018287209793925285, "kl": 0.622573871165514, "learning_rate": 9.997852945870096e-06, "loss": -0.0164, "step": 2190, "step_time": 1.980649760007509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04676749184727669, "epoch": 0.02191, "frac_reward_zero_std": 1.0, "grad_norm": 0.00023786431120242923, "kl": 0.5745446011424065, "learning_rate": 9.997850952047092e-06, "loss": 0.0016, "num_tokens": 18873279.0, "reward": 0.7220385074615479, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7220385074615479, "rewards/rollout_reward_func/std": 0.09547410905361176, "sampling/importance_sampling_ratio/max": 0.9988546371459961, "sampling/importance_sampling_ratio/mean": 0.994580864906311, "sampling/importance_sampling_ratio/min": 0.9908012747764587, "sampling/sampling_logp_difference/max": 0.004669479560106993, "sampling/sampling_logp_difference/mean": 0.0012520963791757822, "step": 2191, "step_time": 4.295669280996663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04661473771557212, "epoch": 0.02192, "grad_norm": 0.00023607659386470914, "kl": 0.5746190845966339, "learning_rate": 9.997848957299021e-06, "loss": 0.0016, "step": 2192, "step_time": 2.0128044740049518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 152.65625, "completions/mean_terminated_length": 152.65625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2941071204841137, "epoch": 0.02193, "frac_reward_zero_std": 0.75, "grad_norm": 0.010268725454807281, "kl": 0.527162455022335, "learning_rate": 9.99784696162588e-06, "loss": -0.027, "num_tokens": 18888964.0, "reward": 0.742644190788269, "reward_std": 0.12578341364860535, "rewards/rollout_reward_func/mean": 0.742644190788269, "rewards/rollout_reward_func/std": 0.4898274838924408, "sampling/importance_sampling_ratio/max": 0.9998424649238586, "sampling/importance_sampling_ratio/mean": 0.9659965634346008, "sampling/importance_sampling_ratio/min": 4.29113151767524e-06, "sampling/sampling_logp_difference/max": 2.2699921131134033, "sampling/sampling_logp_difference/mean": 0.07395610958337784, "step": 2193, "step_time": 4.623128658000496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29128987481817603, "epoch": 0.02194, "grad_norm": 0.011163854971528053, "kl": 0.5320193916559219, "learning_rate": 9.997844965027672e-06, "loss": -0.027, "step": 2194, "step_time": 2.486424198024906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.13320362055674195, "epoch": 0.02195, "frac_reward_zero_std": 1.0, "grad_norm": 0.03451891615986824, "kl": 0.4775402247905731, "learning_rate": 9.997842967504396e-06, "loss": 0.0018, "num_tokens": 18905808.0, "reward": 0.976769208908081, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.976769208908081, "rewards/rollout_reward_func/std": 0.32682672142982483, "sampling/importance_sampling_ratio/max": 0.9995866417884827, "sampling/importance_sampling_ratio/mean": 0.939667820930481, "sampling/importance_sampling_ratio/min": 3.794374536791878e-10, "sampling/sampling_logp_difference/max": 19.258790969848633, "sampling/sampling_logp_difference/mean": 0.12299390882253647, "step": 2195, "step_time": 4.370410450996133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13385829608887434, "epoch": 0.02196, "grad_norm": 0.025047430768609047, "kl": 0.46017859876155853, "learning_rate": 9.997840969056053e-06, "loss": 0.0017, "step": 2196, "step_time": 2.0595353440221515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27485814690589905, "epoch": 0.02197, "frac_reward_zero_std": 0.75, "grad_norm": 0.04078627750277519, "kl": 0.7732207961380482, "learning_rate": 9.997838969682644e-06, "loss": 0.0211, "num_tokens": 18921392.0, "reward": 0.6949999928474426, "reward_std": 0.024476774036884308, "rewards/rollout_reward_func/mean": 0.6949999928474426, "rewards/rollout_reward_func/std": 0.31138333678245544, "sampling/importance_sampling_ratio/max": 1.0003138780593872, "sampling/importance_sampling_ratio/mean": 0.9403994083404541, "sampling/importance_sampling_ratio/min": 0.020057568326592445, "sampling/sampling_logp_difference/max": 1.8399512767791748, "sampling/sampling_logp_difference/mean": 0.04553372040390968, "step": 2197, "step_time": 4.154765105013212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2721458678133786, "epoch": 0.02198, "grad_norm": 0.03858698904514313, "kl": 0.7571866698563099, "learning_rate": 9.99783696938417e-06, "loss": 0.0211, "step": 2198, "step_time": 2.0083751330093946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 121.15625, "completions/mean_terminated_length": 121.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.14718199148774147, "epoch": 0.02199, "frac_reward_zero_std": 0.75, "grad_norm": 0.031830038875341415, "kl": 0.650441151112318, "learning_rate": 9.99783496816063e-06, "loss": -0.0242, "num_tokens": 18936093.0, "reward": 0.623798131942749, "reward_std": 0.0023116914089769125, "rewards/rollout_reward_func/mean": 0.623798131942749, "rewards/rollout_reward_func/std": 0.1456574946641922, "sampling/importance_sampling_ratio/max": 0.9985935688018799, "sampling/importance_sampling_ratio/mean": 0.9692038893699646, "sampling/importance_sampling_ratio/min": 0.13025130331516266, "sampling/sampling_logp_difference/max": 1.3725271224975586, "sampling/sampling_logp_difference/mean": 0.013949594460427761, "step": 2199, "step_time": 4.3061227799989865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14781730202957988, "epoch": 0.022, "grad_norm": 0.031622759997844696, "kl": 0.6575022786855698, "learning_rate": 9.997832966012023e-06, "loss": -0.0242, "step": 2200, "step_time": 2.4381959999955143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.034633358009159565, "epoch": 0.02201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002526775933802128, "kl": 0.39355337992310524, "learning_rate": 9.997830962938354e-06, "loss": 0.0012, "num_tokens": 18951629.0, "reward": 0.9496154189109802, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9496154189109802, "rewards/rollout_reward_func/std": 0.33416882157325745, "sampling/importance_sampling_ratio/max": 0.9999051094055176, "sampling/importance_sampling_ratio/mean": 0.9984277486801147, "sampling/importance_sampling_ratio/min": 0.9961841106414795, "sampling/sampling_logp_difference/max": 0.001897469162940979, "sampling/sampling_logp_difference/mean": 0.0004915164317935705, "step": 2201, "step_time": 4.0771315779857105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.035089628770947456, "epoch": 0.02202, "grad_norm": 0.00025616789935156703, "kl": 0.3934968039393425, "learning_rate": 9.99782895893962e-06, "loss": 0.0012, "step": 2202, "step_time": 1.9367504169931635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05193088622763753, "epoch": 0.02203, "frac_reward_zero_std": 1.0, "grad_norm": 0.00022523065854329616, "kl": 0.5822612345218658, "learning_rate": 9.997826954015821e-06, "loss": 0.0014, "num_tokens": 18966477.0, "reward": 0.8593461513519287, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8593461513519287, "rewards/rollout_reward_func/std": 0.12930205464363098, "sampling/importance_sampling_ratio/max": 0.9973766803741455, "sampling/importance_sampling_ratio/mean": 0.9944077730178833, "sampling/importance_sampling_ratio/min": 0.9908503293991089, "sampling/sampling_logp_difference/max": 0.005565735977143049, "sampling/sampling_logp_difference/mean": 0.0012627174146473408, "step": 2203, "step_time": 4.206004832005419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.052168625174090266, "epoch": 0.02204, "grad_norm": 0.00022639699454884976, "kl": 0.5821964032948017, "learning_rate": 9.997824948166962e-06, "loss": 0.0014, "step": 2204, "step_time": 2.0301520209977753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.08359743840992451, "epoch": 0.02205, "frac_reward_zero_std": 0.75, "grad_norm": 0.025965068489313126, "kl": 0.6404882930219173, "learning_rate": 9.997822941393039e-06, "loss": -0.0218, "num_tokens": 18982957.0, "reward": 0.5176153779029846, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.5176153779029846, "rewards/rollout_reward_func/std": 0.2205822616815567, "sampling/importance_sampling_ratio/max": 0.9987776875495911, "sampling/importance_sampling_ratio/mean": 0.9711514711380005, "sampling/importance_sampling_ratio/min": 0.16859795153141022, "sampling/sampling_logp_difference/max": 1.7171710729599, "sampling/sampling_logp_difference/mean": 0.009899578988552094, "step": 2205, "step_time": 5.175156448989583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08401012234389782, "epoch": 0.02206, "grad_norm": 0.02516455203294754, "kl": 0.6275617480278015, "learning_rate": 9.997820933694053e-06, "loss": -0.0218, "step": 2206, "step_time": 2.008631010990939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 235.96875, "completions/mean_terminated_length": 235.96875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23451511515304446, "epoch": 0.02207, "frac_reward_zero_std": 0.75, "grad_norm": 0.004215640481561422, "kl": 0.48328840360045433, "learning_rate": 9.997818925070008e-06, "loss": -0.0363, "num_tokens": 19001188.0, "reward": 0.9079519510269165, "reward_std": 0.0524618923664093, "rewards/rollout_reward_func/mean": 0.9079519510269165, "rewards/rollout_reward_func/std": 0.5085728168487549, "sampling/importance_sampling_ratio/max": 1.002228021621704, "sampling/importance_sampling_ratio/mean": 0.9667271971702576, "sampling/importance_sampling_ratio/min": 0.016420138999819756, "sampling/sampling_logp_difference/max": 1.6983312368392944, "sampling/sampling_logp_difference/mean": 0.022982342168688774, "step": 2207, "step_time": 4.533378163003363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23722619004547596, "epoch": 0.02208, "grad_norm": 0.0049860975705087185, "kl": 0.49533815309405327, "learning_rate": 9.9978169155209e-06, "loss": -0.0363, "step": 2208, "step_time": 2.045657481008675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3778271214105189, "epoch": 0.02209, "frac_reward_zero_std": 0.75, "grad_norm": 0.0032268627546727657, "kl": 0.45791418850421906, "learning_rate": 9.997814905046733e-06, "loss": -0.0367, "num_tokens": 19018130.0, "reward": 0.8829600811004639, "reward_std": 0.011861721985042095, "rewards/rollout_reward_func/mean": 0.8829600811004639, "rewards/rollout_reward_func/std": 0.13457269966602325, "sampling/importance_sampling_ratio/max": 0.9972416162490845, "sampling/importance_sampling_ratio/mean": 0.9638160467147827, "sampling/importance_sampling_ratio/min": 3.56126792905491e-19, "sampling/sampling_logp_difference/max": 4.261346817016602, "sampling/sampling_logp_difference/mean": 0.1668010652065277, "step": 2209, "step_time": 4.539583618010511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3776953066699207, "epoch": 0.0221, "grad_norm": 0.0033043595030903816, "kl": 0.4575093686580658, "learning_rate": 9.997812893647505e-06, "loss": -0.0367, "step": 2210, "step_time": 2.0484531149995746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22174684796482325, "epoch": 0.02211, "frac_reward_zero_std": 0.75, "grad_norm": 0.027857018634676933, "kl": 0.566617276519537, "learning_rate": 9.997810881323218e-06, "loss": -0.0362, "num_tokens": 19034288.0, "reward": 0.95900958776474, "reward_std": 0.011830446310341358, "rewards/rollout_reward_func/mean": 0.95900958776474, "rewards/rollout_reward_func/std": 0.3078737258911133, "sampling/importance_sampling_ratio/max": 0.9980181455612183, "sampling/importance_sampling_ratio/mean": 0.9375489950180054, "sampling/importance_sampling_ratio/min": 0.0015849692281335592, "sampling/sampling_logp_difference/max": 2.6706409454345703, "sampling/sampling_logp_difference/mean": 0.046567682176828384, "step": 2211, "step_time": 5.151963958989654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22226070007309318, "epoch": 0.02212, "grad_norm": 0.02487664483487606, "kl": 0.5573696978390217, "learning_rate": 9.99780886807387e-06, "loss": -0.0362, "step": 2212, "step_time": 2.0550422289888957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 166.90625, "completions/mean_terminated_length": 166.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2275384240783751, "epoch": 0.02213, "frac_reward_zero_std": 0.75, "grad_norm": 0.008295335806906223, "kl": 0.6075294092297554, "learning_rate": 9.997806853899465e-06, "loss": -0.0265, "num_tokens": 19050341.0, "reward": 0.5304999947547913, "reward_std": 0.01631784811615944, "rewards/rollout_reward_func/mean": 0.5304999947547913, "rewards/rollout_reward_func/std": 0.15985135734081268, "sampling/importance_sampling_ratio/max": 0.9969595074653625, "sampling/importance_sampling_ratio/mean": 0.9622348546981812, "sampling/importance_sampling_ratio/min": 0.005850714631378651, "sampling/sampling_logp_difference/max": 1.8458354473114014, "sampling/sampling_logp_difference/mean": 0.024978699162602425, "step": 2213, "step_time": 4.293525386005058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22757628234103322, "epoch": 0.02214, "grad_norm": 0.009433464147150517, "kl": 0.6072637960314751, "learning_rate": 9.997804838800002e-06, "loss": -0.0265, "step": 2214, "step_time": 2.0028922480050824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8567314860410988, "epoch": 0.02215, "frac_reward_zero_std": 0.5, "grad_norm": 0.021945394575595856, "kl": 0.6321076638996601, "learning_rate": 9.997802822775482e-06, "loss": -0.0246, "num_tokens": 19066501.0, "reward": 0.6728076934814453, "reward_std": 0.04356593266129494, "rewards/rollout_reward_func/mean": 0.6728076934814453, "rewards/rollout_reward_func/std": 0.15652978420257568, "sampling/importance_sampling_ratio/max": 0.9982208609580994, "sampling/importance_sampling_ratio/mean": 0.9014322757720947, "sampling/importance_sampling_ratio/min": 3.823505029840017e-15, "sampling/sampling_logp_difference/max": 3.0368757247924805, "sampling/sampling_logp_difference/mean": 0.2639714479446411, "step": 2215, "step_time": 4.25701560498419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8535203272476792, "epoch": 0.02216, "grad_norm": 0.020376702770590782, "kl": 0.6215366721153259, "learning_rate": 9.997800805825903e-06, "loss": -0.0247, "step": 2216, "step_time": 2.0175466100045014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.1975312358699739, "epoch": 0.02217, "frac_reward_zero_std": 0.75, "grad_norm": 0.011516530066728592, "kl": 0.5456426851451397, "learning_rate": 9.99779878795127e-06, "loss": -0.0168, "num_tokens": 19081097.0, "reward": 0.8099038600921631, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.8099038600921631, "rewards/rollout_reward_func/std": 0.11825086176395416, "sampling/importance_sampling_ratio/max": 0.9979948997497559, "sampling/importance_sampling_ratio/mean": 0.9655033946037292, "sampling/importance_sampling_ratio/min": 0.03952275216579437, "sampling/sampling_logp_difference/max": 1.445374608039856, "sampling/sampling_logp_difference/mean": 0.02130828984081745, "step": 2217, "step_time": 4.509000412006571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1995006720535457, "epoch": 0.02218, "grad_norm": 0.011136316694319248, "kl": 0.5453093312680721, "learning_rate": 9.997796769151581e-06, "loss": -0.0168, "step": 2218, "step_time": 1.9928197800109047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5277885468676686, "epoch": 0.02219, "frac_reward_zero_std": 0.75, "grad_norm": 0.11230015754699707, "kl": 0.5548041835427284, "learning_rate": 9.997794749426835e-06, "loss": -0.0018, "num_tokens": 19096945.0, "reward": 0.7110673189163208, "reward_std": 0.11033578962087631, "rewards/rollout_reward_func/mean": 0.7110673189163208, "rewards/rollout_reward_func/std": 0.2688785195350647, "sampling/importance_sampling_ratio/max": 1.0559041500091553, "sampling/importance_sampling_ratio/mean": 0.8857808709144592, "sampling/importance_sampling_ratio/min": 0.007456107996404171, "sampling/sampling_logp_difference/max": 2.5786919593811035, "sampling/sampling_logp_difference/mean": 0.08645275235176086, "step": 2219, "step_time": 4.110979765013326 }, { "clip_ratio/high_max": 0.0848214291036129, "clip_ratio/high_mean": 0.04241071455180645, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04241071455180645, "entropy": 0.49054365418851376, "epoch": 0.0222, "grad_norm": 0.015217109583318233, "kl": 0.5251806154847145, "learning_rate": 9.997792728777033e-06, "loss": -0.0019, "step": 2220, "step_time": 2.063407544999791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1743872044607997, "epoch": 0.02221, "frac_reward_zero_std": 0.75, "grad_norm": 0.027387697249650955, "kl": 0.6493902318179607, "learning_rate": 9.997790707202179e-06, "loss": 0.0207, "num_tokens": 19113857.0, "reward": 1.0347307920455933, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 1.0347307920455933, "rewards/rollout_reward_func/std": 0.24500851333141327, "sampling/importance_sampling_ratio/max": 1.0027875900268555, "sampling/importance_sampling_ratio/mean": 0.9662917852401733, "sampling/importance_sampling_ratio/min": 0.027736064046621323, "sampling/sampling_logp_difference/max": 2.761486053466797, "sampling/sampling_logp_difference/mean": 0.019771868363022804, "step": 2221, "step_time": 4.07193337699573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17205748846754432, "epoch": 0.02222, "grad_norm": 0.026658184826374054, "kl": 0.6410986855626106, "learning_rate": 9.997788684702271e-06, "loss": 0.0207, "step": 2222, "step_time": 2.438184178005031 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.7131637800484896, "epoch": 0.02223, "frac_reward_zero_std": 0.5, "grad_norm": 0.5721143484115601, "kl": 0.531475618481636, "learning_rate": 9.997786661277308e-06, "loss": -0.0363, "num_tokens": 19131429.0, "reward": 0.7345000505447388, "reward_std": 0.15307974815368652, "rewards/rollout_reward_func/mean": 0.7345000505447388, "rewards/rollout_reward_func/std": 0.42331358790397644, "sampling/importance_sampling_ratio/max": 1.0630241632461548, "sampling/importance_sampling_ratio/mean": 0.7869354486465454, "sampling/importance_sampling_ratio/min": 2.374764153501019e-05, "sampling/sampling_logp_difference/max": 3.388381004333496, "sampling/sampling_logp_difference/mean": 0.11959022283554077, "step": 2223, "step_time": 4.9180537369975355 }, { "clip_ratio/high_max": 0.12105654925107956, "clip_ratio/high_mean": 0.08266369067132473, "clip_ratio/low_mean": 0.009114583488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09177827415987849, "entropy": 0.5707375910133123, "epoch": 0.02224, "grad_norm": 0.5581734776496887, "kl": 0.6786101795732975, "learning_rate": 9.997784636927293e-06, "loss": -0.0328, "step": 2224, "step_time": 2.0299645630002487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 148.09375, "completions/mean_terminated_length": 148.09375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23218457447364926, "epoch": 0.02225, "frac_reward_zero_std": 0.75, "grad_norm": 0.049553148448467255, "kl": 0.747404046356678, "learning_rate": 9.997782611652224e-06, "loss": 0.0315, "num_tokens": 19146992.0, "reward": 0.5763461589813232, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.5763461589813232, "rewards/rollout_reward_func/std": 0.17823795974254608, "sampling/importance_sampling_ratio/max": 0.9994532465934753, "sampling/importance_sampling_ratio/mean": 0.964666485786438, "sampling/importance_sampling_ratio/min": 2.3643724489375018e-06, "sampling/sampling_logp_difference/max": 4.099253177642822, "sampling/sampling_logp_difference/mean": 0.06657442450523376, "step": 2225, "step_time": 4.098438416993304 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 0.23420480405911803, "epoch": 0.02226, "grad_norm": 0.042094048112630844, "kl": 0.7077018469572067, "learning_rate": 9.997780585452103e-06, "loss": 0.0313, "step": 2226, "step_time": 1.993938193001668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17997603677213192, "epoch": 0.02227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050197080709040165, "kl": 0.5536950193345547, "learning_rate": 9.997778558326932e-06, "loss": 0.0014, "num_tokens": 19162296.0, "reward": 0.5835769176483154, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5835769176483154, "rewards/rollout_reward_func/std": 0.1896621435880661, "sampling/importance_sampling_ratio/max": 1.10406494140625, "sampling/importance_sampling_ratio/mean": 0.9375669956207275, "sampling/importance_sampling_ratio/min": 0.1623380184173584, "sampling/sampling_logp_difference/max": 0.9374497532844543, "sampling/sampling_logp_difference/mean": 0.024208253249526024, "step": 2227, "step_time": 4.272238024997932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18739630188792944, "epoch": 0.02228, "grad_norm": 0.0078096236102283, "kl": 0.554920144379139, "learning_rate": 9.997776530276708e-06, "loss": 0.0015, "step": 2228, "step_time": 2.4803288980037905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.06867624679580331, "epoch": 0.02229, "frac_reward_zero_std": 1.0, "grad_norm": 0.000536483945325017, "kl": 0.40319761261343956, "learning_rate": 9.997774501301435e-06, "loss": 0.0012, "num_tokens": 19177968.0, "reward": 0.8464615345001221, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8464615345001221, "rewards/rollout_reward_func/std": 0.3650369644165039, "sampling/importance_sampling_ratio/max": 1.0021175146102905, "sampling/importance_sampling_ratio/mean": 0.997718095779419, "sampling/importance_sampling_ratio/min": 0.9921145439147949, "sampling/sampling_logp_difference/max": 0.0037400061264634132, "sampling/sampling_logp_difference/mean": 0.0008700514445081353, "step": 2229, "step_time": 4.521437824987515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07096077920868993, "epoch": 0.0223, "grad_norm": 0.0005428827716968954, "kl": 0.4028518944978714, "learning_rate": 9.997772471401112e-06, "loss": 0.0012, "step": 2230, "step_time": 1.987867987001664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 175.53125, "completions/mean_terminated_length": 175.53125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.7098549976944923, "epoch": 0.02231, "frac_reward_zero_std": 0.5, "grad_norm": 0.14250895380973816, "kl": 0.6497371755540371, "learning_rate": 9.99777044057574e-06, "loss": -0.0073, "num_tokens": 19194377.0, "reward": 0.5520000457763672, "reward_std": 0.08291466534137726, "rewards/rollout_reward_func/mean": 0.5520000457763672, "rewards/rollout_reward_func/std": 0.1593303233385086, "sampling/importance_sampling_ratio/max": 1.0740495920181274, "sampling/importance_sampling_ratio/mean": 0.8715205192565918, "sampling/importance_sampling_ratio/min": 1.1528750176736935e-09, "sampling/sampling_logp_difference/max": 12.18952465057373, "sampling/sampling_logp_difference/mean": 0.1765674203634262, "step": 2231, "step_time": 4.225551745999837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.7170641496777534, "epoch": 0.02232, "grad_norm": 0.041046544909477234, "kl": 0.6438365466892719, "learning_rate": 9.997768408825318e-06, "loss": -0.0076, "step": 2232, "step_time": 2.0292647629903513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0813995972275734, "epoch": 0.02233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004771003732457757, "kl": 0.5945344269275665, "learning_rate": 9.997766376149847e-06, "loss": 0.0015, "num_tokens": 19209081.0, "reward": 0.5899999737739563, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5899999737739563, "rewards/rollout_reward_func/std": 0.19908396899700165, "sampling/importance_sampling_ratio/max": 0.996344804763794, "sampling/importance_sampling_ratio/mean": 0.9920215606689453, "sampling/importance_sampling_ratio/min": 0.9897946715354919, "sampling/sampling_logp_difference/max": 0.007049063220620155, "sampling/sampling_logp_difference/mean": 0.0020311856642365456, "step": 2233, "step_time": 3.969403719995171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08346691727638245, "epoch": 0.02234, "grad_norm": 0.0004941973020322621, "kl": 0.5941743478178978, "learning_rate": 9.997764342549329e-06, "loss": 0.0014, "step": 2234, "step_time": 2.8714469779806677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.39069900568574667, "epoch": 0.02235, "frac_reward_zero_std": 0.75, "grad_norm": 0.011253131553530693, "kl": 0.669294036924839, "learning_rate": 9.997762308023762e-06, "loss": -0.0262, "num_tokens": 19223809.0, "reward": 0.6748077273368835, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.6748077273368835, "rewards/rollout_reward_func/std": 0.1537894457578659, "sampling/importance_sampling_ratio/max": 1.005813479423523, "sampling/importance_sampling_ratio/mean": 0.919434666633606, "sampling/importance_sampling_ratio/min": 0.024194996803998947, "sampling/sampling_logp_difference/max": 2.0512778759002686, "sampling/sampling_logp_difference/mean": 0.060586199164390564, "step": 2235, "step_time": 4.078261411006679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3911958299577236, "epoch": 0.02236, "grad_norm": 0.010947749949991703, "kl": 0.6678999848663807, "learning_rate": 9.99776027257315e-06, "loss": -0.0262, "step": 2236, "step_time": 2.0091777280176757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 144.21875, "completions/mean_terminated_length": 144.21875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.7979373428970575, "epoch": 0.02237, "frac_reward_zero_std": 0.5, "grad_norm": 0.02967541664838791, "kl": 0.6274223029613495, "learning_rate": 9.99775823619749e-06, "loss": -0.0581, "num_tokens": 19239160.0, "reward": 0.594258189201355, "reward_std": 0.04638374596834183, "rewards/rollout_reward_func/mean": 0.594258189201355, "rewards/rollout_reward_func/std": 0.138316810131073, "sampling/importance_sampling_ratio/max": 1.001920223236084, "sampling/importance_sampling_ratio/mean": 0.9011772871017456, "sampling/importance_sampling_ratio/min": 7.259550137491165e-17, "sampling/sampling_logp_difference/max": 3.200723171234131, "sampling/sampling_logp_difference/mean": 0.24947801232337952, "step": 2237, "step_time": 4.463089371012757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8007842935621738, "epoch": 0.02238, "grad_norm": 0.027818165719509125, "kl": 0.6131926737725735, "learning_rate": 9.997756198896784e-06, "loss": -0.0581, "step": 2238, "step_time": 2.072556014994916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 125.59375, "completions/mean_terminated_length": 125.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.542152114212513, "epoch": 0.02239, "frac_reward_zero_std": 0.5, "grad_norm": 0.03479576110839844, "kl": 0.5940560810267925, "learning_rate": 9.997754160671032e-06, "loss": -0.004, "num_tokens": 19253947.0, "reward": 0.7850961685180664, "reward_std": 0.24299657344818115, "rewards/rollout_reward_func/mean": 0.7850961685180664, "rewards/rollout_reward_func/std": 0.479915052652359, "sampling/importance_sampling_ratio/max": 0.9958493113517761, "sampling/importance_sampling_ratio/mean": 0.9040088653564453, "sampling/importance_sampling_ratio/min": 8.539294640286244e-07, "sampling/sampling_logp_difference/max": 3.4063665866851807, "sampling/sampling_logp_difference/mean": 0.13625827431678772, "step": 2239, "step_time": 4.219021208999038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5443071965128183, "epoch": 0.0224, "grad_norm": 0.034295372664928436, "kl": 0.5942462235689163, "learning_rate": 9.997752121520237e-06, "loss": -0.004, "step": 2240, "step_time": 2.958268524998857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 163.09375, "completions/mean_terminated_length": 163.09375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3796686064451933, "epoch": 0.02241, "frac_reward_zero_std": 0.5, "grad_norm": 0.030627090483903885, "kl": 0.5320260338485241, "learning_rate": 9.997750081444396e-06, "loss": -0.052, "num_tokens": 19269902.0, "reward": 0.6066875457763672, "reward_std": 0.007519809529185295, "rewards/rollout_reward_func/mean": 0.6066875457763672, "rewards/rollout_reward_func/std": 0.21789032220840454, "sampling/importance_sampling_ratio/max": 1.0075712203979492, "sampling/importance_sampling_ratio/mean": 0.936758279800415, "sampling/importance_sampling_ratio/min": 0.02754364162683487, "sampling/sampling_logp_difference/max": 1.8980381488800049, "sampling/sampling_logp_difference/mean": 0.03625143691897392, "step": 2241, "step_time": 4.481225755997002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.37548366375267506, "epoch": 0.02242, "grad_norm": 0.030376825481653214, "kl": 0.5343415066599846, "learning_rate": 9.997748040443511e-06, "loss": -0.052, "step": 2242, "step_time": 2.0554244489903795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1176460050046444, "epoch": 0.02243, "frac_reward_zero_std": 1.0, "grad_norm": 0.002373717026785016, "kl": 0.38231224939227104, "learning_rate": 9.997745998517584e-06, "loss": 0.0013, "num_tokens": 19285982.0, "reward": 0.8489615321159363, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8489615321159363, "rewards/rollout_reward_func/std": 0.22611857950687408, "sampling/importance_sampling_ratio/max": 1.0571322441101074, "sampling/importance_sampling_ratio/mean": 0.9879544973373413, "sampling/importance_sampling_ratio/min": 0.7652053833007812, "sampling/sampling_logp_difference/max": 0.1807769536972046, "sampling/sampling_logp_difference/mean": 0.005928582977503538, "step": 2243, "step_time": 4.432947413006332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11782058794051409, "epoch": 0.02244, "grad_norm": 0.0027478181291371584, "kl": 0.38164766505360603, "learning_rate": 9.997743955666612e-06, "loss": 0.0013, "step": 2244, "step_time": 2.0302137810140266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3255610354244709, "epoch": 0.02245, "frac_reward_zero_std": 0.75, "grad_norm": 0.011131374165415764, "kl": 0.5558985061943531, "learning_rate": 9.9977419118906e-06, "loss": -0.0175, "num_tokens": 19301624.0, "reward": 0.7055961489677429, "reward_std": 0.027468375861644745, "rewards/rollout_reward_func/mean": 0.7055961489677429, "rewards/rollout_reward_func/std": 0.32759931683540344, "sampling/importance_sampling_ratio/max": 0.9946327209472656, "sampling/importance_sampling_ratio/mean": 0.959278404712677, "sampling/importance_sampling_ratio/min": 5.2858562412438914e-05, "sampling/sampling_logp_difference/max": 1.8535752296447754, "sampling/sampling_logp_difference/mean": 0.051924046128988266, "step": 2245, "step_time": 4.723682154995913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32456936966627836, "epoch": 0.02246, "grad_norm": 0.011446338146924973, "kl": 0.5537681207060814, "learning_rate": 9.997739867189544e-06, "loss": -0.0175, "step": 2246, "step_time": 2.5252160759919207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.41894547920674086, "epoch": 0.02247, "frac_reward_zero_std": 0.25, "grad_norm": 0.02702188864350319, "kl": 0.6419812776148319, "learning_rate": 9.997737821563446e-06, "loss": -0.0806, "num_tokens": 19318116.0, "reward": 0.7554119825363159, "reward_std": 0.07335281372070312, "rewards/rollout_reward_func/mean": 0.7554119825363159, "rewards/rollout_reward_func/std": 0.2594427466392517, "sampling/importance_sampling_ratio/max": 0.9996063709259033, "sampling/importance_sampling_ratio/mean": 0.9049307107925415, "sampling/importance_sampling_ratio/min": 0.0037131072022020817, "sampling/sampling_logp_difference/max": 2.1938695907592773, "sampling/sampling_logp_difference/mean": 0.06403172016143799, "step": 2247, "step_time": 4.281494924987783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4159937594085932, "epoch": 0.02248, "grad_norm": 0.02779097855091095, "kl": 0.6335447579622269, "learning_rate": 9.997735775012309e-06, "loss": -0.0806, "step": 2248, "step_time": 2.065247836013441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16183913499116898, "epoch": 0.02249, "frac_reward_zero_std": 0.75, "grad_norm": 0.0166678037494421, "kl": 0.8510468080639839, "learning_rate": 9.99773372753613e-06, "loss": -0.0235, "num_tokens": 19332092.0, "reward": 0.5653557777404785, "reward_std": 0.11898431181907654, "rewards/rollout_reward_func/mean": 0.5653557777404785, "rewards/rollout_reward_func/std": 0.31325387954711914, "sampling/importance_sampling_ratio/max": 0.9974340200424194, "sampling/importance_sampling_ratio/mean": 0.9635542631149292, "sampling/importance_sampling_ratio/min": 0.11701375991106033, "sampling/sampling_logp_difference/max": 2.2631804943084717, "sampling/sampling_logp_difference/mean": 0.01586488075554371, "step": 2249, "step_time": 3.9587856590005686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15834880713373423, "epoch": 0.0225, "grad_norm": 0.013508539646863937, "kl": 0.8692305013537407, "learning_rate": 9.99773167913491e-06, "loss": -0.0235, "step": 2250, "step_time": 1.970102165993012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4603544296696782, "epoch": 0.02251, "frac_reward_zero_std": 0.75, "grad_norm": 0.007031039334833622, "kl": 0.5008609667420387, "learning_rate": 9.997729629808653e-06, "loss": -0.0227, "num_tokens": 19348970.0, "reward": 0.8198269605636597, "reward_std": 0.04106658697128296, "rewards/rollout_reward_func/mean": 0.8198269605636597, "rewards/rollout_reward_func/std": 0.2807616591453552, "sampling/importance_sampling_ratio/max": 0.9992337822914124, "sampling/importance_sampling_ratio/mean": 0.9293261766433716, "sampling/importance_sampling_ratio/min": 4.771064099351463e-10, "sampling/sampling_logp_difference/max": 18.610116958618164, "sampling/sampling_logp_difference/mean": 0.20521041750907898, "step": 2251, "step_time": 4.901005786014139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45925986394286156, "epoch": 0.02252, "grad_norm": 0.006812890525907278, "kl": 0.5039486400783062, "learning_rate": 9.997727579557356e-06, "loss": -0.0227, "step": 2252, "step_time": 2.546453021997877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 143.28125, "completions/mean_terminated_length": 143.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3431550255045295, "epoch": 0.02253, "frac_reward_zero_std": 0.5, "grad_norm": 0.014379564672708511, "kl": 0.670481089502573, "learning_rate": 9.99772552838102e-06, "loss": -0.0074, "num_tokens": 19364355.0, "reward": 0.7622451782226562, "reward_std": 0.14418178796768188, "rewards/rollout_reward_func/mean": 0.7622451782226562, "rewards/rollout_reward_func/std": 0.5062146186828613, "sampling/importance_sampling_ratio/max": 0.9983091354370117, "sampling/importance_sampling_ratio/mean": 0.9339982271194458, "sampling/importance_sampling_ratio/min": 0.026243887841701508, "sampling/sampling_logp_difference/max": 2.534994125366211, "sampling/sampling_logp_difference/mean": 0.054165035486221313, "step": 2253, "step_time": 4.30154256199603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3431964796036482, "epoch": 0.02254, "grad_norm": 0.01399365346878767, "kl": 0.6785207390785217, "learning_rate": 9.997723476279649e-06, "loss": -0.0074, "step": 2254, "step_time": 2.0434617279970553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5759164867922664, "epoch": 0.02255, "frac_reward_zero_std": 0.5, "grad_norm": 0.012571096420288086, "kl": 0.42148831114172935, "learning_rate": 9.997721423253236e-06, "loss": -0.0462, "num_tokens": 19381189.0, "reward": 0.9315096735954285, "reward_std": 0.015637941658496857, "rewards/rollout_reward_func/mean": 0.9315096735954285, "rewards/rollout_reward_func/std": 0.329304963350296, "sampling/importance_sampling_ratio/max": 0.9995112419128418, "sampling/importance_sampling_ratio/mean": 0.9065921306610107, "sampling/importance_sampling_ratio/min": 1.1241997526667546e-05, "sampling/sampling_logp_difference/max": 3.6202971935272217, "sampling/sampling_logp_difference/mean": 0.1176489070057869, "step": 2255, "step_time": 4.361130127006618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5744386706501245, "epoch": 0.02256, "grad_norm": 0.012930619530379772, "kl": 0.41879842430353165, "learning_rate": 9.997719369301789e-06, "loss": -0.0463, "step": 2256, "step_time": 2.0495735009972122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 170.40625, "completions/mean_terminated_length": 170.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7863237699493766, "epoch": 0.02257, "frac_reward_zero_std": 0.5, "grad_norm": 0.10570358484983444, "kl": 0.48170940205454826, "learning_rate": 9.997717314425304e-06, "loss": -0.0672, "num_tokens": 19397266.0, "reward": 0.6220129728317261, "reward_std": 0.026407476514577866, "rewards/rollout_reward_func/mean": 0.6220129728317261, "rewards/rollout_reward_func/std": 0.17014484107494354, "sampling/importance_sampling_ratio/max": 1.034722924232483, "sampling/importance_sampling_ratio/mean": 0.8817026615142822, "sampling/importance_sampling_ratio/min": 5.115807950872853e-12, "sampling/sampling_logp_difference/max": 4.170658111572266, "sampling/sampling_logp_difference/mean": 0.23022490739822388, "step": 2257, "step_time": 4.773209271996166 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.7637135423719883, "epoch": 0.02258, "grad_norm": 0.04029662907123566, "kl": 0.48324446752667427, "learning_rate": 9.997715258623784e-06, "loss": -0.0673, "step": 2258, "step_time": 2.5303567450100672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 140.96875, "completions/mean_terminated_length": 140.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1589127015322447, "epoch": 0.02259, "frac_reward_zero_std": 0.5, "grad_norm": 0.016604240983724594, "kl": 0.761159498244524, "learning_rate": 9.99771320189723e-06, "loss": -0.0351, "num_tokens": 19412577.0, "reward": 0.9040576815605164, "reward_std": 0.11354143172502518, "rewards/rollout_reward_func/mean": 0.9040576815605164, "rewards/rollout_reward_func/std": 0.33438894152641296, "sampling/importance_sampling_ratio/max": 0.9975373148918152, "sampling/importance_sampling_ratio/mean": 0.8693033456802368, "sampling/importance_sampling_ratio/min": 5.1335326133994386e-05, "sampling/sampling_logp_difference/max": 1.91653573513031, "sampling/sampling_logp_difference/mean": 0.22071057558059692, "step": 2259, "step_time": 4.4418346590027795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1627115029841661, "epoch": 0.0226, "grad_norm": 0.01769949682056904, "kl": 0.7711571045219898, "learning_rate": 9.99771114424564e-06, "loss": -0.035, "step": 2260, "step_time": 2.0430418670002837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 1.0774294473230839, "epoch": 0.02261, "frac_reward_zero_std": 0.0, "grad_norm": 0.07105845957994461, "kl": 0.6079952046275139, "learning_rate": 9.997709085669014e-06, "loss": -0.0025, "num_tokens": 19429413.0, "reward": 0.5984663367271423, "reward_std": 0.11756283044815063, "rewards/rollout_reward_func/mean": 0.5984663367271423, "rewards/rollout_reward_func/std": 0.2906542420387268, "sampling/importance_sampling_ratio/max": 0.9987619519233704, "sampling/importance_sampling_ratio/mean": 0.8188583850860596, "sampling/importance_sampling_ratio/min": 4.7995182939075676e-08, "sampling/sampling_logp_difference/max": 2.344099521636963, "sampling/sampling_logp_difference/mean": 0.18220637738704681, "step": 2261, "step_time": 4.331263417996524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0749264853075147, "epoch": 0.02262, "grad_norm": 0.07377675920724869, "kl": 0.6000363528728485, "learning_rate": 9.997707026167357e-06, "loss": -0.0026, "step": 2262, "step_time": 2.0502954279945698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 140.4375, "completions/mean_terminated_length": 140.4375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.28461644519120455, "epoch": 0.02263, "frac_reward_zero_std": 0.75, "grad_norm": 0.008127794601023197, "kl": 0.5822876021265984, "learning_rate": 9.997704965740663e-06, "loss": -0.0364, "num_tokens": 19444619.0, "reward": 0.9350528717041016, "reward_std": 0.030636757612228394, "rewards/rollout_reward_func/mean": 0.9350528717041016, "rewards/rollout_reward_func/std": 0.22995273768901825, "sampling/importance_sampling_ratio/max": 1.0008478164672852, "sampling/importance_sampling_ratio/mean": 0.9639350175857544, "sampling/importance_sampling_ratio/min": 0.009851040318608284, "sampling/sampling_logp_difference/max": 1.7912429571151733, "sampling/sampling_logp_difference/mean": 0.02640598639845848, "step": 2263, "step_time": 4.5040521869959775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2834463082253933, "epoch": 0.02264, "grad_norm": 0.008430710062384605, "kl": 0.575608529150486, "learning_rate": 9.997702904388938e-06, "loss": -0.0364, "step": 2264, "step_time": 2.4092209029986407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 195.0625, "completions/mean_terminated_length": 195.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5923468312248588, "epoch": 0.02265, "frac_reward_zero_std": 0.75, "grad_norm": 0.008731946349143982, "kl": 0.42379794269800186, "learning_rate": 9.997700842112181e-06, "loss": -0.035, "num_tokens": 19461597.0, "reward": 0.9478557705879211, "reward_std": 0.07625538855791092, "rewards/rollout_reward_func/mean": 0.9478557705879211, "rewards/rollout_reward_func/std": 0.3693694472312927, "sampling/importance_sampling_ratio/max": 0.9971281290054321, "sampling/importance_sampling_ratio/mean": 0.9320141077041626, "sampling/importance_sampling_ratio/min": 1.026215500132821e-06, "sampling/sampling_logp_difference/max": 3.1022114753723145, "sampling/sampling_logp_difference/mean": 0.11152021586894989, "step": 2265, "step_time": 4.167839024994464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5904291765764356, "epoch": 0.02266, "grad_norm": 0.008972300216555595, "kl": 0.42220909893512726, "learning_rate": 9.997698778910392e-06, "loss": -0.035, "step": 2266, "step_time": 2.011897823009349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.07393278926610947, "epoch": 0.02267, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005359608912840486, "kl": 0.4154437705874443, "learning_rate": 9.997696714783572e-06, "loss": 0.0013, "num_tokens": 19477037.0, "reward": 0.6034615635871887, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6034615635871887, "rewards/rollout_reward_func/std": 0.18754133582115173, "sampling/importance_sampling_ratio/max": 1.0001111030578613, "sampling/importance_sampling_ratio/mean": 0.9943643808364868, "sampling/importance_sampling_ratio/min": 0.9885315895080566, "sampling/sampling_logp_difference/max": 0.00804174691438675, "sampling/sampling_logp_difference/mean": 0.0012261333176866174, "step": 2267, "step_time": 3.906186134998279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0738733010366559, "epoch": 0.02268, "grad_norm": 0.0005295536830089986, "kl": 0.41542400419712067, "learning_rate": 9.99769464973172e-06, "loss": 0.0013, "step": 2268, "step_time": 2.0126531530040666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08136620186269283, "epoch": 0.02269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005345556419342756, "kl": 0.5583665296435356, "learning_rate": 9.997692583754838e-06, "loss": 0.0014, "num_tokens": 19491565.0, "reward": 0.713192343711853, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.713192343711853, "rewards/rollout_reward_func/std": 0.13642095029354095, "sampling/importance_sampling_ratio/max": 0.998957633972168, "sampling/importance_sampling_ratio/mean": 0.9938032031059265, "sampling/importance_sampling_ratio/min": 0.9891110062599182, "sampling/sampling_logp_difference/max": 0.008458668366074562, "sampling/sampling_logp_difference/mean": 0.0015742997638881207, "step": 2269, "step_time": 4.438296909000201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08265070524066687, "epoch": 0.0227, "grad_norm": 0.0005421696114353836, "kl": 0.5580971874296665, "learning_rate": 9.997690516852928e-06, "loss": 0.0014, "step": 2270, "step_time": 2.4915886860035243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.44275880977511406, "epoch": 0.02271, "frac_reward_zero_std": 0.5, "grad_norm": 0.13004472851753235, "kl": 0.4838903062045574, "learning_rate": 9.997688449025987e-06, "loss": -0.0364, "num_tokens": 19508399.0, "reward": 0.7907100915908813, "reward_std": 0.03684069961309433, "rewards/rollout_reward_func/mean": 0.7907100915908813, "rewards/rollout_reward_func/std": 0.42929789423942566, "sampling/importance_sampling_ratio/max": 1.0019192695617676, "sampling/importance_sampling_ratio/mean": 0.899288535118103, "sampling/importance_sampling_ratio/min": 0.0005409769946709275, "sampling/sampling_logp_difference/max": 1.8018062114715576, "sampling/sampling_logp_difference/mean": 0.06168721616268158, "step": 2271, "step_time": 4.737688259992865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4422251507639885, "epoch": 0.02272, "grad_norm": 0.11137362569570541, "kl": 0.484299723058939, "learning_rate": 9.997686380274016e-06, "loss": -0.0371, "step": 2272, "step_time": 2.071157881007821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13978155050426722, "epoch": 0.02273, "frac_reward_zero_std": 0.75, "grad_norm": 0.04303771257400513, "kl": 0.6527610160410404, "learning_rate": 9.99768431059702e-06, "loss": -0.0208, "num_tokens": 19523015.0, "reward": 0.7437980771064758, "reward_std": 0.014278121292591095, "rewards/rollout_reward_func/mean": 0.7437980771064758, "rewards/rollout_reward_func/std": 0.21613693237304688, "sampling/importance_sampling_ratio/max": 1.0027779340744019, "sampling/importance_sampling_ratio/mean": 0.971017062664032, "sampling/importance_sampling_ratio/min": 0.2174283266067505, "sampling/sampling_logp_difference/max": 1.5804771184921265, "sampling/sampling_logp_difference/mean": 0.011618603020906448, "step": 2273, "step_time": 3.968570550008735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1394843766465783, "epoch": 0.02274, "grad_norm": 0.041511110961437225, "kl": 0.6536546051502228, "learning_rate": 9.997682239994995e-06, "loss": -0.0208, "step": 2274, "step_time": 2.0120730190101312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 220.78125, "completions/mean_terminated_length": 220.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4465115834027529, "epoch": 0.02275, "frac_reward_zero_std": 0.5, "grad_norm": 0.1405804306268692, "kl": 0.4666679985821247, "learning_rate": 9.997680168467941e-06, "loss": -0.0483, "num_tokens": 19540816.0, "reward": 0.7393653988838196, "reward_std": 0.039966464042663574, "rewards/rollout_reward_func/mean": 0.7393653988838196, "rewards/rollout_reward_func/std": 0.3627035915851593, "sampling/importance_sampling_ratio/max": 1.000745177268982, "sampling/importance_sampling_ratio/mean": 0.9254557490348816, "sampling/importance_sampling_ratio/min": 3.179434497724287e-05, "sampling/sampling_logp_difference/max": 2.1682677268981934, "sampling/sampling_logp_difference/mean": 0.060239553451538086, "step": 2275, "step_time": 5.2844331479936955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45046165585517883, "epoch": 0.02276, "grad_norm": 0.1289791762828827, "kl": 0.46949753537774086, "learning_rate": 9.997678096015861e-06, "loss": -0.0487, "step": 2276, "step_time": 2.050064413007931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 213.03125, "completions/mean_terminated_length": 213.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24234431516379118, "epoch": 0.02277, "frac_reward_zero_std": 0.5, "grad_norm": 0.01960388757288456, "kl": 0.5245532914996147, "learning_rate": 9.997676022638757e-06, "loss": -0.0611, "num_tokens": 19558401.0, "reward": 0.6488845944404602, "reward_std": 0.1212960034608841, "rewards/rollout_reward_func/mean": 0.6488845944404602, "rewards/rollout_reward_func/std": 0.42128869891166687, "sampling/importance_sampling_ratio/max": 1.0007864236831665, "sampling/importance_sampling_ratio/mean": 0.9384604692459106, "sampling/importance_sampling_ratio/min": 0.04911115765571594, "sampling/sampling_logp_difference/max": 2.4760913848876953, "sampling/sampling_logp_difference/mean": 0.025958267971873283, "step": 2277, "step_time": 4.352915670991933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24356715800240636, "epoch": 0.02278, "grad_norm": 0.01902890019118786, "kl": 0.5239557102322578, "learning_rate": 9.997673948336624e-06, "loss": -0.0612, "step": 2278, "step_time": 2.0559186480022618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 125.96875, "completions/mean_terminated_length": 125.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3737941635772586, "epoch": 0.02279, "frac_reward_zero_std": 0.5, "grad_norm": 0.020171064883470535, "kl": 0.624983012676239, "learning_rate": 9.997671873109466e-06, "loss": 0.0013, "num_tokens": 19573200.0, "reward": 0.7596394419670105, "reward_std": 0.01067459024488926, "rewards/rollout_reward_func/mean": 0.7596394419670105, "rewards/rollout_reward_func/std": 0.3382970094680786, "sampling/importance_sampling_ratio/max": 0.9999321699142456, "sampling/importance_sampling_ratio/mean": 0.9344998598098755, "sampling/importance_sampling_ratio/min": 0.009836024604737759, "sampling/sampling_logp_difference/max": 1.7452255487442017, "sampling/sampling_logp_difference/mean": 0.056227222084999084, "step": 2279, "step_time": 4.0567073760030326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3739645667374134, "epoch": 0.0228, "grad_norm": 0.018751632422208786, "kl": 0.6294460669159889, "learning_rate": 9.997669796957287e-06, "loss": 0.0013, "step": 2280, "step_time": 2.5461387559989817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.11765858624130487, "epoch": 0.02281, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036032956559211016, "kl": 0.48933006078004837, "learning_rate": 9.99766771988008e-06, "loss": 0.0017, "num_tokens": 19589350.0, "reward": 0.6305769085884094, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6305769085884094, "rewards/rollout_reward_func/std": 0.3241816759109497, "sampling/importance_sampling_ratio/max": 1.0032212734222412, "sampling/importance_sampling_ratio/mean": 0.9662307500839233, "sampling/importance_sampling_ratio/min": 5.566556104952269e-10, "sampling/sampling_logp_difference/max": 18.428512573242188, "sampling/sampling_logp_difference/mean": 0.1117306798696518, "step": 2281, "step_time": 4.530088685001829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11652133567258716, "epoch": 0.02282, "grad_norm": 0.0031218379735946655, "kl": 0.48509231582283974, "learning_rate": 9.997665641877852e-06, "loss": 0.0017, "step": 2282, "step_time": 2.0154460630074027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.05167865566909313, "epoch": 0.02283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004831826372537762, "kl": 0.3762594796717167, "learning_rate": 9.997663562950599e-06, "loss": 0.0015, "num_tokens": 19607350.0, "reward": 0.8636922836303711, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8636922836303711, "rewards/rollout_reward_func/std": 0.4328170418739319, "sampling/importance_sampling_ratio/max": 0.9994931817054749, "sampling/importance_sampling_ratio/mean": 0.995486855506897, "sampling/importance_sampling_ratio/min": 0.9916329979896545, "sampling/sampling_logp_difference/max": 0.004610823467373848, "sampling/sampling_logp_difference/mean": 0.0008132087532430887, "step": 2283, "step_time": 4.372007681005925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05102304555475712, "epoch": 0.02284, "grad_norm": 0.0004733713576570153, "kl": 0.37636928260326385, "learning_rate": 9.997661483098324e-06, "loss": 0.0015, "step": 2284, "step_time": 2.0505436020102934 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 171.15625, "completions/mean_terminated_length": 171.15625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.6204562894999981, "epoch": 0.02285, "frac_reward_zero_std": 0.5, "grad_norm": 0.05021177604794502, "kl": 0.45424750074744225, "learning_rate": 9.997659402321028e-06, "loss": -0.0221, "num_tokens": 19623619.0, "reward": 0.6129855513572693, "reward_std": 0.03974049538373947, "rewards/rollout_reward_func/mean": 0.6129855513572693, "rewards/rollout_reward_func/std": 0.22281059622764587, "sampling/importance_sampling_ratio/max": 1.0433963537216187, "sampling/importance_sampling_ratio/mean": 0.8926123380661011, "sampling/importance_sampling_ratio/min": 0.0004365057102404535, "sampling/sampling_logp_difference/max": 1.5903077125549316, "sampling/sampling_logp_difference/mean": 0.09051407873630524, "step": 2285, "step_time": 4.39103271500062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6289934637024999, "epoch": 0.02286, "grad_norm": 0.1641835719347, "kl": 0.45186174660921097, "learning_rate": 9.997657320618708e-06, "loss": -0.0222, "step": 2286, "step_time": 2.5401087250138517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2727927463129163, "epoch": 0.02287, "frac_reward_zero_std": 0.5, "grad_norm": 0.03421580046415329, "kl": 0.6924550905823708, "learning_rate": 9.997655237991368e-06, "loss": -0.053, "num_tokens": 19641257.0, "reward": 0.9124423265457153, "reward_std": 0.16617010533809662, "rewards/rollout_reward_func/mean": 0.9124423265457153, "rewards/rollout_reward_func/std": 0.45966407656669617, "sampling/importance_sampling_ratio/max": 1.0061994791030884, "sampling/importance_sampling_ratio/mean": 0.928256630897522, "sampling/importance_sampling_ratio/min": 0.04277537018060684, "sampling/sampling_logp_difference/max": 2.8799197673797607, "sampling/sampling_logp_difference/mean": 0.03339548781514168, "step": 2287, "step_time": 4.9221561970043695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2896188972517848, "epoch": 0.02288, "grad_norm": 0.03331596776843071, "kl": 0.6846840642392635, "learning_rate": 9.997653154439006e-06, "loss": -0.0531, "step": 2288, "step_time": 2.0697714460038696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.08014069823548198, "epoch": 0.02289, "frac_reward_zero_std": 1.0, "grad_norm": 0.006365896202623844, "kl": 0.39044664800167084, "learning_rate": 9.997651069961627e-06, "loss": 0.0014, "num_tokens": 19658249.0, "reward": 0.8548461198806763, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8548461198806763, "rewards/rollout_reward_func/std": 0.3229258060455322, "sampling/importance_sampling_ratio/max": 1.2651630640029907, "sampling/importance_sampling_ratio/mean": 0.973787784576416, "sampling/importance_sampling_ratio/min": 0.6729047894477844, "sampling/sampling_logp_difference/max": 0.3804335594177246, "sampling/sampling_logp_difference/mean": 0.01093178428709507, "step": 2289, "step_time": 4.1697676420008065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0866253818385303, "epoch": 0.0229, "grad_norm": 0.003995963837951422, "kl": 0.3869624100625515, "learning_rate": 9.997648984559226e-06, "loss": 0.0014, "step": 2290, "step_time": 2.038186218996998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.04664935823529959, "epoch": 0.02291, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043012111564166844, "kl": 0.40345965325832367, "learning_rate": 9.997646898231808e-06, "loss": 0.0015, "num_tokens": 19675353.0, "reward": 1.0012692213058472, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0012692213058472, "rewards/rollout_reward_func/std": 0.2511303126811981, "sampling/importance_sampling_ratio/max": 1.000474214553833, "sampling/importance_sampling_ratio/mean": 0.9965806007385254, "sampling/importance_sampling_ratio/min": 0.9939130544662476, "sampling/sampling_logp_difference/max": 0.0025518322363495827, "sampling/sampling_logp_difference/mean": 0.0006794391083531082, "step": 2291, "step_time": 4.211167139001191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04614261817187071, "epoch": 0.02292, "grad_norm": 0.00043300300603732467, "kl": 0.4035247042775154, "learning_rate": 9.997644810979369e-06, "loss": 0.0015, "step": 2292, "step_time": 2.5241149479843443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.3125, "completions/mean_terminated_length": 123.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2690911893732846, "epoch": 0.02293, "frac_reward_zero_std": 0.75, "grad_norm": 0.004203624092042446, "kl": 0.7169615998864174, "learning_rate": 9.997642722801912e-06, "loss": -0.0267, "num_tokens": 19690099.0, "reward": 0.7099038362503052, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.7099038362503052, "rewards/rollout_reward_func/std": 0.05566639080643654, "sampling/importance_sampling_ratio/max": 1.0003634691238403, "sampling/importance_sampling_ratio/mean": 0.963758111000061, "sampling/importance_sampling_ratio/min": 0.002870983211323619, "sampling/sampling_logp_difference/max": 1.7823582887649536, "sampling/sampling_logp_difference/mean": 0.031893014907836914, "step": 2293, "step_time": 4.3918317679999745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2670895359478891, "epoch": 0.02294, "grad_norm": 0.004209112375974655, "kl": 0.7144464552402496, "learning_rate": 9.997640633699441e-06, "loss": -0.0267, "step": 2294, "step_time": 1.9638162900082534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.047983096446841955, "epoch": 0.02295, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039155472768470645, "kl": 0.4443235248327255, "learning_rate": 9.99763854367195e-06, "loss": 0.0015, "num_tokens": 19706203.0, "reward": 0.8253846168518066, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8253846168518066, "rewards/rollout_reward_func/std": 0.17565171420574188, "sampling/importance_sampling_ratio/max": 0.9992357492446899, "sampling/importance_sampling_ratio/mean": 0.9963691234588623, "sampling/importance_sampling_ratio/min": 0.9931056499481201, "sampling/sampling_logp_difference/max": 0.004857003688812256, "sampling/sampling_logp_difference/mean": 0.0008177573326975107, "step": 2295, "step_time": 4.029787704006594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04751933412626386, "epoch": 0.02296, "grad_norm": 0.00039550193469040096, "kl": 0.4443897046148777, "learning_rate": 9.997636452719443e-06, "loss": 0.0015, "step": 2296, "step_time": 2.0185415260202717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 192.40625, "completions/mean_terminated_length": 192.40625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.46381885232403874, "epoch": 0.02297, "frac_reward_zero_std": 0.5, "grad_norm": 0.011724412441253662, "kl": 0.5262158438563347, "learning_rate": 9.997634360841921e-06, "loss": -0.0553, "num_tokens": 19723072.0, "reward": 0.7634326815605164, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.7634326815605164, "rewards/rollout_reward_func/std": 0.3026510775089264, "sampling/importance_sampling_ratio/max": 0.9971532225608826, "sampling/importance_sampling_ratio/mean": 0.932101845741272, "sampling/importance_sampling_ratio/min": 5.2377175513429464e-11, "sampling/sampling_logp_difference/max": 14.21536922454834, "sampling/sampling_logp_difference/mean": 0.15085197985172272, "step": 2297, "step_time": 4.16382929199608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.464881410356611, "epoch": 0.02298, "grad_norm": 0.011642835102975368, "kl": 0.5238119773566723, "learning_rate": 9.997632268039381e-06, "loss": -0.0554, "step": 2298, "step_time": 2.502279511987581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27868762239813805, "epoch": 0.02299, "frac_reward_zero_std": 0.75, "grad_norm": 0.011141152121126652, "kl": 0.42881694436073303, "learning_rate": 9.99763017431183e-06, "loss": 0.0209, "num_tokens": 19740038.0, "reward": 0.6683317422866821, "reward_std": 0.006595132406800985, "rewards/rollout_reward_func/mean": 0.6683317422866821, "rewards/rollout_reward_func/std": 0.32455798983573914, "sampling/importance_sampling_ratio/max": 0.9980282187461853, "sampling/importance_sampling_ratio/mean": 0.9644911289215088, "sampling/importance_sampling_ratio/min": 3.1278857932193205e-05, "sampling/sampling_logp_difference/max": 2.468944549560547, "sampling/sampling_logp_difference/mean": 0.05381608009338379, "step": 2299, "step_time": 4.703005766998103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2801068597473204, "epoch": 0.023, "grad_norm": 0.010825393721461296, "kl": 0.4261738695204258, "learning_rate": 9.99762807965926e-06, "loss": 0.0208, "step": 2300, "step_time": 2.0449032730030012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 153.6875, "completions/mean_terminated_length": 153.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.18297614622861147, "epoch": 0.02301, "frac_reward_zero_std": 0.75, "grad_norm": 0.023692455142736435, "kl": 0.5872033312916756, "learning_rate": 9.997625984081681e-06, "loss": 0.0196, "num_tokens": 19755692.0, "reward": 0.7792788743972778, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.7792788743972778, "rewards/rollout_reward_func/std": 0.260457843542099, "sampling/importance_sampling_ratio/max": 1.0024166107177734, "sampling/importance_sampling_ratio/mean": 0.9655713438987732, "sampling/importance_sampling_ratio/min": 0.032944317907094955, "sampling/sampling_logp_difference/max": 1.473936676979065, "sampling/sampling_logp_difference/mean": 0.02180992253124714, "step": 2301, "step_time": 3.8829417140004807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18277965625748038, "epoch": 0.02302, "grad_norm": 0.02382916584610939, "kl": 0.5870468616485596, "learning_rate": 9.997623887579085e-06, "loss": 0.0196, "step": 2302, "step_time": 2.004610069998307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 131.90625, "completions/mean_terminated_length": 131.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.49495427682995796, "epoch": 0.02303, "frac_reward_zero_std": 0.75, "grad_norm": 0.014140072278678417, "kl": 0.7017318829894066, "learning_rate": 9.997621790151478e-06, "loss": -0.0076, "num_tokens": 19770681.0, "reward": 0.6192307472229004, "reward_std": 0.038074977695941925, "rewards/rollout_reward_func/mean": 0.6192307472229004, "rewards/rollout_reward_func/std": 0.14283150434494019, "sampling/importance_sampling_ratio/max": 1.061614751815796, "sampling/importance_sampling_ratio/mean": 0.9656343460083008, "sampling/importance_sampling_ratio/min": 6.883866444695741e-05, "sampling/sampling_logp_difference/max": 2.235125780105591, "sampling/sampling_logp_difference/mean": 0.07598821818828583, "step": 2303, "step_time": 4.454752639001526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49984396528452635, "epoch": 0.02304, "grad_norm": 0.013533654622733593, "kl": 0.6954872347414494, "learning_rate": 9.997619691798858e-06, "loss": -0.0077, "step": 2304, "step_time": 2.0462689970008796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 195.9375, "completions/mean_terminated_length": 192.90321350097656, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1576242009177804, "epoch": 0.02305, "frac_reward_zero_std": 0.75, "grad_norm": 0.006038730964064598, "kl": 0.5409503541886806, "learning_rate": 9.997617592521228e-06, "loss": -0.0253, "num_tokens": 19787663.0, "reward": 0.8839759826660156, "reward_std": 0.0015909892972558737, "rewards/rollout_reward_func/mean": 0.8839759826660156, "rewards/rollout_reward_func/std": 0.3239356279373169, "sampling/importance_sampling_ratio/max": 1.129449725151062, "sampling/importance_sampling_ratio/mean": 0.9699269533157349, "sampling/importance_sampling_ratio/min": 0.02891886793076992, "sampling/sampling_logp_difference/max": 1.6463525295257568, "sampling/sampling_logp_difference/mean": 0.019514136016368866, "step": 2305, "step_time": 4.695509482000489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15669978642836213, "epoch": 0.02306, "grad_norm": 0.006314669735729694, "kl": 0.5437119975686073, "learning_rate": 9.997615492318583e-06, "loss": -0.0253, "step": 2306, "step_time": 2.0720456510025542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2594087440520525, "epoch": 0.02307, "frac_reward_zero_std": 0.75, "grad_norm": 0.012125769630074501, "kl": 0.7251406610012054, "learning_rate": 9.99761339119093e-06, "loss": -0.0259, "num_tokens": 19802291.0, "reward": 0.8178845643997192, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.8178845643997192, "rewards/rollout_reward_func/std": 0.1621057093143463, "sampling/importance_sampling_ratio/max": 0.9986374378204346, "sampling/importance_sampling_ratio/mean": 0.9396592378616333, "sampling/importance_sampling_ratio/min": 0.031683631241321564, "sampling/sampling_logp_difference/max": 1.5195391178131104, "sampling/sampling_logp_difference/mean": 0.037809912115335464, "step": 2307, "step_time": 3.907150058003026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2614360926672816, "epoch": 0.02308, "grad_norm": 0.011694766581058502, "kl": 0.7191696912050247, "learning_rate": 9.997611289138265e-06, "loss": -0.0259, "step": 2308, "step_time": 2.0217209859983996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05909290490671992, "epoch": 0.02309, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048242881894111633, "kl": 0.41907164454460144, "learning_rate": 9.997609186160591e-06, "loss": 0.0013, "num_tokens": 19817563.0, "reward": 0.8143461346626282, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8143461346626282, "rewards/rollout_reward_func/std": 0.33813875913619995, "sampling/importance_sampling_ratio/max": 0.9995402097702026, "sampling/importance_sampling_ratio/mean": 0.9953932762145996, "sampling/importance_sampling_ratio/min": 0.9901650547981262, "sampling/sampling_logp_difference/max": 0.005733149126172066, "sampling/sampling_logp_difference/mean": 0.001047295518219471, "step": 2309, "step_time": 4.574521557005937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.060194365214556456, "epoch": 0.0231, "grad_norm": 0.0004844402428716421, "kl": 0.4188675507903099, "learning_rate": 9.997607082257907e-06, "loss": 0.0013, "step": 2310, "step_time": 2.454154066996125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17763363383710384, "epoch": 0.02311, "frac_reward_zero_std": 0.75, "grad_norm": 0.022602399811148643, "kl": 0.5662787035107613, "learning_rate": 9.997604977430217e-06, "loss": 0.0297, "num_tokens": 19832789.0, "reward": 0.5504951477050781, "reward_std": 0.002488468773663044, "rewards/rollout_reward_func/mean": 0.5504951477050781, "rewards/rollout_reward_func/std": 0.12443213164806366, "sampling/importance_sampling_ratio/max": 1.000939130783081, "sampling/importance_sampling_ratio/mean": 0.9661495089530945, "sampling/importance_sampling_ratio/min": 0.02046119049191475, "sampling/sampling_logp_difference/max": 1.8386579751968384, "sampling/sampling_logp_difference/mean": 0.024530937895178795, "step": 2311, "step_time": 4.152918467996642 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.17695379815995693, "epoch": 0.02312, "grad_norm": 0.017002005130052567, "kl": 0.5518675744533539, "learning_rate": 9.997602871677516e-06, "loss": 0.0296, "step": 2312, "step_time": 2.034893492003903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 103.34375, "completions/mean_terminated_length": 103.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2183170709758997, "epoch": 0.02313, "frac_reward_zero_std": 0.75, "grad_norm": 0.015074041672050953, "kl": 0.755688413977623, "learning_rate": 9.99760076499981e-06, "loss": -0.0169, "num_tokens": 19846920.0, "reward": 0.7562980651855469, "reward_std": 0.02243703417479992, "rewards/rollout_reward_func/mean": 0.7562980651855469, "rewards/rollout_reward_func/std": 0.08830256760120392, "sampling/importance_sampling_ratio/max": 1.0019807815551758, "sampling/importance_sampling_ratio/mean": 0.9646697044372559, "sampling/importance_sampling_ratio/min": 0.010831293649971485, "sampling/sampling_logp_difference/max": 1.434753179550171, "sampling/sampling_logp_difference/mean": 0.02853510156273842, "step": 2313, "step_time": 4.253300368007331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22013285383582115, "epoch": 0.02314, "grad_norm": 0.014726532623171806, "kl": 0.7552200704813004, "learning_rate": 9.997598657397094e-06, "loss": -0.0169, "step": 2314, "step_time": 2.0676333739975234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 75.6875, "completions/mean_terminated_length": 75.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2767187897115946, "epoch": 0.02315, "frac_reward_zero_std": 0.75, "grad_norm": 0.029629042372107506, "kl": 0.6240408942103386, "learning_rate": 9.997596548869372e-06, "loss": -0.016, "num_tokens": 19860166.0, "reward": 0.7163461446762085, "reward_std": 0.024476774036884308, "rewards/rollout_reward_func/mean": 0.7163461446762085, "rewards/rollout_reward_func/std": 0.08978670090436935, "sampling/importance_sampling_ratio/max": 0.9996712803840637, "sampling/importance_sampling_ratio/mean": 0.9661288857460022, "sampling/importance_sampling_ratio/min": 0.09655631333589554, "sampling/sampling_logp_difference/max": 1.201643466949463, "sampling/sampling_logp_difference/mean": 0.02852645330131054, "step": 2315, "step_time": 4.237520403999952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2825002456083894, "epoch": 0.02316, "grad_norm": 0.029360884800553322, "kl": 0.6244153082370758, "learning_rate": 9.997594439416645e-06, "loss": -0.016, "step": 2316, "step_time": 2.4492068249965087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05254587484523654, "epoch": 0.02317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004395128635223955, "kl": 0.49182582274079323, "learning_rate": 9.99759232903891e-06, "loss": 0.0015, "num_tokens": 19876398.0, "reward": 1.1179230213165283, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1179230213165283, "rewards/rollout_reward_func/std": 0.22667022049427032, "sampling/importance_sampling_ratio/max": 0.9979629516601562, "sampling/importance_sampling_ratio/mean": 0.994645357131958, "sampling/importance_sampling_ratio/min": 0.9923642873764038, "sampling/sampling_logp_difference/max": 0.005245186388492584, "sampling/sampling_logp_difference/mean": 0.0011047197040170431, "step": 2317, "step_time": 4.508901861998311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05320304725319147, "epoch": 0.02318, "grad_norm": 0.0004446075763553381, "kl": 0.49171219766139984, "learning_rate": 9.997590217736173e-06, "loss": 0.0015, "step": 2318, "step_time": 2.054066930009867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 143.15625, "completions/mean_terminated_length": 143.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5797330592758954, "epoch": 0.02319, "frac_reward_zero_std": 0.5, "grad_norm": 0.025233356282114983, "kl": 0.5626705661416054, "learning_rate": 9.99758810550843e-06, "loss": -0.0504, "num_tokens": 19891715.0, "reward": 0.8988221287727356, "reward_std": 0.03523053601384163, "rewards/rollout_reward_func/mean": 0.8988221287727356, "rewards/rollout_reward_func/std": 0.23370777070522308, "sampling/importance_sampling_ratio/max": 0.9989010691642761, "sampling/importance_sampling_ratio/mean": 0.9065345525741577, "sampling/importance_sampling_ratio/min": 0.005628321319818497, "sampling/sampling_logp_difference/max": 1.7071661949157715, "sampling/sampling_logp_difference/mean": 0.0681302547454834, "step": 2319, "step_time": 3.961321720998967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.587226586882025, "epoch": 0.0232, "grad_norm": 0.025174342095851898, "kl": 0.5634671859443188, "learning_rate": 9.997585992355684e-06, "loss": -0.0505, "step": 2320, "step_time": 2.494676676004019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 267.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.04258367698639631, "epoch": 0.02321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005206585628911853, "kl": 0.3736950643360615, "learning_rate": 9.997583878277932e-06, "loss": 0.0017, "num_tokens": 19911051.0, "reward": 0.8452692031860352, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8452692031860352, "rewards/rollout_reward_func/std": 0.3639231324195862, "sampling/importance_sampling_ratio/max": 0.9972039461135864, "sampling/importance_sampling_ratio/mean": 0.9945606589317322, "sampling/importance_sampling_ratio/min": 0.9900798797607422, "sampling/sampling_logp_difference/max": 0.007326066493988037, "sampling/sampling_logp_difference/mean": 0.0007923189550638199, "step": 2321, "step_time": 4.6098033629968995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.042621652130037546, "epoch": 0.02322, "grad_norm": 0.0005260000471025705, "kl": 0.3736816644668579, "learning_rate": 9.997581763275179e-06, "loss": 0.0017, "step": 2322, "step_time": 2.5016437859885627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 93.4375, "completions/mean_terminated_length": 93.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.529751593247056, "epoch": 0.02323, "frac_reward_zero_std": 0.75, "grad_norm": 0.006059798412024975, "kl": 0.6141922771930695, "learning_rate": 9.997579647347423e-06, "loss": -0.0178, "num_tokens": 19924897.0, "reward": 0.659375011920929, "reward_std": 0.0431063212454319, "rewards/rollout_reward_func/mean": 0.659375011920929, "rewards/rollout_reward_func/std": 0.19258476793766022, "sampling/importance_sampling_ratio/max": 0.9982872009277344, "sampling/importance_sampling_ratio/mean": 0.9333525896072388, "sampling/importance_sampling_ratio/min": 0.0014001572271808982, "sampling/sampling_logp_difference/max": 2.7543392181396484, "sampling/sampling_logp_difference/mean": 0.09127004444599152, "step": 2323, "step_time": 3.892690553024295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5290009807795286, "epoch": 0.02324, "grad_norm": 0.006748114246875048, "kl": 0.6172755733132362, "learning_rate": 9.997577530494666e-06, "loss": -0.0177, "step": 2324, "step_time": 2.012639726010093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05155998980626464, "epoch": 0.02325, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045469956239685416, "kl": 0.4063325710594654, "learning_rate": 9.997575412716907e-06, "loss": 0.0013, "num_tokens": 19940297.0, "reward": 0.8235769271850586, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8235769271850586, "rewards/rollout_reward_func/std": 0.2709179222583771, "sampling/importance_sampling_ratio/max": 0.9992445707321167, "sampling/importance_sampling_ratio/mean": 0.9963912963867188, "sampling/importance_sampling_ratio/min": 0.9942567348480225, "sampling/sampling_logp_difference/max": 0.004269598051905632, "sampling/sampling_logp_difference/mean": 0.0008405986591242254, "step": 2325, "step_time": 4.024070917999779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.051400816068053246, "epoch": 0.02326, "grad_norm": 0.00045223828055895865, "kl": 0.4063679873943329, "learning_rate": 9.997573294014147e-06, "loss": 0.0013, "step": 2326, "step_time": 2.441835619029007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.21972180251032114, "epoch": 0.02327, "frac_reward_zero_std": 0.75, "grad_norm": 0.01163669303059578, "kl": 0.5288631729781628, "learning_rate": 9.997571174386387e-06, "loss": -0.0266, "num_tokens": 19954881.0, "reward": 0.5757211446762085, "reward_std": 0.014278121292591095, "rewards/rollout_reward_func/mean": 0.5757211446762085, "rewards/rollout_reward_func/std": 0.0819011926651001, "sampling/importance_sampling_ratio/max": 1.0011883974075317, "sampling/importance_sampling_ratio/mean": 0.965624213218689, "sampling/importance_sampling_ratio/min": 0.03234311565756798, "sampling/sampling_logp_difference/max": 1.7666292190551758, "sampling/sampling_logp_difference/mean": 0.023986585438251495, "step": 2327, "step_time": 4.354131179999968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22116514947265387, "epoch": 0.02328, "grad_norm": 0.011383512057363987, "kl": 0.5298164188861847, "learning_rate": 9.997569053833627e-06, "loss": -0.0266, "step": 2328, "step_time": 1.9960128500097198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.39829834178090096, "epoch": 0.02329, "frac_reward_zero_std": 0.5, "grad_norm": 0.011983328498899937, "kl": 0.6922826170921326, "learning_rate": 9.997566932355866e-06, "loss": -0.0455, "num_tokens": 19969675.0, "reward": 0.5406730771064758, "reward_std": 0.028556236997246742, "rewards/rollout_reward_func/mean": 0.5406730771064758, "rewards/rollout_reward_func/std": 0.19041869044303894, "sampling/importance_sampling_ratio/max": 0.9988670945167542, "sampling/importance_sampling_ratio/mean": 0.9344649314880371, "sampling/importance_sampling_ratio/min": 0.0005189019721001387, "sampling/sampling_logp_difference/max": 2.9775757789611816, "sampling/sampling_logp_difference/mean": 0.07309288531541824, "step": 2329, "step_time": 4.104266541005927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3982859905809164, "epoch": 0.0233, "grad_norm": 0.01267729140818119, "kl": 0.6963912472128868, "learning_rate": 9.997564809953108e-06, "loss": -0.0455, "step": 2330, "step_time": 2.0066512920020614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 164.78125, "completions/mean_terminated_length": 164.78125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.3276131311431527, "epoch": 0.02331, "frac_reward_zero_std": 0.75, "grad_norm": 0.009085205383598804, "kl": 0.4948347881436348, "learning_rate": 9.997562686625353e-06, "loss": -0.0269, "num_tokens": 19985628.0, "reward": 0.789519190788269, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.789519190788269, "rewards/rollout_reward_func/std": 0.24088004231452942, "sampling/importance_sampling_ratio/max": 0.9985834360122681, "sampling/importance_sampling_ratio/mean": 0.9637224674224854, "sampling/importance_sampling_ratio/min": 2.4504118179002887e-10, "sampling/sampling_logp_difference/max": 11.574350357055664, "sampling/sampling_logp_difference/mean": 0.11383616179227829, "step": 2331, "step_time": 4.436287785989407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32897016406059265, "epoch": 0.02332, "grad_norm": 0.008913537487387657, "kl": 0.49706338718533516, "learning_rate": 9.997560562372598e-06, "loss": -0.0269, "step": 2332, "step_time": 2.512938834996021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05093546770513058, "epoch": 0.02333, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042051749187521636, "kl": 0.46977977454662323, "learning_rate": 9.997558437194845e-06, "loss": 0.0016, "num_tokens": 20001404.0, "reward": 0.8465384244918823, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8465384244918823, "rewards/rollout_reward_func/std": 0.14949019253253937, "sampling/importance_sampling_ratio/max": 1.0004844665527344, "sampling/importance_sampling_ratio/mean": 0.9957376718521118, "sampling/importance_sampling_ratio/min": 0.9928383231163025, "sampling/sampling_logp_difference/max": 0.003698885440826416, "sampling/sampling_logp_difference/mean": 0.0009351812768727541, "step": 2333, "step_time": 4.4037178800062975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05038351844996214, "epoch": 0.02334, "grad_norm": 0.0004142048128414899, "kl": 0.4698854498565197, "learning_rate": 9.997556311092098e-06, "loss": 0.0016, "step": 2334, "step_time": 1.9306568549800431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.4516143798828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8945936411619186, "epoch": 0.02335, "frac_reward_zero_std": 0.5, "grad_norm": 0.07146137952804565, "kl": 0.9281214252114296, "learning_rate": 9.997554184064354e-06, "loss": -0.0063, "num_tokens": 20016422.0, "reward": 0.7537932395935059, "reward_std": 0.07621795684099197, "rewards/rollout_reward_func/mean": 0.7537932395935059, "rewards/rollout_reward_func/std": 0.22620798647403717, "sampling/importance_sampling_ratio/max": 0.9997289776802063, "sampling/importance_sampling_ratio/mean": 0.9098130464553833, "sampling/importance_sampling_ratio/min": 6.958920277409382e-17, "sampling/sampling_logp_difference/max": 4.093469619750977, "sampling/sampling_logp_difference/mean": 0.2991519570350647, "step": 2335, "step_time": 4.067109959003574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8981350464746356, "epoch": 0.02336, "grad_norm": 0.0606219619512558, "kl": 0.8920249678194523, "learning_rate": 9.997552056111614e-06, "loss": -0.0065, "step": 2336, "step_time": 2.076015282997105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23612402100116014, "epoch": 0.02337, "frac_reward_zero_std": 0.75, "grad_norm": 0.007468977011740208, "kl": 0.4394121691584587, "learning_rate": 9.997549927233878e-06, "loss": -0.0272, "num_tokens": 20033178.0, "reward": 0.7597788572311401, "reward_std": 0.2022053450345993, "rewards/rollout_reward_func/mean": 0.7597788572311401, "rewards/rollout_reward_func/std": 0.4687741696834564, "sampling/importance_sampling_ratio/max": 0.9981643557548523, "sampling/importance_sampling_ratio/mean": 0.9637313485145569, "sampling/importance_sampling_ratio/min": 0.00019619346130639315, "sampling/sampling_logp_difference/max": 2.3957741260528564, "sampling/sampling_logp_difference/mean": 0.05354995280504227, "step": 2337, "step_time": 4.427403583984415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23658616142347455, "epoch": 0.02338, "grad_norm": 0.007124977186322212, "kl": 0.437079481780529, "learning_rate": 9.997547797431149e-06, "loss": -0.0272, "step": 2338, "step_time": 2.526589110988425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5846573994494975, "epoch": 0.02339, "frac_reward_zero_std": 0.5, "grad_norm": 0.008035016246140003, "kl": 0.5971798822283745, "learning_rate": 9.997545666703426e-06, "loss": -0.0176, "num_tokens": 20047941.0, "reward": 0.7272307872772217, "reward_std": 0.02474873885512352, "rewards/rollout_reward_func/mean": 0.7272307872772217, "rewards/rollout_reward_func/std": 0.12733814120292664, "sampling/importance_sampling_ratio/max": 0.9991564154624939, "sampling/importance_sampling_ratio/mean": 0.9327404499053955, "sampling/importance_sampling_ratio/min": 4.535202606348321e-05, "sampling/sampling_logp_difference/max": 2.264702081680298, "sampling/sampling_logp_difference/mean": 0.1195475161075592, "step": 2339, "step_time": 4.8207653159915935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5863054003566504, "epoch": 0.0234, "grad_norm": 0.008195733651518822, "kl": 0.5988840572535992, "learning_rate": 9.997543535050707e-06, "loss": -0.0176, "step": 2340, "step_time": 2.086274493012752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 172.40625, "completions/mean_terminated_length": 168.4193572998047, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.069675525650382, "epoch": 0.02341, "frac_reward_zero_std": 0.25, "grad_norm": 0.03768258914351463, "kl": 0.5994291119277477, "learning_rate": 9.997541402472996e-06, "loss": -0.031, "num_tokens": 20064226.0, "reward": 0.5787832736968994, "reward_std": 0.06822315603494644, "rewards/rollout_reward_func/mean": 0.5787832736968994, "rewards/rollout_reward_func/std": 0.15177126228809357, "sampling/importance_sampling_ratio/max": 0.9973428249359131, "sampling/importance_sampling_ratio/mean": 0.8706838488578796, "sampling/importance_sampling_ratio/min": 1.3489760691188282e-17, "sampling/sampling_logp_difference/max": 3.2689855098724365, "sampling/sampling_logp_difference/mean": 0.29545801877975464, "step": 2341, "step_time": 4.5639212599926395 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.061298182234168, "epoch": 0.02342, "grad_norm": 0.03294862061738968, "kl": 0.5709816105663776, "learning_rate": 9.997539268970293e-06, "loss": -0.0312, "step": 2342, "step_time": 2.0737655599878053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 196.9375, "completions/mean_terminated_length": 196.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1715117832645774, "epoch": 0.02343, "frac_reward_zero_std": 0.75, "grad_norm": 0.006020811386406422, "kl": 0.46145618706941605, "learning_rate": 9.997537134542598e-06, "loss": -0.0264, "num_tokens": 20081208.0, "reward": 0.892100989818573, "reward_std": 0.01450929045677185, "rewards/rollout_reward_func/mean": 0.892100989818573, "rewards/rollout_reward_func/std": 0.3243235647678375, "sampling/importance_sampling_ratio/max": 0.9975729584693909, "sampling/importance_sampling_ratio/mean": 0.9632492661476135, "sampling/importance_sampling_ratio/min": 0.015917206183075905, "sampling/sampling_logp_difference/max": 1.7285313606262207, "sampling/sampling_logp_difference/mean": 0.026243988424539566, "step": 2343, "step_time": 4.443987538004876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17389363748952746, "epoch": 0.02344, "grad_norm": 0.007148780394345522, "kl": 0.4553951844573021, "learning_rate": 9.997534999189912e-06, "loss": -0.0264, "step": 2344, "step_time": 2.4868808220198844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05395839363336563, "epoch": 0.02345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005162787856534123, "kl": 0.4483886547386646, "learning_rate": 9.997532862912235e-06, "loss": 0.0017, "num_tokens": 20098024.0, "reward": 0.8806923031806946, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8806923031806946, "rewards/rollout_reward_func/std": 0.43816107511520386, "sampling/importance_sampling_ratio/max": 0.9970170855522156, "sampling/importance_sampling_ratio/mean": 0.9935756921768188, "sampling/importance_sampling_ratio/min": 0.9871987104415894, "sampling/sampling_logp_difference/max": 0.006608426570892334, "sampling/sampling_logp_difference/mean": 0.0011175915133208036, "step": 2345, "step_time": 4.626393929000187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05484189698472619, "epoch": 0.02346, "grad_norm": 0.000525331764947623, "kl": 0.44824958965182304, "learning_rate": 9.997530725709568e-06, "loss": 0.0017, "step": 2346, "step_time": 2.0490069500010577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.050993664655834436, "epoch": 0.02347, "frac_reward_zero_std": 1.0, "grad_norm": 0.000546636467333883, "kl": 0.3769774101674557, "learning_rate": 9.997528587581909e-06, "loss": 0.0015, "num_tokens": 20115608.0, "reward": 0.38907694816589355, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.38907694816589355, "rewards/rollout_reward_func/std": 0.13735970854759216, "sampling/importance_sampling_ratio/max": 0.9979966282844543, "sampling/importance_sampling_ratio/mean": 0.993665337562561, "sampling/importance_sampling_ratio/min": 0.9882094860076904, "sampling/sampling_logp_difference/max": 0.005200488492846489, "sampling/sampling_logp_difference/mean": 0.0010150906164199114, "step": 2347, "step_time": 4.401515366997046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05179486842826009, "epoch": 0.02348, "grad_norm": 0.0005601746961474419, "kl": 0.37684791162610054, "learning_rate": 9.997526448529262e-06, "loss": 0.0015, "step": 2348, "step_time": 2.0703407789987978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07653673551976681, "epoch": 0.02349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013293850934132934, "kl": 0.4090953730046749, "learning_rate": 9.997524308551627e-06, "loss": 0.0012, "num_tokens": 20130360.0, "reward": 0.6540384292602539, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6540384292602539, "rewards/rollout_reward_func/std": 0.28346309065818787, "sampling/importance_sampling_ratio/max": 1.02274751663208, "sampling/importance_sampling_ratio/mean": 1.0006539821624756, "sampling/importance_sampling_ratio/min": 0.99165278673172, "sampling/sampling_logp_difference/max": 0.022577643394470215, "sampling/sampling_logp_difference/mean": 0.0017511432524770498, "step": 2349, "step_time": 3.8282074159869808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07727996725589037, "epoch": 0.0235, "grad_norm": 0.0013083823723718524, "kl": 0.4089673161506653, "learning_rate": 9.997522167649003e-06, "loss": 0.0012, "step": 2350, "step_time": 2.4871529109950643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 147.5625, "completions/mean_terminated_length": 147.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.726403621956706, "epoch": 0.02351, "frac_reward_zero_std": 0.5, "grad_norm": 0.46705392003059387, "kl": 0.6643704213202, "learning_rate": 9.997520025821391e-06, "loss": -0.0496, "num_tokens": 20145874.0, "reward": 0.7053509950637817, "reward_std": 0.11410611122846603, "rewards/rollout_reward_func/mean": 0.7053509950637817, "rewards/rollout_reward_func/std": 0.2478494644165039, "sampling/importance_sampling_ratio/max": 1.0217074155807495, "sampling/importance_sampling_ratio/mean": 0.8755995631217957, "sampling/importance_sampling_ratio/min": 0.00011908374290214851, "sampling/sampling_logp_difference/max": 3.4132959842681885, "sampling/sampling_logp_difference/mean": 0.1538187861442566, "step": 2351, "step_time": 4.594403721006529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.732426231727004, "epoch": 0.02352, "grad_norm": 0.013399538584053516, "kl": 0.7268319167196751, "learning_rate": 9.997517883068792e-06, "loss": -0.0503, "step": 2352, "step_time": 2.0584884110066923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.33422419521957636, "epoch": 0.02353, "frac_reward_zero_std": 0.5, "grad_norm": 0.014112576842308044, "kl": 0.6318833120167255, "learning_rate": 9.997515739391206e-06, "loss": -0.0432, "num_tokens": 20161440.0, "reward": 0.7495672702789307, "reward_std": 0.04691381752490997, "rewards/rollout_reward_func/mean": 0.7495672702789307, "rewards/rollout_reward_func/std": 0.26524442434310913, "sampling/importance_sampling_ratio/max": 1.0121638774871826, "sampling/importance_sampling_ratio/mean": 0.9369299411773682, "sampling/importance_sampling_ratio/min": 0.011084302328526974, "sampling/sampling_logp_difference/max": 2.4526166915893555, "sampling/sampling_logp_difference/mean": 0.049658872187137604, "step": 2353, "step_time": 4.139995947996795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3342308793216944, "epoch": 0.02354, "grad_norm": 0.014488092623651028, "kl": 0.6311188153922558, "learning_rate": 9.997513594788635e-06, "loss": -0.0432, "step": 2354, "step_time": 2.025310912023997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05554803553968668, "epoch": 0.02355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004924540990032256, "kl": 0.4465085342526436, "learning_rate": 9.997511449261078e-06, "loss": 0.0015, "num_tokens": 20177872.0, "reward": 0.7851153612136841, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7851153612136841, "rewards/rollout_reward_func/std": 0.14875726401805878, "sampling/importance_sampling_ratio/max": 0.9977946281433105, "sampling/importance_sampling_ratio/mean": 0.994469940662384, "sampling/importance_sampling_ratio/min": 0.991622805595398, "sampling/sampling_logp_difference/max": 0.006223972886800766, "sampling/sampling_logp_difference/mean": 0.001134388381615281, "step": 2355, "step_time": 4.251427692994184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05511229997500777, "epoch": 0.02356, "grad_norm": 0.00048470136243849993, "kl": 0.44656267389655113, "learning_rate": 9.997509302808534e-06, "loss": 0.0015, "step": 2356, "step_time": 2.9537599069954013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4675262700766325, "epoch": 0.02357, "frac_reward_zero_std": 0.75, "grad_norm": 0.018294604495167732, "kl": 0.5925301611423492, "learning_rate": 9.997507155431008e-06, "loss": 0.0216, "num_tokens": 20192460.0, "reward": 0.49379807710647583, "reward_std": 0.019555971026420593, "rewards/rollout_reward_func/mean": 0.49379807710647583, "rewards/rollout_reward_func/std": 0.18609148263931274, "sampling/importance_sampling_ratio/max": 1.000550627708435, "sampling/importance_sampling_ratio/mean": 0.9324221611022949, "sampling/importance_sampling_ratio/min": 0.004620977211743593, "sampling/sampling_logp_difference/max": 2.16641902923584, "sampling/sampling_logp_difference/mean": 0.06866501271724701, "step": 2357, "step_time": 3.8918227320100414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4670018870383501, "epoch": 0.02358, "grad_norm": 0.01834985241293907, "kl": 0.5928215868771076, "learning_rate": 9.997505007128496e-06, "loss": 0.0216, "step": 2358, "step_time": 2.0073845079969033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.0792267108336091, "epoch": 0.02359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007192445918917656, "kl": 0.47673672065138817, "learning_rate": 9.997502857901003e-06, "loss": 0.0015, "num_tokens": 20208556.0, "reward": 0.5381923317909241, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5381923317909241, "rewards/rollout_reward_func/std": 0.13580842316150665, "sampling/importance_sampling_ratio/max": 0.9965464472770691, "sampling/importance_sampling_ratio/mean": 0.9927308559417725, "sampling/importance_sampling_ratio/min": 0.9875736236572266, "sampling/sampling_logp_difference/max": 0.007062483578920364, "sampling/sampling_logp_difference/mean": 0.0015280961524695158, "step": 2359, "step_time": 4.200656696986698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08017047867178917, "epoch": 0.0236, "grad_norm": 0.0007161074317991734, "kl": 0.47656093537807465, "learning_rate": 9.997500707748524e-06, "loss": 0.0015, "step": 2360, "step_time": 1.9425899509878946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.275398803409189, "epoch": 0.02361, "frac_reward_zero_std": 1.0, "grad_norm": 0.005747882183641195, "kl": 0.45478053763508797, "learning_rate": 9.997498556671064e-06, "loss": 0.0016, "num_tokens": 20224628.0, "reward": 0.7907692193984985, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7907692193984985, "rewards/rollout_reward_func/std": 0.22849565744400024, "sampling/importance_sampling_ratio/max": 1.0003234148025513, "sampling/importance_sampling_ratio/mean": 0.9635214805603027, "sampling/importance_sampling_ratio/min": 0.002117188647389412, "sampling/sampling_logp_difference/max": 1.9281177520751953, "sampling/sampling_logp_difference/mean": 0.03502059355378151, "step": 2361, "step_time": 4.181581047996588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2754764137789607, "epoch": 0.02362, "grad_norm": 0.005777113139629364, "kl": 0.4545242711901665, "learning_rate": 9.997496404668621e-06, "loss": 0.0016, "step": 2362, "step_time": 2.3796716119977646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 142.90625, "completions/mean_terminated_length": 142.90625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22188233025372028, "epoch": 0.02363, "frac_reward_zero_std": 0.75, "grad_norm": 0.019924746826291084, "kl": 0.612891148775816, "learning_rate": 9.9974942517412e-06, "loss": -0.0267, "num_tokens": 20239969.0, "reward": 0.5779327154159546, "reward_std": 0.009110798127949238, "rewards/rollout_reward_func/mean": 0.5779327154159546, "rewards/rollout_reward_func/std": 0.22948245704174042, "sampling/importance_sampling_ratio/max": 0.9970713257789612, "sampling/importance_sampling_ratio/mean": 0.96319580078125, "sampling/importance_sampling_ratio/min": 0.0006493974942713976, "sampling/sampling_logp_difference/max": 3.0312955379486084, "sampling/sampling_logp_difference/mean": 0.04710531234741211, "step": 2363, "step_time": 4.41703903501184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22232329286634922, "epoch": 0.02364, "grad_norm": 0.019716566428542137, "kl": 0.6146135441958904, "learning_rate": 9.997492097888794e-06, "loss": -0.0267, "step": 2364, "step_time": 2.013602892999188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.08127028215676546, "epoch": 0.02365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006770756444893777, "kl": 0.45323115587234497, "learning_rate": 9.99748994311141e-06, "loss": 0.0013, "num_tokens": 20254657.0, "reward": 0.6784615516662598, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6784615516662598, "rewards/rollout_reward_func/std": 0.287871778011322, "sampling/importance_sampling_ratio/max": 1.0017077922821045, "sampling/importance_sampling_ratio/mean": 0.9950875043869019, "sampling/importance_sampling_ratio/min": 0.9897209405899048, "sampling/sampling_logp_difference/max": 0.005350034683942795, "sampling/sampling_logp_difference/mean": 0.0012732651084661484, "step": 2365, "step_time": 3.9106314959863084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08202423248440027, "epoch": 0.02366, "grad_norm": 0.0006813382497057319, "kl": 0.45308566093444824, "learning_rate": 9.997487787409046e-06, "loss": 0.0013, "step": 2366, "step_time": 2.0092421240042313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.1875, "completions/mean_terminated_length": 143.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4859707006253302, "epoch": 0.02367, "frac_reward_zero_std": 0.5, "grad_norm": 0.032278236001729965, "kl": 0.6354795023798943, "learning_rate": 9.997485630781703e-06, "loss": 0.0481, "num_tokens": 20269951.0, "reward": 0.777423083782196, "reward_std": 0.020669270306825638, "rewards/rollout_reward_func/mean": 0.777423083782196, "rewards/rollout_reward_func/std": 0.08229617774486542, "sampling/importance_sampling_ratio/max": 0.9955349564552307, "sampling/importance_sampling_ratio/mean": 0.9284939169883728, "sampling/importance_sampling_ratio/min": 0.002540762070566416, "sampling/sampling_logp_difference/max": 1.9009385108947754, "sampling/sampling_logp_difference/mean": 0.07928185909986496, "step": 2367, "step_time": 4.288059391001298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48845451790839434, "epoch": 0.02368, "grad_norm": 0.03299374505877495, "kl": 0.6275613680481911, "learning_rate": 9.997483473229378e-06, "loss": 0.048, "step": 2368, "step_time": 2.4958736800035695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.6757230823859572, "epoch": 0.02369, "frac_reward_zero_std": 0.5, "grad_norm": 0.13616201281547546, "kl": 0.5626898892223835, "learning_rate": 9.997481314752078e-06, "loss": -0.0468, "num_tokens": 20286861.0, "reward": 0.6047307848930359, "reward_std": 0.12401565164327621, "rewards/rollout_reward_func/mean": 0.6047307848930359, "rewards/rollout_reward_func/std": 0.2460985779762268, "sampling/importance_sampling_ratio/max": 1.6735212802886963, "sampling/importance_sampling_ratio/mean": 0.9389925003051758, "sampling/importance_sampling_ratio/min": 1.9533330508636482e-10, "sampling/sampling_logp_difference/max": 11.036393165588379, "sampling/sampling_logp_difference/mean": 0.18874619901180267, "step": 2369, "step_time": 4.486887007013138 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010890151839703321, "entropy": 0.6753491768613458, "epoch": 0.0237, "grad_norm": 0.04365644231438637, "kl": 0.5579603388905525, "learning_rate": 9.997479155349801e-06, "loss": -0.0468, "step": 2370, "step_time": 2.0272801339961006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 215.03125, "completions/mean_terminated_length": 215.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23654145747423172, "epoch": 0.02371, "frac_reward_zero_std": 0.75, "grad_norm": 0.009175210259854794, "kl": 0.4853561297059059, "learning_rate": 9.997476995022546e-06, "loss": -0.0261, "num_tokens": 20304510.0, "reward": 0.8838365077972412, "reward_std": 0.002311693038791418, "rewards/rollout_reward_func/mean": 0.8838365077972412, "rewards/rollout_reward_func/std": 0.3352806270122528, "sampling/importance_sampling_ratio/max": 1.0029513835906982, "sampling/importance_sampling_ratio/mean": 0.9608745574951172, "sampling/importance_sampling_ratio/min": 0.01656499132514, "sampling/sampling_logp_difference/max": 1.7847514152526855, "sampling/sampling_logp_difference/mean": 0.02461174502968788, "step": 2371, "step_time": 4.414284578000661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2359946183860302, "epoch": 0.02372, "grad_norm": 0.009614376351237297, "kl": 0.48463794216513634, "learning_rate": 9.997474833770313e-06, "loss": -0.026, "step": 2372, "step_time": 2.0497401140019065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 152.1875, "completions/mean_terminated_length": 152.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5192405749112368, "epoch": 0.02373, "frac_reward_zero_std": 0.5, "grad_norm": 0.06626840680837631, "kl": 0.4803658425807953, "learning_rate": 9.997472671593105e-06, "loss": -0.0321, "num_tokens": 20320228.0, "reward": 0.9560480117797852, "reward_std": 0.08872198313474655, "rewards/rollout_reward_func/mean": 0.9560480117797852, "rewards/rollout_reward_func/std": 0.13912338018417358, "sampling/importance_sampling_ratio/max": 1.001533031463623, "sampling/importance_sampling_ratio/mean": 0.9032830595970154, "sampling/importance_sampling_ratio/min": 0.0033369732555001974, "sampling/sampling_logp_difference/max": 1.7002315521240234, "sampling/sampling_logp_difference/mean": 0.06541333347558975, "step": 2373, "step_time": 4.714467110010446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5133892884477973, "epoch": 0.02374, "grad_norm": 0.07561777532100677, "kl": 0.4836718253791332, "learning_rate": 9.99747050849092e-06, "loss": -0.0321, "step": 2374, "step_time": 2.5205560100002913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 125.875, "completions/mean_terminated_length": 125.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4293759623542428, "epoch": 0.02375, "frac_reward_zero_std": 0.5, "grad_norm": 0.039467498660087585, "kl": 0.7238833010196686, "learning_rate": 9.997468344463764e-06, "loss": -0.0425, "num_tokens": 20335024.0, "reward": 0.6710947155952454, "reward_std": 0.0567358024418354, "rewards/rollout_reward_func/mean": 0.6710947155952454, "rewards/rollout_reward_func/std": 0.10634852945804596, "sampling/importance_sampling_ratio/max": 1.0010813474655151, "sampling/importance_sampling_ratio/mean": 0.9310053586959839, "sampling/importance_sampling_ratio/min": 2.3067475751759048e-07, "sampling/sampling_logp_difference/max": 3.907705307006836, "sampling/sampling_logp_difference/mean": 0.10278438031673431, "step": 2375, "step_time": 4.125093434995506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42587417736649513, "epoch": 0.02376, "grad_norm": 0.03840307518839836, "kl": 0.730064831674099, "learning_rate": 9.99746617951163e-06, "loss": -0.0424, "step": 2376, "step_time": 2.037581677985145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4510304294526577, "epoch": 0.02377, "frac_reward_zero_std": 0.75, "grad_norm": 0.006215624511241913, "kl": 0.4197700023651123, "learning_rate": 9.997464013634524e-06, "loss": -0.0274, "num_tokens": 20350316.0, "reward": 0.7630432844161987, "reward_std": 0.045785173773765564, "rewards/rollout_reward_func/mean": 0.7630432844161987, "rewards/rollout_reward_func/std": 0.3084796965122223, "sampling/importance_sampling_ratio/max": 0.9986097812652588, "sampling/importance_sampling_ratio/mean": 0.960906982421875, "sampling/importance_sampling_ratio/min": 1.7135834309671416e-13, "sampling/sampling_logp_difference/max": 3.9402873516082764, "sampling/sampling_logp_difference/mean": 0.1545991748571396, "step": 2377, "step_time": 4.1748241759996745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44840659387409687, "epoch": 0.02378, "grad_norm": 0.006026165094226599, "kl": 0.4208827503025532, "learning_rate": 9.997461846832443e-06, "loss": -0.0274, "step": 2378, "step_time": 2.0532524599911994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.07345965085551143, "epoch": 0.02379, "frac_reward_zero_std": 1.0, "grad_norm": 0.000609598180744797, "kl": 0.41831015050411224, "learning_rate": 9.99745967910539e-06, "loss": 0.0013, "num_tokens": 20365716.0, "reward": 0.8125, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8125, "rewards/rollout_reward_func/std": 0.2932544946670532, "sampling/importance_sampling_ratio/max": 0.995994508266449, "sampling/importance_sampling_ratio/mean": 0.992980420589447, "sampling/importance_sampling_ratio/min": 0.987667441368103, "sampling/sampling_logp_difference/max": 0.01052650436758995, "sampling/sampling_logp_difference/mean": 0.0014484103303402662, "step": 2379, "step_time": 4.314443309005583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07264338806271553, "epoch": 0.0238, "grad_norm": 0.000599229650106281, "kl": 0.4184878468513489, "learning_rate": 9.997457510453365e-06, "loss": 0.0013, "step": 2380, "step_time": 2.4594040590163786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07781513873487711, "epoch": 0.02381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006805354496464133, "kl": 0.39118507876992226, "learning_rate": 9.997455340876367e-06, "loss": 0.0012, "num_tokens": 20380980.0, "reward": 0.7438461780548096, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7438461780548096, "rewards/rollout_reward_func/std": 0.28455865383148193, "sampling/importance_sampling_ratio/max": 0.9999122023582458, "sampling/importance_sampling_ratio/mean": 0.9932071566581726, "sampling/importance_sampling_ratio/min": 0.9876561164855957, "sampling/sampling_logp_difference/max": 0.0063642896711826324, "sampling/sampling_logp_difference/mean": 0.001479293336160481, "step": 2381, "step_time": 3.9333928120104247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07675773650407791, "epoch": 0.02382, "grad_norm": 0.0006646909168921411, "kl": 0.3913968913257122, "learning_rate": 9.997453170374398e-06, "loss": 0.0012, "step": 2382, "step_time": 1.9971359349874547 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.84375, "completions/mean_terminated_length": 167.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6492441594600677, "epoch": 0.02383, "frac_reward_zero_std": 0.5, "grad_norm": 0.33529964089393616, "kl": 0.4299086928367615, "learning_rate": 9.99745099894746e-06, "loss": -0.0691, "num_tokens": 20397087.0, "reward": 0.7085624933242798, "reward_std": 0.03811577335000038, "rewards/rollout_reward_func/mean": 0.7085624933242798, "rewards/rollout_reward_func/std": 0.15851183235645294, "sampling/importance_sampling_ratio/max": 1.5506500005722046, "sampling/importance_sampling_ratio/mean": 0.962486982345581, "sampling/importance_sampling_ratio/min": 5.938063066837597e-16, "sampling/sampling_logp_difference/max": 3.1091885566711426, "sampling/sampling_logp_difference/mean": 0.20981526374816895, "step": 2383, "step_time": 4.260592008002277 }, { "clip_ratio/high_max": 0.04791666753590107, "clip_ratio/high_mean": 0.023958333767950535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023958333767950535, "entropy": 0.647800178732723, "epoch": 0.02384, "grad_norm": 0.15892527997493744, "kl": 0.4302844926714897, "learning_rate": 9.997448826595548e-06, "loss": -0.0687, "step": 2384, "step_time": 2.028314176990534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23227400053292513, "epoch": 0.02385, "frac_reward_zero_std": 1.0, "grad_norm": 0.004639137536287308, "kl": 0.412823598831892, "learning_rate": 9.99744665331867e-06, "loss": 0.0015, "num_tokens": 20414051.0, "reward": 0.9095385074615479, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9095385074615479, "rewards/rollout_reward_func/std": 0.3408235013484955, "sampling/importance_sampling_ratio/max": 1.0006327629089355, "sampling/importance_sampling_ratio/mean": 0.962306022644043, "sampling/importance_sampling_ratio/min": 0.014878706075251102, "sampling/sampling_logp_difference/max": 1.2995587587356567, "sampling/sampling_logp_difference/mean": 0.02116185612976551, "step": 2385, "step_time": 4.836958982996293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23010702338069677, "epoch": 0.02386, "grad_norm": 0.004643588792532682, "kl": 0.4130753017961979, "learning_rate": 9.997444479116823e-06, "loss": 0.0015, "step": 2386, "step_time": 2.460656135008321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5632699839770794, "epoch": 0.02387, "frac_reward_zero_std": 0.5, "grad_norm": 0.016321450471878052, "kl": 0.42028170451521873, "learning_rate": 9.997442303990005e-06, "loss": -0.0458, "num_tokens": 20430485.0, "reward": 0.8359212875366211, "reward_std": 0.03253195062279701, "rewards/rollout_reward_func/mean": 0.8359212875366211, "rewards/rollout_reward_func/std": 0.29363879561424255, "sampling/importance_sampling_ratio/max": 0.9991676211357117, "sampling/importance_sampling_ratio/mean": 0.9324229955673218, "sampling/importance_sampling_ratio/min": 3.9202864045295716e-12, "sampling/sampling_logp_difference/max": 3.305567741394043, "sampling/sampling_logp_difference/mean": 0.16277089715003967, "step": 2387, "step_time": 4.249773809002363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5579129089601338, "epoch": 0.02388, "grad_norm": 0.016140630468726158, "kl": 0.42273835092782974, "learning_rate": 9.997440127938221e-06, "loss": -0.0458, "step": 2388, "step_time": 2.060744030000933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 143.53125, "completions/mean_terminated_length": 143.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.32769658137112856, "epoch": 0.02389, "frac_reward_zero_std": 0.5, "grad_norm": 0.04059232771396637, "kl": 0.6055639088153839, "learning_rate": 9.997437950961467e-06, "loss": -0.0001, "num_tokens": 20445758.0, "reward": 0.7635576725006104, "reward_std": 0.05983210727572441, "rewards/rollout_reward_func/mean": 0.7635576725006104, "rewards/rollout_reward_func/std": 0.26852285861968994, "sampling/importance_sampling_ratio/max": 0.9962393045425415, "sampling/importance_sampling_ratio/mean": 0.9366141557693481, "sampling/importance_sampling_ratio/min": 0.03705442696809769, "sampling/sampling_logp_difference/max": 1.3253657817840576, "sampling/sampling_logp_difference/mean": 0.035167377442121506, "step": 2389, "step_time": 3.9782809980024467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3230703976005316, "epoch": 0.0239, "grad_norm": 0.040304798632860184, "kl": 0.6037769019603729, "learning_rate": 9.997435773059748e-06, "loss": -0.0001, "step": 2390, "step_time": 2.0205486529885093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.30335464235395193, "epoch": 0.02391, "frac_reward_zero_std": 0.75, "grad_norm": 0.011495903134346008, "kl": 0.5022767372429371, "learning_rate": 9.997433594233063e-06, "loss": -0.0173, "num_tokens": 20462214.0, "reward": 0.7391424775123596, "reward_std": 0.04082675278186798, "rewards/rollout_reward_func/mean": 0.7391424775123596, "rewards/rollout_reward_func/std": 0.1991264820098877, "sampling/importance_sampling_ratio/max": 1.0764672756195068, "sampling/importance_sampling_ratio/mean": 0.9681991338729858, "sampling/importance_sampling_ratio/min": 0.0035104569979012012, "sampling/sampling_logp_difference/max": 1.2631099224090576, "sampling/sampling_logp_difference/mean": 0.032162800431251526, "step": 2391, "step_time": 4.852085017009813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2945153368636966, "epoch": 0.02392, "grad_norm": 0.012067921459674835, "kl": 0.5053924284875393, "learning_rate": 9.99743141448141e-06, "loss": -0.0172, "step": 2392, "step_time": 2.500497806991916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.43390887789428234, "epoch": 0.02393, "frac_reward_zero_std": 0.5, "grad_norm": 0.0123716676607728, "kl": 0.4987242817878723, "learning_rate": 9.997429233804795e-06, "loss": -0.0458, "num_tokens": 20477988.0, "reward": 0.5633653402328491, "reward_std": 0.0176776722073555, "rewards/rollout_reward_func/mean": 0.5633653402328491, "rewards/rollout_reward_func/std": 0.16998496651649475, "sampling/importance_sampling_ratio/max": 0.9994559288024902, "sampling/importance_sampling_ratio/mean": 0.932836651802063, "sampling/importance_sampling_ratio/min": 0.001833918271586299, "sampling/sampling_logp_difference/max": 2.281196355819702, "sampling/sampling_logp_difference/mean": 0.06625412404537201, "step": 2393, "step_time": 4.104172180988826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42690057028084993, "epoch": 0.02394, "grad_norm": 0.012502548284828663, "kl": 0.49861909821629524, "learning_rate": 9.997427052203212e-06, "loss": -0.0458, "step": 2394, "step_time": 2.0327826219945564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.391151268966496, "epoch": 0.02395, "frac_reward_zero_std": 0.5, "grad_norm": 0.02366417832672596, "kl": 0.46081025153398514, "learning_rate": 9.997424869676666e-06, "loss": -0.0192, "num_tokens": 20493339.0, "reward": 0.5856778621673584, "reward_std": 0.05334576591849327, "rewards/rollout_reward_func/mean": 0.5856778621673584, "rewards/rollout_reward_func/std": 0.1784723997116089, "sampling/importance_sampling_ratio/max": 0.9996769428253174, "sampling/importance_sampling_ratio/mean": 0.9335945844650269, "sampling/importance_sampling_ratio/min": 2.2747935872757807e-05, "sampling/sampling_logp_difference/max": 2.7348344326019287, "sampling/sampling_logp_difference/mean": 0.08099126815795898, "step": 2395, "step_time": 4.150551209990226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38785285223275423, "epoch": 0.02396, "grad_norm": 0.02420957013964653, "kl": 0.45980092510581017, "learning_rate": 9.997422686225155e-06, "loss": -0.0192, "step": 2396, "step_time": 2.498681190001662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2489040605723858, "epoch": 0.02397, "frac_reward_zero_std": 0.75, "grad_norm": 0.10548107326030731, "kl": 0.4352954775094986, "learning_rate": 9.997420501848683e-06, "loss": -0.0361, "num_tokens": 20509723.0, "reward": 0.6210514307022095, "reward_std": 0.01910955272614956, "rewards/rollout_reward_func/mean": 0.6210514307022095, "rewards/rollout_reward_func/std": 0.23825035989284515, "sampling/importance_sampling_ratio/max": 1.0922129154205322, "sampling/importance_sampling_ratio/mean": 0.9584938883781433, "sampling/importance_sampling_ratio/min": 0.0009274158510379493, "sampling/sampling_logp_difference/max": 2.016787052154541, "sampling/sampling_logp_difference/mean": 0.041594281792640686, "step": 2397, "step_time": 4.878701197994815 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.24659270700067282, "epoch": 0.02398, "grad_norm": 0.05911314859986305, "kl": 0.43354184925556183, "learning_rate": 9.997418316547247e-06, "loss": -0.0361, "step": 2398, "step_time": 2.0483479340036865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.08896175120025873, "epoch": 0.02399, "frac_reward_zero_std": 1.0, "grad_norm": 0.006595119833946228, "kl": 0.4194040820002556, "learning_rate": 9.99741613032085e-06, "loss": 0.0014, "num_tokens": 20525955.0, "reward": 0.9170384407043457, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9170384407043457, "rewards/rollout_reward_func/std": 0.26986250281333923, "sampling/importance_sampling_ratio/max": 1.1063283681869507, "sampling/importance_sampling_ratio/mean": 1.0155609846115112, "sampling/importance_sampling_ratio/min": 0.990725576877594, "sampling/sampling_logp_difference/max": 0.10094240307807922, "sampling/sampling_logp_difference/mean": 0.003926432691514492, "step": 2399, "step_time": 4.260354488011217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08898659888654947, "epoch": 0.024, "grad_norm": 0.00755518302321434, "kl": 0.418862659484148, "learning_rate": 9.99741394316949e-06, "loss": 0.0014, "step": 2400, "step_time": 1.9901042899946333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 146.21875, "completions/mean_terminated_length": 146.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2549576358869672, "epoch": 0.02401, "frac_reward_zero_std": 0.75, "grad_norm": 0.016290798783302307, "kl": 0.4396720752120018, "learning_rate": 9.99741175509317e-06, "loss": -0.017, "num_tokens": 20541314.0, "reward": 0.7172307968139648, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.7172307968139648, "rewards/rollout_reward_func/std": 0.37097397446632385, "sampling/importance_sampling_ratio/max": 0.9990646839141846, "sampling/importance_sampling_ratio/mean": 0.9644794464111328, "sampling/importance_sampling_ratio/min": 0.027404310181736946, "sampling/sampling_logp_difference/max": 1.0852503776550293, "sampling/sampling_logp_difference/mean": 0.019509125500917435, "step": 2401, "step_time": 4.112391287009814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.252809502184391, "epoch": 0.02402, "grad_norm": 0.017221735790371895, "kl": 0.43581414595246315, "learning_rate": 9.99740956609189e-06, "loss": -0.017, "step": 2402, "step_time": 2.4241716869946686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.90625, "completions/mean_terminated_length": 165.90625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.7091800589114428, "epoch": 0.02403, "frac_reward_zero_std": 0.25, "grad_norm": 0.07019945979118347, "kl": 0.5757781863212585, "learning_rate": 9.997407376165649e-06, "loss": -0.0689, "num_tokens": 20557247.0, "reward": 0.8323606252670288, "reward_std": 0.06327245384454727, "rewards/rollout_reward_func/mean": 0.8323606252670288, "rewards/rollout_reward_func/std": 0.22218482196331024, "sampling/importance_sampling_ratio/max": 0.9984073042869568, "sampling/importance_sampling_ratio/mean": 0.908586859703064, "sampling/importance_sampling_ratio/min": 9.377161373780307e-12, "sampling/sampling_logp_difference/max": 16.044639587402344, "sampling/sampling_logp_difference/mean": 0.21674948930740356, "step": 2403, "step_time": 4.618781256998773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7114963955245912, "epoch": 0.02404, "grad_norm": 0.06641201674938202, "kl": 0.5804972127079964, "learning_rate": 9.997405185314448e-06, "loss": -0.0692, "step": 2404, "step_time": 2.0182500340088154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.08502837549895048, "epoch": 0.02405, "frac_reward_zero_std": 1.0, "grad_norm": 0.001528485445305705, "kl": 0.42833420634269714, "learning_rate": 9.99740299353829e-06, "loss": 0.0015, "num_tokens": 20573407.0, "reward": 0.805961549282074, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.805961549282074, "rewards/rollout_reward_func/std": 0.2227320820093155, "sampling/importance_sampling_ratio/max": 1.1140121221542358, "sampling/importance_sampling_ratio/mean": 0.9957656860351562, "sampling/importance_sampling_ratio/min": 0.6544938683509827, "sampling/sampling_logp_difference/max": 0.4079822301864624, "sampling/sampling_logp_difference/mean": 0.0076540010049939156, "step": 2405, "step_time": 3.9694375219987705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08510779589414597, "epoch": 0.02406, "grad_norm": 0.00253856647759676, "kl": 0.4275941513478756, "learning_rate": 9.997400800837172e-06, "loss": 0.0015, "step": 2406, "step_time": 1.957194762013387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2096176859922707, "epoch": 0.02407, "frac_reward_zero_std": 0.75, "grad_norm": 0.00824599340558052, "kl": 0.5220332108438015, "learning_rate": 9.997398607211096e-06, "loss": -0.0168, "num_tokens": 20589245.0, "reward": 0.7017451524734497, "reward_std": 0.02878740429878235, "rewards/rollout_reward_func/mean": 0.7017451524734497, "rewards/rollout_reward_func/std": 0.15201161801815033, "sampling/importance_sampling_ratio/max": 0.997255802154541, "sampling/importance_sampling_ratio/mean": 0.9646238684654236, "sampling/importance_sampling_ratio/min": 0.02412075735628605, "sampling/sampling_logp_difference/max": 1.4411396980285645, "sampling/sampling_logp_difference/mean": 0.03108224645256996, "step": 2407, "step_time": 4.142185369011713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21055992739275098, "epoch": 0.02408, "grad_norm": 0.0066102310083806515, "kl": 0.5306543745100498, "learning_rate": 9.997396412660063e-06, "loss": -0.0168, "step": 2408, "step_time": 2.520157613005722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08182189799845219, "epoch": 0.02409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005803655949421227, "kl": 0.6404589638113976, "learning_rate": 9.997394217184075e-06, "loss": 0.0017, "num_tokens": 20603845.0, "reward": 0.6635769605636597, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6635769605636597, "rewards/rollout_reward_func/std": 0.12448589503765106, "sampling/importance_sampling_ratio/max": 0.9993833899497986, "sampling/importance_sampling_ratio/mean": 0.9944977164268494, "sampling/importance_sampling_ratio/min": 0.9894793629646301, "sampling/sampling_logp_difference/max": 0.007290635257959366, "sampling/sampling_logp_difference/mean": 0.0013698757393285632, "step": 2409, "step_time": 4.404662892004126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0788237452507019, "epoch": 0.0241, "grad_norm": 0.0005526724271476269, "kl": 0.6410273388028145, "learning_rate": 9.997392020783129e-06, "loss": 0.0017, "step": 2410, "step_time": 2.0155982759897597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2784669501706958, "epoch": 0.02411, "frac_reward_zero_std": 0.75, "grad_norm": 0.00907712709158659, "kl": 0.6254278793931007, "learning_rate": 9.997389823457227e-06, "loss": -0.0077, "num_tokens": 20619673.0, "reward": 0.8151634335517883, "reward_std": 0.0673111230134964, "rewards/rollout_reward_func/mean": 0.8151634335517883, "rewards/rollout_reward_func/std": 0.3606289327144623, "sampling/importance_sampling_ratio/max": 0.9981011152267456, "sampling/importance_sampling_ratio/mean": 0.9626777768135071, "sampling/importance_sampling_ratio/min": 0.0018464686581864953, "sampling/sampling_logp_difference/max": 1.697662353515625, "sampling/sampling_logp_difference/mean": 0.05014026537537575, "step": 2411, "step_time": 4.221698545989057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2791421702131629, "epoch": 0.02412, "grad_norm": 0.009414811618626118, "kl": 0.6302061602473259, "learning_rate": 9.997387625206372e-06, "loss": -0.0077, "step": 2412, "step_time": 2.0340499480007566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5929355225525796, "epoch": 0.02413, "frac_reward_zero_std": 0.75, "grad_norm": 0.015365952625870705, "kl": 0.6004365049302578, "learning_rate": 9.997385426030561e-06, "loss": -0.0269, "num_tokens": 20635347.0, "reward": 0.35432690382003784, "reward_std": 0.1280951052904129, "rewards/rollout_reward_func/mean": 0.35432690382003784, "rewards/rollout_reward_func/std": 0.3293328881263733, "sampling/importance_sampling_ratio/max": 0.9993361830711365, "sampling/importance_sampling_ratio/mean": 0.9340699315071106, "sampling/importance_sampling_ratio/min": 3.159548214171082e-05, "sampling/sampling_logp_difference/max": 2.3012900352478027, "sampling/sampling_logp_difference/mean": 0.0988808125257492, "step": 2413, "step_time": 3.991976036981214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5939208627678454, "epoch": 0.02414, "grad_norm": 0.015329848974943161, "kl": 0.6030677631497383, "learning_rate": 9.997383225929795e-06, "loss": -0.0269, "step": 2414, "step_time": 2.481808625976555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.055438471492379904, "epoch": 0.02415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004158366355113685, "kl": 0.501843798905611, "learning_rate": 9.997381024904078e-06, "loss": 0.0016, "num_tokens": 20652155.0, "reward": 1.0763846635818481, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0763846635818481, "rewards/rollout_reward_func/std": 0.36249086260795593, "sampling/importance_sampling_ratio/max": 1.000565528869629, "sampling/importance_sampling_ratio/mean": 0.9951267242431641, "sampling/importance_sampling_ratio/min": 0.9884622693061829, "sampling/sampling_logp_difference/max": 0.0096528809517622, "sampling/sampling_logp_difference/mean": 0.0011570954229682684, "step": 2415, "step_time": 4.875355070013029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05567788705229759, "epoch": 0.02416, "grad_norm": 0.00041745410999283195, "kl": 0.5018283277750015, "learning_rate": 9.997378822953408e-06, "loss": 0.0016, "step": 2416, "step_time": 1.9697546530005638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2816002178005874, "epoch": 0.02417, "frac_reward_zero_std": 0.5, "grad_norm": 0.04290197044610977, "kl": 0.719272043555975, "learning_rate": 9.997376620077784e-06, "loss": 0.0101, "num_tokens": 20667723.0, "reward": 0.6071346402168274, "reward_std": 0.023307327181100845, "rewards/rollout_reward_func/mean": 0.6071346402168274, "rewards/rollout_reward_func/std": 0.21410980820655823, "sampling/importance_sampling_ratio/max": 1.0151081085205078, "sampling/importance_sampling_ratio/mean": 0.9405163526535034, "sampling/importance_sampling_ratio/min": 0.00786507222801447, "sampling/sampling_logp_difference/max": 2.426320791244507, "sampling/sampling_logp_difference/mean": 0.047153569757938385, "step": 2417, "step_time": 3.9361199289924116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28197744861245155, "epoch": 0.02418, "grad_norm": 0.04121508076786995, "kl": 0.6946723759174347, "learning_rate": 9.997374416277207e-06, "loss": 0.01, "step": 2418, "step_time": 1.9951842209993629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 119.03125, "completions/mean_terminated_length": 119.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.39569958904758096, "epoch": 0.02419, "frac_reward_zero_std": 0.5, "grad_norm": 0.030761951580643654, "kl": 0.6418274715542793, "learning_rate": 9.997372211551683e-06, "loss": -0.0436, "num_tokens": 20682356.0, "reward": 0.765625, "reward_std": 0.036307211965322495, "rewards/rollout_reward_func/mean": 0.765625, "rewards/rollout_reward_func/std": 0.24048517644405365, "sampling/importance_sampling_ratio/max": 0.9984559416770935, "sampling/importance_sampling_ratio/mean": 0.9390733242034912, "sampling/importance_sampling_ratio/min": 0.018756505101919174, "sampling/sampling_logp_difference/max": 1.8689756393432617, "sampling/sampling_logp_difference/mean": 0.04568806290626526, "step": 2419, "step_time": 3.7941782609996153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39421561593189836, "epoch": 0.0242, "grad_norm": 0.030573248863220215, "kl": 0.6293397769331932, "learning_rate": 9.997370005901205e-06, "loss": -0.0436, "step": 2420, "step_time": 2.8931831199952285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.046956032048910856, "epoch": 0.02421, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046051276149228215, "kl": 0.43081308528780937, "learning_rate": 9.997367799325778e-06, "loss": 0.0018, "num_tokens": 20699740.0, "reward": 0.7286922931671143, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7286922931671143, "rewards/rollout_reward_func/std": 0.16791291534900665, "sampling/importance_sampling_ratio/max": 0.9996544718742371, "sampling/importance_sampling_ratio/mean": 0.9952706098556519, "sampling/importance_sampling_ratio/min": 0.9917266964912415, "sampling/sampling_logp_difference/max": 0.005707751959562302, "sampling/sampling_logp_difference/mean": 0.0008077258244156837, "step": 2421, "step_time": 4.279286564000358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.046991942916065454, "epoch": 0.02422, "grad_norm": 0.00046211780863814056, "kl": 0.4308018274605274, "learning_rate": 9.997365591825401e-06, "loss": 0.0018, "step": 2422, "step_time": 2.0109388170021703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 122.40625, "completions/mean_terminated_length": 122.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2940068254247308, "epoch": 0.02423, "frac_reward_zero_std": 0.75, "grad_norm": 0.0035814999137073755, "kl": 0.6502028778195381, "learning_rate": 9.997363383400076e-06, "loss": -0.0271, "num_tokens": 20714513.0, "reward": 0.974057674407959, "reward_std": 0.038346949964761734, "rewards/rollout_reward_func/mean": 0.974057674407959, "rewards/rollout_reward_func/std": 0.2580467760562897, "sampling/importance_sampling_ratio/max": 0.9983152151107788, "sampling/importance_sampling_ratio/mean": 0.9634942412376404, "sampling/importance_sampling_ratio/min": 0.00036428746534511447, "sampling/sampling_logp_difference/max": 2.3534491062164307, "sampling/sampling_logp_difference/mean": 0.04457659274339676, "step": 2423, "step_time": 4.047508660994936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2969197449274361, "epoch": 0.02424, "grad_norm": 0.0038008016999810934, "kl": 0.651810310781002, "learning_rate": 9.9973611740498e-06, "loss": -0.0271, "step": 2424, "step_time": 2.0172563879968948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 174.0625, "completions/mean_terminated_length": 174.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.21177630173042417, "epoch": 0.02425, "frac_reward_zero_std": 0.5, "grad_norm": 0.0366976223886013, "kl": 0.5903746969997883, "learning_rate": 9.997358963774577e-06, "loss": 0.0094, "num_tokens": 20730819.0, "reward": 0.8125624656677246, "reward_std": 0.03852371871471405, "rewards/rollout_reward_func/mean": 0.8125624656677246, "rewards/rollout_reward_func/std": 0.35257863998413086, "sampling/importance_sampling_ratio/max": 0.9984253644943237, "sampling/importance_sampling_ratio/mean": 0.9383264780044556, "sampling/importance_sampling_ratio/min": 0.10489809513092041, "sampling/sampling_logp_difference/max": 2.3145713806152344, "sampling/sampling_logp_difference/mean": 0.02998965047299862, "step": 2425, "step_time": 4.6794286699805525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2124970848672092, "epoch": 0.02426, "grad_norm": 0.036713726818561554, "kl": 0.5876137092709541, "learning_rate": 9.997356752574406e-06, "loss": 0.0093, "step": 2426, "step_time": 2.4723692839907017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 119.71875, "completions/mean_terminated_length": 119.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5094779185019433, "epoch": 0.02427, "frac_reward_zero_std": 0.75, "grad_norm": 0.005115825217217207, "kl": 0.574441559612751, "learning_rate": 9.99735454044929e-06, "loss": -0.0273, "num_tokens": 20745506.0, "reward": 0.8871201872825623, "reward_std": 0.030228814110159874, "rewards/rollout_reward_func/mean": 0.8871201872825623, "rewards/rollout_reward_func/std": 0.13572093844413757, "sampling/importance_sampling_ratio/max": 0.9997656345367432, "sampling/importance_sampling_ratio/mean": 0.963526725769043, "sampling/importance_sampling_ratio/min": 1.8554700831341364e-10, "sampling/sampling_logp_difference/max": 4.514436721801758, "sampling/sampling_logp_difference/mean": 0.1635814905166626, "step": 2427, "step_time": 3.885078839994094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5100735463202, "epoch": 0.02428, "grad_norm": 0.005041079130023718, "kl": 0.5745163857936859, "learning_rate": 9.997352327399228e-06, "loss": -0.0273, "step": 2428, "step_time": 1.9938332219899166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 0.5306765143759549, "epoch": 0.02429, "frac_reward_zero_std": 0.75, "grad_norm": 0.01597055047750473, "kl": 0.4492543153464794, "learning_rate": 9.997350113424217e-06, "loss": -0.0177, "num_tokens": 20760738.0, "reward": 0.7560769319534302, "reward_std": 0.056024618446826935, "rewards/rollout_reward_func/mean": 0.7560769319534302, "rewards/rollout_reward_func/std": 0.29310500621795654, "sampling/importance_sampling_ratio/max": 1.0003094673156738, "sampling/importance_sampling_ratio/mean": 0.9331835508346558, "sampling/importance_sampling_ratio/min": 3.729151224048723e-16, "sampling/sampling_logp_difference/max": 4.093080997467041, "sampling/sampling_logp_difference/mean": 0.24190402030944824, "step": 2429, "step_time": 4.080207400002109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5328977350145578, "epoch": 0.0243, "grad_norm": 0.016314901411533356, "kl": 0.4486417807638645, "learning_rate": 9.997347898524262e-06, "loss": -0.0177, "step": 2430, "step_time": 1.9881429629967897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06274760095402598, "epoch": 0.02431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008265978540293872, "kl": 0.3716690428555012, "learning_rate": 9.997345682699363e-06, "loss": 0.0012, "num_tokens": 20776034.0, "reward": 0.4669230878353119, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.4669230878353119, "rewards/rollout_reward_func/std": 0.09745453298091888, "sampling/importance_sampling_ratio/max": 1.0086784362792969, "sampling/importance_sampling_ratio/mean": 0.9964567422866821, "sampling/importance_sampling_ratio/min": 0.9885968565940857, "sampling/sampling_logp_difference/max": 0.00996825098991394, "sampling/sampling_logp_difference/mean": 0.0013626435538753867, "step": 2431, "step_time": 4.376285393002036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0636133523657918, "epoch": 0.02432, "grad_norm": 0.0008353109005838633, "kl": 0.37153147906064987, "learning_rate": 9.99734346594952e-06, "loss": 0.0012, "step": 2432, "step_time": 2.41797887798748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 122.03125, "completions/mean_terminated_length": 122.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6706656711176038, "epoch": 0.02433, "frac_reward_zero_std": 0.5, "grad_norm": 0.008971923030912876, "kl": 0.6306871026754379, "learning_rate": 9.997341248274732e-06, "loss": -0.0459, "num_tokens": 20790739.0, "reward": 0.687947154045105, "reward_std": 0.23099274933338165, "rewards/rollout_reward_func/mean": 0.687947154045105, "rewards/rollout_reward_func/std": 0.3925003111362457, "sampling/importance_sampling_ratio/max": 0.9966866374015808, "sampling/importance_sampling_ratio/mean": 0.9288539886474609, "sampling/importance_sampling_ratio/min": 8.594844065421816e-20, "sampling/sampling_logp_difference/max": 2.7878377437591553, "sampling/sampling_logp_difference/mean": 0.2587166428565979, "step": 2433, "step_time": 4.154834072003723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.66988545935601, "epoch": 0.02434, "grad_norm": 0.008893780410289764, "kl": 0.6392672806978226, "learning_rate": 9.997339029675001e-06, "loss": -0.0459, "step": 2434, "step_time": 2.0656127170004766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.35776166897267103, "epoch": 0.02435, "frac_reward_zero_std": 0.5, "grad_norm": 0.9501222372055054, "kl": 0.5550642870366573, "learning_rate": 9.99733681015033e-06, "loss": -0.0164, "num_tokens": 20806055.0, "reward": 0.6597884297370911, "reward_std": 0.03649759292602539, "rewards/rollout_reward_func/mean": 0.6597884297370911, "rewards/rollout_reward_func/std": 0.1423436403274536, "sampling/importance_sampling_ratio/max": 0.9978036880493164, "sampling/importance_sampling_ratio/mean": 0.9564929604530334, "sampling/importance_sampling_ratio/min": 0.0008526698802597821, "sampling/sampling_logp_difference/max": 1.2835158109664917, "sampling/sampling_logp_difference/mean": 0.04712724685668945, "step": 2435, "step_time": 4.124483887004317 }, { "clip_ratio/high_max": 0.07916666846722364, "clip_ratio/high_mean": 0.03958333423361182, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044791667722165585, "entropy": 0.35630293376743793, "epoch": 0.02436, "grad_norm": 0.1530590057373047, "kl": 0.5872343815863132, "learning_rate": 9.997334589700714e-06, "loss": -0.0195, "step": 2436, "step_time": 2.073559526987083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 215.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1279990435577929, "epoch": 0.02437, "frac_reward_zero_std": 0.75, "grad_norm": 0.024993406608700752, "kl": 0.4528156891465187, "learning_rate": 9.997332368326157e-06, "loss": -0.0149, "num_tokens": 20823647.0, "reward": 1.080519199371338, "reward_std": 0.01767767407000065, "rewards/rollout_reward_func/mean": 1.080519199371338, "rewards/rollout_reward_func/std": 0.3983922302722931, "sampling/importance_sampling_ratio/max": 1.0001198053359985, "sampling/importance_sampling_ratio/mean": 0.968614399433136, "sampling/importance_sampling_ratio/min": 0.13614758849143982, "sampling/sampling_logp_difference/max": 1.9919233322143555, "sampling/sampling_logp_difference/mean": 0.011158163659274578, "step": 2437, "step_time": 4.855150313000195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1287108468823135, "epoch": 0.02438, "grad_norm": 0.025264425203204155, "kl": 0.4511535316705704, "learning_rate": 9.997330146026661e-06, "loss": -0.0148, "step": 2438, "step_time": 2.486361051007407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 143.46875, "completions/mean_terminated_length": 143.46875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.9857873381115496, "epoch": 0.02439, "frac_reward_zero_std": 0.25, "grad_norm": 0.02959585189819336, "kl": 0.5932977311313152, "learning_rate": 9.997327922802223e-06, "loss": -0.0228, "num_tokens": 20839030.0, "reward": 0.6673077344894409, "reward_std": 0.2409977912902832, "rewards/rollout_reward_func/mean": 0.6673077344894409, "rewards/rollout_reward_func/std": 0.402592271566391, "sampling/importance_sampling_ratio/max": 0.9984247088432312, "sampling/importance_sampling_ratio/mean": 0.8715899586677551, "sampling/importance_sampling_ratio/min": 8.504770791707874e-18, "sampling/sampling_logp_difference/max": 3.0771541595458984, "sampling/sampling_logp_difference/mean": 0.28434649109840393, "step": 2439, "step_time": 4.198645604992635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9886466981843114, "epoch": 0.0244, "grad_norm": 0.029581010341644287, "kl": 0.5915088579058647, "learning_rate": 9.997325698652846e-06, "loss": -0.0228, "step": 2440, "step_time": 2.368434135998541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 178.1875, "completions/mean_terminated_length": 178.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20180142298340797, "epoch": 0.02441, "frac_reward_zero_std": 0.75, "grad_norm": 0.02337106503546238, "kl": 0.46357355266809464, "learning_rate": 9.99732347357853e-06, "loss": 0.019, "num_tokens": 20855468.0, "reward": 0.6577404141426086, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.6577404141426086, "rewards/rollout_reward_func/std": 0.34912657737731934, "sampling/importance_sampling_ratio/max": 0.998389720916748, "sampling/importance_sampling_ratio/mean": 0.961931049823761, "sampling/importance_sampling_ratio/min": 0.04176252335309982, "sampling/sampling_logp_difference/max": 1.4657280445098877, "sampling/sampling_logp_difference/mean": 0.020885635167360306, "step": 2441, "step_time": 4.135444112995174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20302249770611525, "epoch": 0.02442, "grad_norm": 0.023902567103505135, "kl": 0.4637703374028206, "learning_rate": 9.997321247579274e-06, "loss": 0.019, "step": 2442, "step_time": 2.033025960001396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5815102700144053, "epoch": 0.02443, "frac_reward_zero_std": 0.25, "grad_norm": 0.03899175673723221, "kl": 0.7853060178458691, "learning_rate": 9.99731902065508e-06, "loss": -0.0043, "num_tokens": 20873234.0, "reward": 0.5562740564346313, "reward_std": 0.2199782133102417, "rewards/rollout_reward_func/mean": 0.5562740564346313, "rewards/rollout_reward_func/std": 0.36909547448158264, "sampling/importance_sampling_ratio/max": 0.998964250087738, "sampling/importance_sampling_ratio/mean": 0.9025384783744812, "sampling/importance_sampling_ratio/min": 1.5675450520946654e-11, "sampling/sampling_logp_difference/max": 4.083805084228516, "sampling/sampling_logp_difference/mean": 0.16285139322280884, "step": 2443, "step_time": 4.881930793999345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5803625541739166, "epoch": 0.02444, "grad_norm": 0.035378336906433105, "kl": 0.765863873064518, "learning_rate": 9.99731679280595e-06, "loss": -0.0044, "step": 2444, "step_time": 2.4556214960102807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3070858148857951, "epoch": 0.02445, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036751748993992805, "kl": 0.4669885225594044, "learning_rate": 9.997314564031882e-06, "loss": -0.0272, "num_tokens": 20889344.0, "reward": 0.9449471235275269, "reward_std": 0.008634859696030617, "rewards/rollout_reward_func/mean": 0.9449471235275269, "rewards/rollout_reward_func/std": 0.3484176695346832, "sampling/importance_sampling_ratio/max": 0.9985083341598511, "sampling/importance_sampling_ratio/mean": 0.9637705087661743, "sampling/importance_sampling_ratio/min": 1.702781446510926e-05, "sampling/sampling_logp_difference/max": 2.791327953338623, "sampling/sampling_logp_difference/mean": 0.04872101917862892, "step": 2445, "step_time": 4.273636019010155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31014982983469963, "epoch": 0.02446, "grad_norm": 0.004124943166971207, "kl": 0.46653370186686516, "learning_rate": 9.997312334332876e-06, "loss": -0.0272, "step": 2446, "step_time": 1.9884682910051197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 100.0, "completions/max_terminated_length": 100.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.306297842413187, "epoch": 0.02447, "frac_reward_zero_std": 0.75, "grad_norm": 0.006685493979603052, "kl": 0.6147950626909733, "learning_rate": 9.997310103708936e-06, "loss": -0.0176, "num_tokens": 20902960.0, "reward": 0.6400961875915527, "reward_std": 0.019309453666210175, "rewards/rollout_reward_func/mean": 0.6400961875915527, "rewards/rollout_reward_func/std": 0.16076070070266724, "sampling/importance_sampling_ratio/max": 0.9942417740821838, "sampling/importance_sampling_ratio/mean": 0.9614579677581787, "sampling/importance_sampling_ratio/min": 0.0022630440071225166, "sampling/sampling_logp_difference/max": 2.656489133834839, "sampling/sampling_logp_difference/mean": 0.04943425580859184, "step": 2447, "step_time": 3.4995283070093137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3074901048094034, "epoch": 0.02448, "grad_norm": 0.006771016400307417, "kl": 0.6141564771533012, "learning_rate": 9.99730787216006e-06, "loss": -0.0176, "step": 2448, "step_time": 1.9083625170096639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0021551724057644606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021551724057644606, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 176.5625, "completions/mean_terminated_length": 176.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 1.1313944598659873, "epoch": 0.02449, "frac_reward_zero_std": 0.5, "grad_norm": 0.030888631939888, "kl": 0.854240033775568, "learning_rate": 9.99730563968625e-06, "loss": -0.0651, "num_tokens": 20919434.0, "reward": 0.8759375214576721, "reward_std": 0.1591765135526657, "rewards/rollout_reward_func/mean": 0.8759375214576721, "rewards/rollout_reward_func/std": 0.42839697003364563, "sampling/importance_sampling_ratio/max": 0.9984861016273499, "sampling/importance_sampling_ratio/mean": 0.8234807252883911, "sampling/importance_sampling_ratio/min": 7.566144900868728e-26, "sampling/sampling_logp_difference/max": 12.616388320922852, "sampling/sampling_logp_difference/mean": 0.4985610246658325, "step": 2449, "step_time": 5.002503615010937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1323711778968573, "epoch": 0.0245, "grad_norm": 0.028996365144848824, "kl": 0.8388538733124733, "learning_rate": 9.997303406287505e-06, "loss": -0.0651, "step": 2450, "step_time": 2.507229439004732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07657961919903755, "epoch": 0.02451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007824577623978257, "kl": 0.43253279849886894, "learning_rate": 9.997301171963825e-06, "loss": 0.0016, "num_tokens": 20936522.0, "reward": 0.9980000257492065, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9980000257492065, "rewards/rollout_reward_func/std": 0.25967782735824585, "sampling/importance_sampling_ratio/max": 0.9987274408340454, "sampling/importance_sampling_ratio/mean": 0.9945522546768188, "sampling/importance_sampling_ratio/min": 0.9850890636444092, "sampling/sampling_logp_difference/max": 0.010490603744983673, "sampling/sampling_logp_difference/mean": 0.0012707264395430684, "step": 2451, "step_time": 4.1468824870025855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07895545568317175, "epoch": 0.02452, "grad_norm": 0.0008186694467440248, "kl": 0.43210919946432114, "learning_rate": 9.997298936715215e-06, "loss": 0.0016, "step": 2452, "step_time": 2.028339581011096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.0817081555724144, "epoch": 0.02453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006819178233854473, "kl": 0.5063713118433952, "learning_rate": 9.99729670054167e-06, "loss": 0.0015, "num_tokens": 20951922.0, "reward": 0.5149999856948853, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5149999856948853, "rewards/rollout_reward_func/std": 0.20764946937561035, "sampling/importance_sampling_ratio/max": 0.9941827654838562, "sampling/importance_sampling_ratio/mean": 0.9901301860809326, "sampling/importance_sampling_ratio/min": 0.9837407469749451, "sampling/sampling_logp_difference/max": 0.01056019589304924, "sampling/sampling_logp_difference/mean": 0.002006848342716694, "step": 2453, "step_time": 3.808358747017337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08407439850270748, "epoch": 0.02454, "grad_norm": 0.0007188204326666892, "kl": 0.5058853328227997, "learning_rate": 9.997294463443193e-06, "loss": 0.0015, "step": 2454, "step_time": 1.9734311460051686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.10155789088457823, "epoch": 0.02455, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007974699838086963, "kl": 0.4891158267855644, "learning_rate": 9.997292225419784e-06, "loss": 0.0012, "num_tokens": 20966082.0, "reward": 0.5680769681930542, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5680769681930542, "rewards/rollout_reward_func/std": 0.15439453721046448, "sampling/importance_sampling_ratio/max": 1.003428339958191, "sampling/importance_sampling_ratio/mean": 0.9965054988861084, "sampling/importance_sampling_ratio/min": 0.9878751039505005, "sampling/sampling_logp_difference/max": 0.0064552053809165955, "sampling/sampling_logp_difference/mean": 0.0013857752783223987, "step": 2455, "step_time": 3.9747413269942626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10491779912263155, "epoch": 0.02456, "grad_norm": 0.0008322405628859997, "kl": 0.48845306411385536, "learning_rate": 9.997289986471445e-06, "loss": 0.0012, "step": 2456, "step_time": 2.3873761680006282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 120.65625, "completions/mean_terminated_length": 120.65625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2743208799511194, "epoch": 0.02457, "frac_reward_zero_std": 0.75, "grad_norm": 0.008123790845274925, "kl": 0.7187372259795666, "learning_rate": 9.997287746598175e-06, "loss": -0.0225, "num_tokens": 20980711.0, "reward": 0.8407836556434631, "reward_std": 0.19263823330402374, "rewards/rollout_reward_func/mean": 0.8407836556434631, "rewards/rollout_reward_func/std": 0.3998786509037018, "sampling/importance_sampling_ratio/max": 1.0002126693725586, "sampling/importance_sampling_ratio/mean": 0.9352577328681946, "sampling/importance_sampling_ratio/min": 0.07538919150829315, "sampling/sampling_logp_difference/max": 2.6407477855682373, "sampling/sampling_logp_difference/mean": 0.034560032188892365, "step": 2457, "step_time": 3.944418589999259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.274000845849514, "epoch": 0.02458, "grad_norm": 0.009001161903142929, "kl": 0.6811534352600574, "learning_rate": 9.997285505799974e-06, "loss": -0.0225, "step": 2458, "step_time": 2.0102963330064085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.08211343735456467, "epoch": 0.02459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008903710404410958, "kl": 0.3703249469399452, "learning_rate": 9.997283264076846e-06, "loss": 0.0013, "num_tokens": 20997175.0, "reward": 0.8671153783798218, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8671153783798218, "rewards/rollout_reward_func/std": 0.3239767253398895, "sampling/importance_sampling_ratio/max": 0.9986078143119812, "sampling/importance_sampling_ratio/mean": 0.9932708740234375, "sampling/importance_sampling_ratio/min": 0.9895784258842468, "sampling/sampling_logp_difference/max": 0.006561689078807831, "sampling/sampling_logp_difference/mean": 0.0012655544560402632, "step": 2459, "step_time": 4.008432838993031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08249971736222506, "epoch": 0.0246, "grad_norm": 0.0009033780661411583, "kl": 0.37027160823345184, "learning_rate": 9.997281021428788e-06, "loss": 0.0013, "step": 2460, "step_time": 1.9933400479931151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2149880714714527, "epoch": 0.02461, "frac_reward_zero_std": 0.75, "grad_norm": 0.014716864563524723, "kl": 0.4254121035337448, "learning_rate": 9.997278777855802e-06, "loss": -0.0173, "num_tokens": 21012695.0, "reward": 0.6691057682037354, "reward_std": 0.0088388342410326, "rewards/rollout_reward_func/mean": 0.6691057682037354, "rewards/rollout_reward_func/std": 0.4044671058654785, "sampling/importance_sampling_ratio/max": 1.000332236289978, "sampling/importance_sampling_ratio/mean": 0.9616370797157288, "sampling/importance_sampling_ratio/min": 0.014878940768539906, "sampling/sampling_logp_difference/max": 2.2462918758392334, "sampling/sampling_logp_difference/mean": 0.027381962165236473, "step": 2461, "step_time": 4.2768202899969765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2145060859620571, "epoch": 0.02462, "grad_norm": 0.014289738610386848, "kl": 0.42541578784585, "learning_rate": 9.99727653335789e-06, "loss": -0.0173, "step": 2462, "step_time": 2.428282833992853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 146.28125, "completions/mean_terminated_length": 146.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.264673019759357, "epoch": 0.02463, "frac_reward_zero_std": 0.75, "grad_norm": 0.0249776691198349, "kl": 0.532606802880764, "learning_rate": 9.997274287935048e-06, "loss": 0.0192, "num_tokens": 21028200.0, "reward": 0.8600673675537109, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 0.8600673675537109, "rewards/rollout_reward_func/std": 0.19401586055755615, "sampling/importance_sampling_ratio/max": 1.0044457912445068, "sampling/importance_sampling_ratio/mean": 0.9628583192825317, "sampling/importance_sampling_ratio/min": 0.06308586895465851, "sampling/sampling_logp_difference/max": 1.452638030052185, "sampling/sampling_logp_difference/mean": 0.01700841635465622, "step": 2463, "step_time": 4.098889446002431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2617326155304909, "epoch": 0.02464, "grad_norm": 0.025125836953520775, "kl": 0.5328678414225578, "learning_rate": 9.99727204158728e-06, "loss": 0.0192, "step": 2464, "step_time": 2.0071479989783256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 94.1875, "completions/mean_terminated_length": 94.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1918268082663417, "epoch": 0.02465, "frac_reward_zero_std": 0.75, "grad_norm": 0.014712600037455559, "kl": 0.725609615445137, "learning_rate": 9.997269794314587e-06, "loss": -0.0249, "num_tokens": 21042070.0, "reward": 0.552884578704834, "reward_std": 0.1321745663881302, "rewards/rollout_reward_func/mean": 0.552884578704834, "rewards/rollout_reward_func/std": 0.3536829352378845, "sampling/importance_sampling_ratio/max": 0.9964118599891663, "sampling/importance_sampling_ratio/mean": 0.961661696434021, "sampling/importance_sampling_ratio/min": 0.10864908993244171, "sampling/sampling_logp_difference/max": 2.3985180854797363, "sampling/sampling_logp_difference/mean": 0.019741196185350418, "step": 2465, "step_time": 3.7898643690132303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.193153265863657, "epoch": 0.02466, "grad_norm": 0.016261428594589233, "kl": 0.7180748507380486, "learning_rate": 9.997267546116967e-06, "loss": -0.0249, "step": 2466, "step_time": 1.980669022006623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3118903087452054, "epoch": 0.02467, "frac_reward_zero_std": 0.75, "grad_norm": 0.011477242223918438, "kl": 0.4401169903576374, "learning_rate": 9.997265296994425e-06, "loss": -0.0178, "num_tokens": 21057582.0, "reward": 0.7010236978530884, "reward_std": 0.0308280847966671, "rewards/rollout_reward_func/mean": 0.7010236978530884, "rewards/rollout_reward_func/std": 0.25308758020401, "sampling/importance_sampling_ratio/max": 0.9979459047317505, "sampling/importance_sampling_ratio/mean": 0.959221363067627, "sampling/importance_sampling_ratio/min": 0.0003331693005748093, "sampling/sampling_logp_difference/max": 1.9257378578186035, "sampling/sampling_logp_difference/mean": 0.04845108464360237, "step": 2467, "step_time": 4.926441734998662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31150245759636164, "epoch": 0.02468, "grad_norm": 0.011867637746036053, "kl": 0.44041142240166664, "learning_rate": 9.997263046946956e-06, "loss": -0.0178, "step": 2468, "step_time": 2.018054341999232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.516001065261662, "epoch": 0.02469, "frac_reward_zero_std": 0.25, "grad_norm": 0.12261640280485153, "kl": 0.6258652880787849, "learning_rate": 9.997260795974564e-06, "loss": 0.0084, "num_tokens": 21072216.0, "reward": 0.7921105623245239, "reward_std": 0.03134387731552124, "rewards/rollout_reward_func/mean": 0.7921105623245239, "rewards/rollout_reward_func/std": 0.3849877119064331, "sampling/importance_sampling_ratio/max": 1.0007328987121582, "sampling/importance_sampling_ratio/mean": 0.910675048828125, "sampling/importance_sampling_ratio/min": 0.04896596819162369, "sampling/sampling_logp_difference/max": 1.4706510305404663, "sampling/sampling_logp_difference/mean": 0.05294432491064072, "step": 2469, "step_time": 4.209079580992693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5147173870354891, "epoch": 0.0247, "grad_norm": 0.11048159748315811, "kl": 0.6193723380565643, "learning_rate": 9.997258544077248e-06, "loss": 0.0083, "step": 2470, "step_time": 2.0372086010102066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3221675381064415, "epoch": 0.02471, "frac_reward_zero_std": 0.75, "grad_norm": 0.013824621215462685, "kl": 0.46749401092529297, "learning_rate": 9.997256291255011e-06, "loss": 0.0318, "num_tokens": 21090060.0, "reward": 0.9787149429321289, "reward_std": 0.0033274756278842688, "rewards/rollout_reward_func/mean": 0.9787149429321289, "rewards/rollout_reward_func/std": 0.24482892453670502, "sampling/importance_sampling_ratio/max": 1.0123215913772583, "sampling/importance_sampling_ratio/mean": 0.9169784188270569, "sampling/importance_sampling_ratio/min": 0.0018265035469084978, "sampling/sampling_logp_difference/max": 1.7373425960540771, "sampling/sampling_logp_difference/mean": 0.04981914535164833, "step": 2471, "step_time": 4.420776002996718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32211134023964405, "epoch": 0.02472, "grad_norm": 0.013862624764442444, "kl": 0.46481984853744507, "learning_rate": 9.997254037507851e-06, "loss": 0.0318, "step": 2472, "step_time": 2.056666459000553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5659948624670506, "epoch": 0.02473, "frac_reward_zero_std": 0.75, "grad_norm": 0.019714243710041046, "kl": 0.4363499544560909, "learning_rate": 9.997251782835769e-06, "loss": -0.0486, "num_tokens": 21105762.0, "reward": 0.9718207120895386, "reward_std": 0.01255687978118658, "rewards/rollout_reward_func/mean": 0.9718207120895386, "rewards/rollout_reward_func/std": 0.2570189833641052, "sampling/importance_sampling_ratio/max": 0.9997512698173523, "sampling/importance_sampling_ratio/mean": 0.9300500154495239, "sampling/importance_sampling_ratio/min": 2.0691886627067035e-17, "sampling/sampling_logp_difference/max": 3.7428789138793945, "sampling/sampling_logp_difference/mean": 0.20171312987804413, "step": 2473, "step_time": 5.337010919000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5613014418631792, "epoch": 0.02474, "grad_norm": 0.015873050317168236, "kl": 0.43629560247063637, "learning_rate": 9.997249527238767e-06, "loss": -0.0487, "step": 2474, "step_time": 2.05429613400338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 123.15625, "completions/mean_terminated_length": 123.15625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9852268667891622, "epoch": 0.02475, "frac_reward_zero_std": 0.25, "grad_norm": 0.04467035457491875, "kl": 0.5178476944565773, "learning_rate": 9.997247270716843e-06, "loss": -0.0587, "num_tokens": 21120471.0, "reward": 0.8360576629638672, "reward_std": 0.07172583043575287, "rewards/rollout_reward_func/mean": 0.8360576629638672, "rewards/rollout_reward_func/std": 0.17090125381946564, "sampling/importance_sampling_ratio/max": 0.9982326626777649, "sampling/importance_sampling_ratio/mean": 0.842372477054596, "sampling/importance_sampling_ratio/min": 0.0016639174427837133, "sampling/sampling_logp_difference/max": 2.6165099143981934, "sampling/sampling_logp_difference/mean": 0.1439850777387619, "step": 2475, "step_time": 4.254949032983859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9937011273577809, "epoch": 0.02476, "grad_norm": 0.044063739478588104, "kl": 0.5201312378048897, "learning_rate": 9.99724501327e-06, "loss": -0.0587, "step": 2476, "step_time": 2.044954104007047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1546945981681347, "epoch": 0.02477, "frac_reward_zero_std": 0.75, "grad_norm": 0.014269583858549595, "kl": 0.5982608087360859, "learning_rate": 9.997242754898239e-06, "loss": -0.0234, "num_tokens": 21136047.0, "reward": 0.743288516998291, "reward_std": 0.15637937188148499, "rewards/rollout_reward_func/mean": 0.743288516998291, "rewards/rollout_reward_func/std": 0.402129590511322, "sampling/importance_sampling_ratio/max": 1.0050029754638672, "sampling/importance_sampling_ratio/mean": 0.9655858278274536, "sampling/importance_sampling_ratio/min": 0.11887352168560028, "sampling/sampling_logp_difference/max": 2.2419066429138184, "sampling/sampling_logp_difference/mean": 0.016418209299445152, "step": 2477, "step_time": 4.133140442012518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15176233556121588, "epoch": 0.02478, "grad_norm": 0.01431995164602995, "kl": 0.5933037139475346, "learning_rate": 9.997240495601558e-06, "loss": -0.0234, "step": 2478, "step_time": 2.4757797889833455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 153.4375, "completions/mean_terminated_length": 153.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.7301677735522389, "epoch": 0.02479, "frac_reward_zero_std": 0.25, "grad_norm": 0.04183944687247276, "kl": 0.49093612283468246, "learning_rate": 9.997238235379958e-06, "loss": -0.0738, "num_tokens": 21151693.0, "reward": 0.7240947484970093, "reward_std": 0.048576872795820236, "rewards/rollout_reward_func/mean": 0.7240947484970093, "rewards/rollout_reward_func/std": 0.3689168393611908, "sampling/importance_sampling_ratio/max": 0.9973059892654419, "sampling/importance_sampling_ratio/mean": 0.8967268466949463, "sampling/importance_sampling_ratio/min": 1.1523936336743645e-05, "sampling/sampling_logp_difference/max": 1.9649040699005127, "sampling/sampling_logp_difference/mean": 0.12830033898353577, "step": 2479, "step_time": 4.915894105004554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7288402477279305, "epoch": 0.0248, "grad_norm": 0.03891237452626228, "kl": 0.4950961619615555, "learning_rate": 9.99723597423344e-06, "loss": -0.0739, "step": 2480, "step_time": 2.0420140490023186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 121.65625, "completions/mean_terminated_length": 121.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5805520731955767, "epoch": 0.02481, "frac_reward_zero_std": 0.75, "grad_norm": 0.010876993648707867, "kl": 0.4731554314494133, "learning_rate": 9.997233712162007e-06, "loss": -0.0175, "num_tokens": 21166386.0, "reward": 0.6769711971282959, "reward_std": 0.022709006443619728, "rewards/rollout_reward_func/mean": 0.6769711971282959, "rewards/rollout_reward_func/std": 0.20371189713478088, "sampling/importance_sampling_ratio/max": 1.0003795623779297, "sampling/importance_sampling_ratio/mean": 0.930212676525116, "sampling/importance_sampling_ratio/min": 3.6106873722019373e-06, "sampling/sampling_logp_difference/max": 3.3604788780212402, "sampling/sampling_logp_difference/mean": 0.12134911864995956, "step": 2481, "step_time": 3.918521887018869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5763626070693135, "epoch": 0.02482, "grad_norm": 0.0097423130646348, "kl": 0.4748273529112339, "learning_rate": 9.997231449165657e-06, "loss": -0.0176, "step": 2482, "step_time": 1.9887769580163877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0743891941383481, "epoch": 0.02483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006735767237842083, "kl": 0.4031328074634075, "learning_rate": 9.99722918524439e-06, "loss": 0.0014, "num_tokens": 21182538.0, "reward": 0.8453077077865601, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8453077077865601, "rewards/rollout_reward_func/std": 0.44581758975982666, "sampling/importance_sampling_ratio/max": 0.9973159432411194, "sampling/importance_sampling_ratio/mean": 0.9924320578575134, "sampling/importance_sampling_ratio/min": 0.9826722741127014, "sampling/sampling_logp_difference/max": 0.012291409075260162, "sampling/sampling_logp_difference/mean": 0.0014386891853064299, "step": 2483, "step_time": 4.138678342998901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07122328784316778, "epoch": 0.02484, "grad_norm": 0.0006318808300420642, "kl": 0.40373867750167847, "learning_rate": 9.997226920398209e-06, "loss": 0.0014, "step": 2484, "step_time": 2.8795854400086682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.07379993796348572, "epoch": 0.02485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007005734951235354, "kl": 0.4364323802292347, "learning_rate": 9.997224654627113e-06, "loss": 0.0015, "num_tokens": 21198650.0, "reward": 0.747692346572876, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.747692346572876, "rewards/rollout_reward_func/std": 0.21604155004024506, "sampling/importance_sampling_ratio/max": 1.0001736879348755, "sampling/importance_sampling_ratio/mean": 0.9942889213562012, "sampling/importance_sampling_ratio/min": 0.9881479740142822, "sampling/sampling_logp_difference/max": 0.007084541022777557, "sampling/sampling_logp_difference/mean": 0.0011619338765740395, "step": 2485, "step_time": 3.9277016810010537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07076935842633247, "epoch": 0.02486, "grad_norm": 0.0006628013798035681, "kl": 0.43698402494192123, "learning_rate": 9.9972223879311e-06, "loss": 0.0015, "step": 2486, "step_time": 1.9659444589924533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0691477358341217, "epoch": 0.02487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006419956334866583, "kl": 0.46670181676745415, "learning_rate": 9.997220120310176e-06, "loss": 0.0016, "num_tokens": 21214882.0, "reward": 0.7958846092224121, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7958846092224121, "rewards/rollout_reward_func/std": 0.31414148211479187, "sampling/importance_sampling_ratio/max": 0.9987707734107971, "sampling/importance_sampling_ratio/mean": 0.9940887093544006, "sampling/importance_sampling_ratio/min": 0.9894695281982422, "sampling/sampling_logp_difference/max": 0.005821503698825836, "sampling/sampling_logp_difference/mean": 0.001189244445413351, "step": 2487, "step_time": 4.04952620799304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06640257313847542, "epoch": 0.02488, "grad_norm": 0.0006018989952281117, "kl": 0.46719273179769516, "learning_rate": 9.997217851764337e-06, "loss": 0.0016, "step": 2488, "step_time": 2.0177895549932146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 103.59375, "completions/mean_terminated_length": 103.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3684405963867903, "epoch": 0.02489, "frac_reward_zero_std": 0.75, "grad_norm": 0.0059876940213143826, "kl": 0.7654236331582069, "learning_rate": 9.997215582293586e-06, "loss": -0.0172, "num_tokens": 21228997.0, "reward": 0.7058172821998596, "reward_std": 0.029236145317554474, "rewards/rollout_reward_func/mean": 0.7058172821998596, "rewards/rollout_reward_func/std": 0.06797563284635544, "sampling/importance_sampling_ratio/max": 0.9973764419555664, "sampling/importance_sampling_ratio/mean": 0.9613600373268127, "sampling/importance_sampling_ratio/min": 0.0012145863147452474, "sampling/sampling_logp_difference/max": 1.8617326021194458, "sampling/sampling_logp_difference/mean": 0.06898527592420578, "step": 2489, "step_time": 4.023090087008313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.365120742470026, "epoch": 0.0249, "grad_norm": 0.005859920755028725, "kl": 0.7655608206987381, "learning_rate": 9.997213311897925e-06, "loss": -0.0172, "step": 2490, "step_time": 3.0280621380079538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18272804096341133, "epoch": 0.02491, "frac_reward_zero_std": 0.75, "grad_norm": 0.019009504467248917, "kl": 0.5931885540485382, "learning_rate": 9.99721104057735e-06, "loss": -0.0161, "num_tokens": 21243481.0, "reward": 0.673480749130249, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.673480749130249, "rewards/rollout_reward_func/std": 0.19782865047454834, "sampling/importance_sampling_ratio/max": 0.9999419450759888, "sampling/importance_sampling_ratio/mean": 0.9652639627456665, "sampling/importance_sampling_ratio/min": 0.0732957273721695, "sampling/sampling_logp_difference/max": 1.487930417060852, "sampling/sampling_logp_difference/mean": 0.015739772468805313, "step": 2491, "step_time": 4.159284204979485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17895694728940725, "epoch": 0.02492, "grad_norm": 0.017723137512803078, "kl": 0.5934320464730263, "learning_rate": 9.997208768331865e-06, "loss": -0.0161, "step": 2492, "step_time": 2.0361209560069256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6081118360161781, "epoch": 0.02493, "frac_reward_zero_std": 0.75, "grad_norm": 0.00731603242456913, "kl": 0.6300422847270966, "learning_rate": 9.99720649516147e-06, "loss": -0.0376, "num_tokens": 21258018.0, "reward": 0.6756297945976257, "reward_std": 0.16438141465187073, "rewards/rollout_reward_func/mean": 0.6756297945976257, "rewards/rollout_reward_func/std": 0.4228082001209259, "sampling/importance_sampling_ratio/max": 0.9989030957221985, "sampling/importance_sampling_ratio/mean": 0.9320294857025146, "sampling/importance_sampling_ratio/min": 7.816790770220905e-08, "sampling/sampling_logp_difference/max": 4.113171100616455, "sampling/sampling_logp_difference/mean": 0.13670647144317627, "step": 2493, "step_time": 4.0664879320174805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6044643735513091, "epoch": 0.02494, "grad_norm": 0.00700240908190608, "kl": 0.6264486759901047, "learning_rate": 9.997204221066162e-06, "loss": -0.0376, "step": 2494, "step_time": 2.0714841979861376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06432516360655427, "epoch": 0.02495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004750111256726086, "kl": 0.5137108154594898, "learning_rate": 9.99720194604595e-06, "loss": 0.0015, "num_tokens": 21273530.0, "reward": 0.7119231224060059, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7119231224060059, "rewards/rollout_reward_func/std": 0.2670266330242157, "sampling/importance_sampling_ratio/max": 0.9991094470024109, "sampling/importance_sampling_ratio/mean": 0.9943464398384094, "sampling/importance_sampling_ratio/min": 0.9865577816963196, "sampling/sampling_logp_difference/max": 0.01144346222281456, "sampling/sampling_logp_difference/mean": 0.0013468294637277722, "step": 2495, "step_time": 4.593267289012147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06316233705729246, "epoch": 0.02496, "grad_norm": 0.00046920820022933185, "kl": 0.5139083936810493, "learning_rate": 9.997199670100824e-06, "loss": 0.0015, "step": 2496, "step_time": 2.481138507988362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0448439521715045, "epoch": 0.02497, "frac_reward_zero_std": 1.0, "grad_norm": 0.00033539882861077785, "kl": 0.3972753770649433, "learning_rate": 9.997197393230794e-06, "loss": 0.0011, "num_tokens": 21288018.0, "reward": 0.7719230651855469, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7719230651855469, "rewards/rollout_reward_func/std": 0.3028469681739807, "sampling/importance_sampling_ratio/max": 1.0003783702850342, "sampling/importance_sampling_ratio/mean": 0.9970084428787231, "sampling/importance_sampling_ratio/min": 0.9929184913635254, "sampling/sampling_logp_difference/max": 0.004956159740686417, "sampling/sampling_logp_difference/mean": 0.0007944657700136304, "step": 2497, "step_time": 3.775826346987742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04417917178943753, "epoch": 0.02498, "grad_norm": 0.0003306334256194532, "kl": 0.3973737582564354, "learning_rate": 9.997195115435853e-06, "loss": 0.0011, "step": 2498, "step_time": 1.9736735400074394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 213.28125, "completions/mean_terminated_length": 213.28125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.49154716171324253, "epoch": 0.02499, "frac_reward_zero_std": 0.5, "grad_norm": 0.01369493082165718, "kl": 0.5090655498206615, "learning_rate": 9.997192836716008e-06, "loss": 0.002, "num_tokens": 21305579.0, "reward": 0.6433846354484558, "reward_std": 0.21349185705184937, "rewards/rollout_reward_func/mean": 0.6433846354484558, "rewards/rollout_reward_func/std": 0.456233948469162, "sampling/importance_sampling_ratio/max": 1.000471591949463, "sampling/importance_sampling_ratio/mean": 0.9343252182006836, "sampling/importance_sampling_ratio/min": 2.631005372677464e-06, "sampling/sampling_logp_difference/max": 3.125680446624756, "sampling/sampling_logp_difference/mean": 0.10197471082210541, "step": 2499, "step_time": 4.670216993996291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49139697663486004, "epoch": 0.025, "grad_norm": 0.013044262304902077, "kl": 0.5027960166335106, "learning_rate": 9.997190557071254e-06, "loss": 0.002, "step": 2500, "step_time": 2.050289509978029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06481495732441545, "epoch": 0.02501, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041967781726270914, "kl": 0.5343891270458698, "learning_rate": 9.997188276501595e-06, "loss": 0.0013, "num_tokens": 21319555.0, "reward": 0.7976922988891602, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7976922988891602, "rewards/rollout_reward_func/std": 0.2563508450984955, "sampling/importance_sampling_ratio/max": 1.0004767179489136, "sampling/importance_sampling_ratio/mean": 0.9963643550872803, "sampling/importance_sampling_ratio/min": 0.9900253415107727, "sampling/sampling_logp_difference/max": 0.007938478142023087, "sampling/sampling_logp_difference/mean": 0.0011094973888248205, "step": 2501, "step_time": 4.36412316399219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06424412969499826, "epoch": 0.02502, "grad_norm": 0.0004137418291065842, "kl": 0.5345155261456966, "learning_rate": 9.997185995007032e-06, "loss": 0.0013, "step": 2502, "step_time": 2.4197414720183588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 125.90625, "completions/mean_terminated_length": 125.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4705393295735121, "epoch": 0.02503, "frac_reward_zero_std": 0.75, "grad_norm": 0.020960358902812004, "kl": 0.6459123678505421, "learning_rate": 9.997183712587563e-06, "loss": 0.0265, "num_tokens": 21334208.0, "reward": 0.7808172702789307, "reward_std": 0.02143808640539646, "rewards/rollout_reward_func/mean": 0.7808172702789307, "rewards/rollout_reward_func/std": 0.2700802981853485, "sampling/importance_sampling_ratio/max": 1.001198649406433, "sampling/importance_sampling_ratio/mean": 0.9349400997161865, "sampling/importance_sampling_ratio/min": 0.00033864370197989047, "sampling/sampling_logp_difference/max": 2.7416810989379883, "sampling/sampling_logp_difference/mean": 0.091429702937603, "step": 2503, "step_time": 4.069822525001655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4693513778038323, "epoch": 0.02504, "grad_norm": 0.02164403349161148, "kl": 0.6526788286864758, "learning_rate": 9.99718142924319e-06, "loss": 0.0265, "step": 2504, "step_time": 1.9864796700057923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05203015683218837, "epoch": 0.02505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003602733777370304, "kl": 0.5125105753540993, "learning_rate": 9.997179144973915e-06, "loss": 0.0014, "num_tokens": 21348824.0, "reward": 0.7507691979408264, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7507691979408264, "rewards/rollout_reward_func/std": 0.16391469538211823, "sampling/importance_sampling_ratio/max": 0.9998128414154053, "sampling/importance_sampling_ratio/mean": 0.9962984323501587, "sampling/importance_sampling_ratio/min": 0.9923188090324402, "sampling/sampling_logp_difference/max": 0.005629098042845726, "sampling/sampling_logp_difference/mean": 0.0009660947835072875, "step": 2505, "step_time": 3.8697511719947215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05278846947476268, "epoch": 0.02506, "grad_norm": 0.0003649899153970182, "kl": 0.512372363358736, "learning_rate": 9.997176859779733e-06, "loss": 0.0014, "step": 2506, "step_time": 1.99319640500471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05002794647589326, "epoch": 0.02507, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038957069045864046, "kl": 0.4533931314945221, "learning_rate": 9.997174573660652e-06, "loss": 0.0014, "num_tokens": 21364304.0, "reward": 0.889423131942749, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.889423131942749, "rewards/rollout_reward_func/std": 0.2563939392566681, "sampling/importance_sampling_ratio/max": 1.0004302263259888, "sampling/importance_sampling_ratio/mean": 0.9971837997436523, "sampling/importance_sampling_ratio/min": 0.9942029118537903, "sampling/sampling_logp_difference/max": 0.004028750583529472, "sampling/sampling_logp_difference/mean": 0.0006717884098179638, "step": 2507, "step_time": 4.879744843005028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05032426444813609, "epoch": 0.02508, "grad_norm": 0.0003923935873899609, "kl": 0.4533507898449898, "learning_rate": 9.997172286616667e-06, "loss": 0.0014, "step": 2508, "step_time": 2.0112258520020987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.042594477999955416, "epoch": 0.02509, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043501032632775605, "kl": 0.3843422420322895, "learning_rate": 9.997169998647782e-06, "loss": 0.0016, "num_tokens": 21382272.0, "reward": 1.0547692775726318, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0547692775726318, "rewards/rollout_reward_func/std": 0.30412599444389343, "sampling/importance_sampling_ratio/max": 0.9972980618476868, "sampling/importance_sampling_ratio/mean": 0.9954907298088074, "sampling/importance_sampling_ratio/min": 0.9930092096328735, "sampling/sampling_logp_difference/max": 0.003985647112131119, "sampling/sampling_logp_difference/mean": 0.0007420509355142713, "step": 2509, "step_time": 4.397872087007272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04279919434338808, "epoch": 0.0251, "grad_norm": 0.0004377600271254778, "kl": 0.38429830595850945, "learning_rate": 9.997167709753995e-06, "loss": 0.0016, "step": 2510, "step_time": 2.0360166810060036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.29446645360440016, "epoch": 0.02511, "frac_reward_zero_std": 0.75, "grad_norm": 0.009344142861664295, "kl": 0.5315300561487675, "learning_rate": 9.997165419935309e-06, "loss": -0.027, "num_tokens": 21398326.0, "reward": 0.5523269176483154, "reward_std": 0.04487408697605133, "rewards/rollout_reward_func/mean": 0.5523269176483154, "rewards/rollout_reward_func/std": 0.19638246297836304, "sampling/importance_sampling_ratio/max": 0.9993624091148376, "sampling/importance_sampling_ratio/mean": 0.9644830226898193, "sampling/importance_sampling_ratio/min": 0.00025012006517499685, "sampling/sampling_logp_difference/max": 2.1041603088378906, "sampling/sampling_logp_difference/mean": 0.05145065486431122, "step": 2511, "step_time": 4.05761020200589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2938828645274043, "epoch": 0.02512, "grad_norm": 0.008763154037296772, "kl": 0.5262594595551491, "learning_rate": 9.997163129191722e-06, "loss": -0.027, "step": 2512, "step_time": 2.000676007992297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5830827606841922, "epoch": 0.02513, "frac_reward_zero_std": 0.5, "grad_norm": 0.005867458879947662, "kl": 0.6370238773524761, "learning_rate": 9.997160837523238e-06, "loss": -0.0266, "num_tokens": 21413971.0, "reward": 0.9028557538986206, "reward_std": 0.04011470824480057, "rewards/rollout_reward_func/mean": 0.9028557538986206, "rewards/rollout_reward_func/std": 0.3548628091812134, "sampling/importance_sampling_ratio/max": 0.9980925917625427, "sampling/importance_sampling_ratio/mean": 0.9306924343109131, "sampling/importance_sampling_ratio/min": 0.0028971147257834673, "sampling/sampling_logp_difference/max": 2.1563429832458496, "sampling/sampling_logp_difference/mean": 0.10618490725755692, "step": 2513, "step_time": 5.3574465999918175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5831713294610381, "epoch": 0.02514, "grad_norm": 0.00583846727386117, "kl": 0.6393425464630127, "learning_rate": 9.997158544929854e-06, "loss": -0.0266, "step": 2514, "step_time": 2.032873654992727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 132.53125, "completions/mean_terminated_length": 132.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6853899750858545, "epoch": 0.02515, "frac_reward_zero_std": 0.5, "grad_norm": 0.008659610524773598, "kl": 0.5783506855368614, "learning_rate": 9.997156251411572e-06, "loss": -0.0483, "num_tokens": 21429068.0, "reward": 0.8982846140861511, "reward_std": 0.049324680119752884, "rewards/rollout_reward_func/mean": 0.8982846140861511, "rewards/rollout_reward_func/std": 0.3596617877483368, "sampling/importance_sampling_ratio/max": 0.9990370273590088, "sampling/importance_sampling_ratio/mean": 0.9015099406242371, "sampling/importance_sampling_ratio/min": 0.001259203883819282, "sampling/sampling_logp_difference/max": 2.3415093421936035, "sampling/sampling_logp_difference/mean": 0.13453787565231323, "step": 2515, "step_time": 4.282243579997157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6838767831213772, "epoch": 0.02516, "grad_norm": 0.007639233488589525, "kl": 0.586428239941597, "learning_rate": 9.997153956968392e-06, "loss": -0.0483, "step": 2516, "step_time": 2.058333721004601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3982966407202184, "epoch": 0.02517, "frac_reward_zero_std": 0.75, "grad_norm": 0.006234790198504925, "kl": 0.6477964818477631, "learning_rate": 9.997151661600316e-06, "loss": -0.0077, "num_tokens": 21443612.0, "reward": 0.6118749380111694, "reward_std": 0.03603524714708328, "rewards/rollout_reward_func/mean": 0.6118749380111694, "rewards/rollout_reward_func/std": 0.1447252631187439, "sampling/importance_sampling_ratio/max": 0.9989467263221741, "sampling/importance_sampling_ratio/mean": 0.9630381464958191, "sampling/importance_sampling_ratio/min": 1.9842443066409032e-07, "sampling/sampling_logp_difference/max": 3.264101028442383, "sampling/sampling_logp_difference/mean": 0.11176081746816635, "step": 2517, "step_time": 3.9301374590140767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39648021617904305, "epoch": 0.02518, "grad_norm": 0.006419313605874777, "kl": 0.649233989417553, "learning_rate": 9.997149365307343e-06, "loss": -0.0077, "step": 2518, "step_time": 2.501611733008758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.17546099703758955, "epoch": 0.02519, "frac_reward_zero_std": 0.75, "grad_norm": 0.008398666977882385, "kl": 0.5747948437929153, "learning_rate": 9.997147068089475e-06, "loss": -0.0258, "num_tokens": 21459976.0, "reward": 0.7929999828338623, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.7929999828338623, "rewards/rollout_reward_func/std": 0.2816106677055359, "sampling/importance_sampling_ratio/max": 0.999792754650116, "sampling/importance_sampling_ratio/mean": 0.9647135138511658, "sampling/importance_sampling_ratio/min": 0.03231518343091011, "sampling/sampling_logp_difference/max": 1.55148184299469, "sampling/sampling_logp_difference/mean": 0.018830908462405205, "step": 2519, "step_time": 4.585545371002809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1761709339916706, "epoch": 0.0252, "grad_norm": 0.008604465052485466, "kl": 0.5749511830508709, "learning_rate": 9.997144769946711e-06, "loss": -0.0258, "step": 2520, "step_time": 2.0058862550140475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3952972418628633, "epoch": 0.02521, "frac_reward_zero_std": 0.75, "grad_norm": 0.006017811596393585, "kl": 0.6368974596261978, "learning_rate": 9.997142470879054e-06, "loss": -0.0172, "num_tokens": 21474730.0, "reward": 0.732326865196228, "reward_std": 0.0013598214136436582, "rewards/rollout_reward_func/mean": 0.732326865196228, "rewards/rollout_reward_func/std": 0.18792155385017395, "sampling/importance_sampling_ratio/max": 0.998545229434967, "sampling/importance_sampling_ratio/mean": 0.9626758098602295, "sampling/importance_sampling_ratio/min": 2.3710483219474554e-05, "sampling/sampling_logp_difference/max": 2.2607836723327637, "sampling/sampling_logp_difference/mean": 0.05499214306473732, "step": 2521, "step_time": 4.130802381012472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3948880354873836, "epoch": 0.02522, "grad_norm": 0.005828545894473791, "kl": 0.6352761164307594, "learning_rate": 9.9971401708865e-06, "loss": -0.0172, "step": 2522, "step_time": 2.0001678930129856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 148.625, "completions/mean_terminated_length": 148.625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.46734262770041823, "epoch": 0.02523, "frac_reward_zero_std": 0.5, "grad_norm": 0.02997651882469654, "kl": 0.6830167882144451, "learning_rate": 9.997137869969055e-06, "loss": -0.0363, "num_tokens": 21490254.0, "reward": 0.7865865230560303, "reward_std": 0.16956965625286102, "rewards/rollout_reward_func/mean": 0.7865865230560303, "rewards/rollout_reward_func/std": 0.44404178857803345, "sampling/importance_sampling_ratio/max": 0.998588502407074, "sampling/importance_sampling_ratio/mean": 0.933171272277832, "sampling/importance_sampling_ratio/min": 0.000479774345876649, "sampling/sampling_logp_difference/max": 2.318150043487549, "sampling/sampling_logp_difference/mean": 0.08763030171394348, "step": 2523, "step_time": 3.9251691949830274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4671711279079318, "epoch": 0.02524, "grad_norm": 0.027586832642555237, "kl": 0.6719244383275509, "learning_rate": 9.997135568126718e-06, "loss": -0.0363, "step": 2524, "step_time": 2.9120198089949554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3570413989946246, "epoch": 0.02525, "frac_reward_zero_std": 0.75, "grad_norm": 0.005413261242210865, "kl": 0.4924246408045292, "learning_rate": 9.997133265359486e-06, "loss": -0.0179, "num_tokens": 21505238.0, "reward": 0.6350192427635193, "reward_std": 0.0176776684820652, "rewards/rollout_reward_func/mean": 0.6350192427635193, "rewards/rollout_reward_func/std": 0.17659719288349152, "sampling/importance_sampling_ratio/max": 0.9982599020004272, "sampling/importance_sampling_ratio/mean": 0.9632086157798767, "sampling/importance_sampling_ratio/min": 0.00022542051738128066, "sampling/sampling_logp_difference/max": 1.5569214820861816, "sampling/sampling_logp_difference/mean": 0.06298476457595825, "step": 2525, "step_time": 4.037779034995765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35811473010107875, "epoch": 0.02526, "grad_norm": 0.005681571085005999, "kl": 0.49247052147984505, "learning_rate": 9.997130961667362e-06, "loss": -0.0179, "step": 2526, "step_time": 2.045335568000155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3870711596682668, "epoch": 0.02527, "frac_reward_zero_std": 0.75, "grad_norm": 0.004354312550276518, "kl": 0.5561546832323074, "learning_rate": 9.99712865705035e-06, "loss": -0.027, "num_tokens": 21520956.0, "reward": 0.7007355690002441, "reward_std": 0.04578516632318497, "rewards/rollout_reward_func/mean": 0.7007355690002441, "rewards/rollout_reward_func/std": 0.1307285726070404, "sampling/importance_sampling_ratio/max": 0.9989371299743652, "sampling/importance_sampling_ratio/mean": 0.9645142555236816, "sampling/importance_sampling_ratio/min": 5.507310163466173e-10, "sampling/sampling_logp_difference/max": 2.1426198482513428, "sampling/sampling_logp_difference/mean": 0.10319390147924423, "step": 2527, "step_time": 4.600061981989711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38640338834375143, "epoch": 0.02528, "grad_norm": 0.00444926368072629, "kl": 0.5561994537711143, "learning_rate": 9.997126351508447e-06, "loss": -0.027, "step": 2528, "step_time": 2.0857803910039365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2292260411195457, "epoch": 0.02529, "frac_reward_zero_std": 0.75, "grad_norm": 0.0032271184027194977, "kl": 0.46936821192502975, "learning_rate": 9.997124045041651e-06, "loss": -0.0271, "num_tokens": 21537386.0, "reward": 0.8821635246276855, "reward_std": 0.04990541934967041, "rewards/rollout_reward_func/mean": 0.8821635246276855, "rewards/rollout_reward_func/std": 0.25816917419433594, "sampling/importance_sampling_ratio/max": 0.998041570186615, "sampling/importance_sampling_ratio/mean": 0.9642535448074341, "sampling/importance_sampling_ratio/min": 0.006425048224627972, "sampling/sampling_logp_difference/max": 1.831050157546997, "sampling/sampling_logp_difference/mean": 0.02911558374762535, "step": 2529, "step_time": 4.0019894030047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22816296434029937, "epoch": 0.0253, "grad_norm": 0.0032265877816826105, "kl": 0.46790771558880806, "learning_rate": 9.997121737649966e-06, "loss": -0.0271, "step": 2530, "step_time": 2.889543170989782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7350387745536864, "epoch": 0.02531, "frac_reward_zero_std": 0.5, "grad_norm": 0.005533947609364986, "kl": 0.7100449167191982, "learning_rate": 9.997119429333394e-06, "loss": -0.0362, "num_tokens": 21552106.0, "reward": 0.8338461518287659, "reward_std": 0.06962281465530396, "rewards/rollout_reward_func/mean": 0.8338461518287659, "rewards/rollout_reward_func/std": 0.20891745388507843, "sampling/importance_sampling_ratio/max": 1.0003594160079956, "sampling/importance_sampling_ratio/mean": 0.9339194297790527, "sampling/importance_sampling_ratio/min": 1.2475288653200915e-15, "sampling/sampling_logp_difference/max": 3.2921719551086426, "sampling/sampling_logp_difference/mean": 0.20512887835502625, "step": 2531, "step_time": 4.162025581987109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.735308192204684, "epoch": 0.02532, "grad_norm": 0.005418307147920132, "kl": 0.7042564377188683, "learning_rate": 9.997117120091933e-06, "loss": -0.0361, "step": 2532, "step_time": 2.0133434889939963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.10393003188073635, "epoch": 0.02533, "frac_reward_zero_std": 0.75, "grad_norm": 0.04290428385138512, "kl": 0.6905321516096592, "learning_rate": 9.997114809925584e-06, "loss": -0.0334, "num_tokens": 21568330.0, "reward": 0.9622533917427063, "reward_std": 0.010270735248923302, "rewards/rollout_reward_func/mean": 0.9622533917427063, "rewards/rollout_reward_func/std": 0.3473912179470062, "sampling/importance_sampling_ratio/max": 0.9999783039093018, "sampling/importance_sampling_ratio/mean": 0.9667328596115112, "sampling/importance_sampling_ratio/min": 0.045186467468738556, "sampling/sampling_logp_difference/max": 2.792734146118164, "sampling/sampling_logp_difference/mean": 0.014356040395796299, "step": 2533, "step_time": 4.4919465349958045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10532266972586513, "epoch": 0.02534, "grad_norm": 0.03945079445838928, "kl": 0.6765199787914753, "learning_rate": 9.997112498834348e-06, "loss": -0.0335, "step": 2534, "step_time": 2.0566878730023745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.25833718944340944, "epoch": 0.02535, "frac_reward_zero_std": 0.75, "grad_norm": 0.012109518982470036, "kl": 0.5144297108054161, "learning_rate": 9.997110186818224e-06, "loss": 0.0203, "num_tokens": 21583074.0, "reward": 0.5608654022216797, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.5608654022216797, "rewards/rollout_reward_func/std": 0.12777206301689148, "sampling/importance_sampling_ratio/max": 0.9998698234558105, "sampling/importance_sampling_ratio/mean": 0.9649627208709717, "sampling/importance_sampling_ratio/min": 0.019120166078209877, "sampling/sampling_logp_difference/max": 1.847531795501709, "sampling/sampling_logp_difference/mean": 0.03389740735292435, "step": 2535, "step_time": 3.8173918400061666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2598951067775488, "epoch": 0.02536, "grad_norm": 0.011768106371164322, "kl": 0.4994403049349785, "learning_rate": 9.997107873877216e-06, "loss": 0.0202, "step": 2536, "step_time": 2.906061205001606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 104.65625, "completions/mean_terminated_length": 104.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3627389641478658, "epoch": 0.02537, "frac_reward_zero_std": 0.75, "grad_norm": 0.009280560538172722, "kl": 0.4938751272857189, "learning_rate": 9.997105560011322e-06, "loss": 0.0205, "num_tokens": 21597223.0, "reward": 0.7248702049255371, "reward_std": 0.02791711874306202, "rewards/rollout_reward_func/mean": 0.7248702049255371, "rewards/rollout_reward_func/std": 0.22749817371368408, "sampling/importance_sampling_ratio/max": 0.9994677305221558, "sampling/importance_sampling_ratio/mean": 0.9656776189804077, "sampling/importance_sampling_ratio/min": 0.0002521712158340961, "sampling/sampling_logp_difference/max": 1.4741361141204834, "sampling/sampling_logp_difference/mean": 0.0628037378191948, "step": 2537, "step_time": 3.907411027998023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36292859772220254, "epoch": 0.02538, "grad_norm": 0.008818457834422588, "kl": 0.4876616299152374, "learning_rate": 9.997103245220542e-06, "loss": 0.0205, "step": 2538, "step_time": 2.0497223759957706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 121.96875, "completions/mean_terminated_length": 121.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4244437087327242, "epoch": 0.02539, "frac_reward_zero_std": 0.75, "grad_norm": 0.00412483885884285, "kl": 0.6076600402593613, "learning_rate": 9.997100929504879e-06, "loss": -0.0271, "num_tokens": 21611982.0, "reward": 0.8053653836250305, "reward_std": 0.05858106538653374, "rewards/rollout_reward_func/mean": 0.8053653836250305, "rewards/rollout_reward_func/std": 0.3110952079296112, "sampling/importance_sampling_ratio/max": 0.9993632435798645, "sampling/importance_sampling_ratio/mean": 0.9633818864822388, "sampling/importance_sampling_ratio/min": 1.783621514372946e-11, "sampling/sampling_logp_difference/max": 4.332608222961426, "sampling/sampling_logp_difference/mean": 0.14692635834217072, "step": 2539, "step_time": 3.9265149109996855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42767919041216373, "epoch": 0.0254, "grad_norm": 0.003913864493370056, "kl": 0.6049055904150009, "learning_rate": 9.997098612864332e-06, "loss": -0.0271, "step": 2540, "step_time": 2.020227071996487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 149.53125, "completions/mean_terminated_length": 149.53125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.32008253829553723, "epoch": 0.02541, "frac_reward_zero_std": 0.75, "grad_norm": 0.007113027852028608, "kl": 0.6232487335801125, "learning_rate": 9.997096295298901e-06, "loss": -0.0173, "num_tokens": 21627535.0, "reward": 0.8003557324409485, "reward_std": 0.04011470824480057, "rewards/rollout_reward_func/mean": 0.8003557324409485, "rewards/rollout_reward_func/std": 0.3654707670211792, "sampling/importance_sampling_ratio/max": 0.9982494711875916, "sampling/importance_sampling_ratio/mean": 0.9634256958961487, "sampling/importance_sampling_ratio/min": 0.0013292260700836778, "sampling/sampling_logp_difference/max": 2.0535922050476074, "sampling/sampling_logp_difference/mean": 0.042232632637023926, "step": 2541, "step_time": 4.402272287996311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.318592787720263, "epoch": 0.02542, "grad_norm": 0.006655695382505655, "kl": 0.6172487512230873, "learning_rate": 9.997093976808587e-06, "loss": -0.0173, "step": 2542, "step_time": 2.4387016219916404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 78.8125, "completions/mean_terminated_length": 78.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.300580739043653, "epoch": 0.02543, "frac_reward_zero_std": 0.75, "grad_norm": 0.006626773159950972, "kl": 0.6555883213877678, "learning_rate": 9.997091657393394e-06, "loss": -0.0079, "num_tokens": 21640945.0, "reward": 0.7025673389434814, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.7025673389434814, "rewards/rollout_reward_func/std": 0.19337572157382965, "sampling/importance_sampling_ratio/max": 0.9957369565963745, "sampling/importance_sampling_ratio/mean": 0.9603012800216675, "sampling/importance_sampling_ratio/min": 0.016296450048685074, "sampling/sampling_logp_difference/max": 2.1107187271118164, "sampling/sampling_logp_difference/mean": 0.04721472039818764, "step": 2543, "step_time": 3.6230541850236477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3008805923163891, "epoch": 0.02544, "grad_norm": 0.006205336190760136, "kl": 0.6579228043556213, "learning_rate": 9.997089337053317e-06, "loss": -0.0079, "step": 2544, "step_time": 1.9990993849933147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 145.90625, "completions/mean_terminated_length": 145.90625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.22742228768765926, "epoch": 0.02545, "frac_reward_zero_std": 0.75, "grad_norm": 0.026985105127096176, "kl": 0.5582432374358177, "learning_rate": 9.997087015788358e-06, "loss": 0.0195, "num_tokens": 21656350.0, "reward": 0.8445384502410889, "reward_std": 0.01631784811615944, "rewards/rollout_reward_func/mean": 0.8445384502410889, "rewards/rollout_reward_func/std": 0.3565260171890259, "sampling/importance_sampling_ratio/max": 1.0009171962738037, "sampling/importance_sampling_ratio/mean": 0.9660283327102661, "sampling/importance_sampling_ratio/min": 0.07382191717624664, "sampling/sampling_logp_difference/max": 1.350954294204712, "sampling/sampling_logp_difference/mean": 0.021497463807463646, "step": 2545, "step_time": 4.174044051011151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2276093950495124, "epoch": 0.02546, "grad_norm": 0.027399761602282524, "kl": 0.5579355508089066, "learning_rate": 9.99708469359852e-06, "loss": 0.0195, "step": 2546, "step_time": 2.0422080760108656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 144.78125, "completions/mean_terminated_length": 144.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.47089905850589275, "epoch": 0.02547, "frac_reward_zero_std": 0.5, "grad_norm": 0.0075420658104121685, "kl": 0.5133271180093288, "learning_rate": 9.997082370483802e-06, "loss": 0.0021, "num_tokens": 21671663.0, "reward": 0.8111538887023926, "reward_std": 0.03807498514652252, "rewards/rollout_reward_func/mean": 0.8111538887023926, "rewards/rollout_reward_func/std": 0.371636301279068, "sampling/importance_sampling_ratio/max": 0.9964609742164612, "sampling/importance_sampling_ratio/mean": 0.9322603940963745, "sampling/importance_sampling_ratio/min": 0.002973572351038456, "sampling/sampling_logp_difference/max": 1.6584453582763672, "sampling/sampling_logp_difference/mean": 0.07388380914926529, "step": 2547, "step_time": 4.545091550993675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4704573703929782, "epoch": 0.02548, "grad_norm": 0.007071978412568569, "kl": 0.5048513747751713, "learning_rate": 9.997080046444205e-06, "loss": 0.002, "step": 2548, "step_time": 2.416564144012227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 122.96875, "completions/mean_terminated_length": 122.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3272625021636486, "epoch": 0.02549, "frac_reward_zero_std": 0.75, "grad_norm": 0.005097489804029465, "kl": 0.5862706899642944, "learning_rate": 9.99707772147973e-06, "loss": -0.0175, "num_tokens": 21686366.0, "reward": 0.5968749523162842, "reward_std": 0.04038666933774948, "rewards/rollout_reward_func/mean": 0.5968749523162842, "rewards/rollout_reward_func/std": 0.21668219566345215, "sampling/importance_sampling_ratio/max": 0.9999958872795105, "sampling/importance_sampling_ratio/mean": 0.963457465171814, "sampling/importance_sampling_ratio/min": 0.0022127688862383366, "sampling/sampling_logp_difference/max": 2.092799663543701, "sampling/sampling_logp_difference/mean": 0.05031222105026245, "step": 2549, "step_time": 3.76385840200237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32965332362800837, "epoch": 0.0255, "grad_norm": 0.005119094159454107, "kl": 0.5842222198843956, "learning_rate": 9.997075395590376e-06, "loss": -0.0175, "step": 2550, "step_time": 1.9720674339987454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 164.71875, "completions/mean_terminated_length": 164.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.485179846175015, "epoch": 0.02551, "frac_reward_zero_std": 0.5, "grad_norm": 0.03557833656668663, "kl": 0.5316490344703197, "learning_rate": 9.997073068776145e-06, "loss": -0.0516, "num_tokens": 21702405.0, "reward": 0.7388509511947632, "reward_std": 0.04382701590657234, "rewards/rollout_reward_func/mean": 0.7388509511947632, "rewards/rollout_reward_func/std": 0.3109537661075592, "sampling/importance_sampling_ratio/max": 0.997382640838623, "sampling/importance_sampling_ratio/mean": 0.9059423208236694, "sampling/importance_sampling_ratio/min": 0.007380051538348198, "sampling/sampling_logp_difference/max": 2.068897008895874, "sampling/sampling_logp_difference/mean": 0.059541232883930206, "step": 2551, "step_time": 3.8319885050150333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48505418561398983, "epoch": 0.02552, "grad_norm": 0.033698879182338715, "kl": 0.5383177101612091, "learning_rate": 9.997070741037037e-06, "loss": -0.0517, "step": 2552, "step_time": 2.008792723994702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 119.1875, "completions/mean_terminated_length": 119.1875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.35727469250559807, "epoch": 0.02553, "frac_reward_zero_std": 0.75, "grad_norm": 0.009604813531041145, "kl": 0.6791299134492874, "learning_rate": 9.997068412373053e-06, "loss": -0.0144, "num_tokens": 21716955.0, "reward": 0.6704807281494141, "reward_std": 0.031275875866413116, "rewards/rollout_reward_func/mean": 0.6704807281494141, "rewards/rollout_reward_func/std": 0.17630831897258759, "sampling/importance_sampling_ratio/max": 0.9959151744842529, "sampling/importance_sampling_ratio/mean": 0.9286762475967407, "sampling/importance_sampling_ratio/min": 2.690118350301418e-09, "sampling/sampling_logp_difference/max": 17.27167510986328, "sampling/sampling_logp_difference/mean": 0.2030792385339737, "step": 2553, "step_time": 4.33346739999979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35748607758432627, "epoch": 0.02554, "grad_norm": 0.009073798544704914, "kl": 0.6736406534910202, "learning_rate": 9.997066082784192e-06, "loss": -0.0144, "step": 2554, "step_time": 2.4546176079966244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.39101514779031277, "epoch": 0.02555, "frac_reward_zero_std": 0.5, "grad_norm": 0.018636051565408707, "kl": 0.45678336918354034, "learning_rate": 9.997063752270457e-06, "loss": -0.0639, "num_tokens": 21734087.0, "reward": 1.02333664894104, "reward_std": 0.022437047213315964, "rewards/rollout_reward_func/mean": 1.02333664894104, "rewards/rollout_reward_func/std": 0.30850380659103394, "sampling/importance_sampling_ratio/max": 0.9963520765304565, "sampling/importance_sampling_ratio/mean": 0.9319133758544922, "sampling/importance_sampling_ratio/min": 0.0008758275071159005, "sampling/sampling_logp_difference/max": 1.9812602996826172, "sampling/sampling_logp_difference/mean": 0.05423366278409958, "step": 2555, "step_time": 4.312695656000869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39743449073284864, "epoch": 0.02556, "grad_norm": 0.018355241045355797, "kl": 0.4578031823039055, "learning_rate": 9.997061420831846e-06, "loss": -0.0639, "step": 2556, "step_time": 2.0600851499984856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 217.40625, "completions/mean_terminated_length": 217.40625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4589048153720796, "epoch": 0.02557, "frac_reward_zero_std": 0.5, "grad_norm": 0.013277382589876652, "kl": 0.44767094403505325, "learning_rate": 9.997059088468362e-06, "loss": -0.0651, "num_tokens": 21751836.0, "reward": 0.843548059463501, "reward_std": 0.13421431183815002, "rewards/rollout_reward_func/mean": 0.843548059463501, "rewards/rollout_reward_func/std": 0.564426064491272, "sampling/importance_sampling_ratio/max": 0.9953838586807251, "sampling/importance_sampling_ratio/mean": 0.9296896457672119, "sampling/importance_sampling_ratio/min": 0.00039606852806173265, "sampling/sampling_logp_difference/max": 2.3316094875335693, "sampling/sampling_logp_difference/mean": 0.06657104194164276, "step": 2557, "step_time": 4.446145667003293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4551474996842444, "epoch": 0.02558, "grad_norm": 0.012876644730567932, "kl": 0.4445347636938095, "learning_rate": 9.997056755180006e-06, "loss": -0.0651, "step": 2558, "step_time": 2.0646542069953284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6512956274673343, "epoch": 0.02559, "frac_reward_zero_std": 0.5, "grad_norm": 0.010337518528103828, "kl": 0.6255622021853924, "learning_rate": 9.997054420966776e-06, "loss": -0.0409, "num_tokens": 21765960.0, "reward": 0.8456250429153442, "reward_std": 0.047863177955150604, "rewards/rollout_reward_func/mean": 0.8456250429153442, "rewards/rollout_reward_func/std": 0.2850984036922455, "sampling/importance_sampling_ratio/max": 1.0003418922424316, "sampling/importance_sampling_ratio/mean": 0.906329870223999, "sampling/importance_sampling_ratio/min": 0.00032366334926337004, "sampling/sampling_logp_difference/max": 2.191351890563965, "sampling/sampling_logp_difference/mean": 0.11888957768678665, "step": 2559, "step_time": 4.220127997010422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6484061097726226, "epoch": 0.0256, "grad_norm": 0.00930294580757618, "kl": 0.6434941105544567, "learning_rate": 9.997052085828673e-06, "loss": -0.0409, "step": 2560, "step_time": 2.411436952002987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08050571475178003, "epoch": 0.02561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005190796800889075, "kl": 0.5605266056954861, "learning_rate": 9.9970497497657e-06, "loss": 0.0014, "num_tokens": 21780808.0, "reward": 1.0099999904632568, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0099999904632568, "rewards/rollout_reward_func/std": 0.1670859009027481, "sampling/importance_sampling_ratio/max": 0.9990299344062805, "sampling/importance_sampling_ratio/mean": 0.9932597279548645, "sampling/importance_sampling_ratio/min": 0.9837760329246521, "sampling/sampling_logp_difference/max": 0.014699205756187439, "sampling/sampling_logp_difference/mean": 0.001997765153646469, "step": 2561, "step_time": 4.17945214498468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07916158065199852, "epoch": 0.02562, "grad_norm": 0.0005173755926080048, "kl": 0.5607507564127445, "learning_rate": 9.997047412777854e-06, "loss": 0.0014, "step": 2562, "step_time": 1.9764309199890704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.17534167598932981, "epoch": 0.02563, "frac_reward_zero_std": 0.75, "grad_norm": 0.017875224351882935, "kl": 0.5486653596162796, "learning_rate": 9.997045074865137e-06, "loss": -0.0259, "num_tokens": 21795552.0, "reward": 0.6476442813873291, "reward_std": 0.008838837966322899, "rewards/rollout_reward_func/mean": 0.6476442813873291, "rewards/rollout_reward_func/std": 0.096989206969738, "sampling/importance_sampling_ratio/max": 0.9978815913200378, "sampling/importance_sampling_ratio/mean": 0.9644724726676941, "sampling/importance_sampling_ratio/min": 0.03474032133817673, "sampling/sampling_logp_difference/max": 2.2653517723083496, "sampling/sampling_logp_difference/mean": 0.024020163342356682, "step": 2563, "step_time": 4.051742737989116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17133334837853909, "epoch": 0.02564, "grad_norm": 0.01790943183004856, "kl": 0.5503277778625488, "learning_rate": 9.99704273602755e-06, "loss": -0.0259, "step": 2564, "step_time": 2.0337194870153326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.6875, "completions/mean_terminated_length": 123.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.497486992739141, "epoch": 0.02565, "frac_reward_zero_std": 0.75, "grad_norm": 0.0034892556723207235, "kl": 0.613032877445221, "learning_rate": 9.997040396265096e-06, "loss": -0.0175, "num_tokens": 21810222.0, "reward": 0.6183173060417175, "reward_std": 0.015637937933206558, "rewards/rollout_reward_func/mean": 0.6183173060417175, "rewards/rollout_reward_func/std": 0.21968773007392883, "sampling/importance_sampling_ratio/max": 0.9973575472831726, "sampling/importance_sampling_ratio/mean": 0.9624422788619995, "sampling/importance_sampling_ratio/min": 6.535305865597706e-17, "sampling/sampling_logp_difference/max": 4.142054557800293, "sampling/sampling_logp_difference/mean": 0.20242276787757874, "step": 2565, "step_time": 4.5094894130161265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4960270933806896, "epoch": 0.02566, "grad_norm": 0.0033309038262814283, "kl": 0.6145973652601242, "learning_rate": 9.997038055577772e-06, "loss": -0.0175, "step": 2566, "step_time": 2.462465897006041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 165.03125, "completions/mean_terminated_length": 165.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6300068879500031, "epoch": 0.02567, "frac_reward_zero_std": 0.5, "grad_norm": 0.047436852008104324, "kl": 0.4825340174138546, "learning_rate": 9.997035713965579e-06, "loss": -0.002, "num_tokens": 21826183.0, "reward": 0.6504374742507935, "reward_std": 0.02745090425014496, "rewards/rollout_reward_func/mean": 0.6504374742507935, "rewards/rollout_reward_func/std": 0.29636725783348083, "sampling/importance_sampling_ratio/max": 0.9996197819709778, "sampling/importance_sampling_ratio/mean": 0.9074221849441528, "sampling/importance_sampling_ratio/min": 2.573889560153475e-06, "sampling/sampling_logp_difference/max": 3.840460777282715, "sampling/sampling_logp_difference/mean": 0.12594148516654968, "step": 2567, "step_time": 4.206323224003427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6308087529614568, "epoch": 0.02568, "grad_norm": 0.050057511776685715, "kl": 0.47726016119122505, "learning_rate": 9.99703337142852e-06, "loss": -0.0021, "step": 2568, "step_time": 2.0449269259916036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 103.625, "completions/mean_terminated_length": 100.70967102050781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5293891066685319, "epoch": 0.02569, "frac_reward_zero_std": 0.75, "grad_norm": 0.005097043234854937, "kl": 0.6377602107822895, "learning_rate": 9.997031027966593e-06, "loss": -0.0273, "num_tokens": 21840411.0, "reward": 0.606249988079071, "reward_std": 0.02474873699247837, "rewards/rollout_reward_func/mean": 0.606249988079071, "rewards/rollout_reward_func/std": 0.21572674810886383, "sampling/importance_sampling_ratio/max": 0.998816967010498, "sampling/importance_sampling_ratio/mean": 0.9617215394973755, "sampling/importance_sampling_ratio/min": 9.500194127752438e-30, "sampling/sampling_logp_difference/max": 3.0083107948303223, "sampling/sampling_logp_difference/mean": 0.37835201621055603, "step": 2569, "step_time": 3.9198405239876593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5301686809398234, "epoch": 0.0257, "grad_norm": 0.0049823252484202385, "kl": 0.6378570310771465, "learning_rate": 9.997028683579798e-06, "loss": -0.0273, "step": 2570, "step_time": 2.4769528649994754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 190.84375, "completions/mean_terminated_length": 190.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22377102402970195, "epoch": 0.02571, "frac_reward_zero_std": 0.5, "grad_norm": 0.008782991208136082, "kl": 0.6323908381164074, "learning_rate": 9.997026338268139e-06, "loss": -0.0705, "num_tokens": 21857254.0, "reward": 0.9799278974533081, "reward_std": 0.30491259694099426, "rewards/rollout_reward_func/mean": 0.9799278974533081, "rewards/rollout_reward_func/std": 0.5382915735244751, "sampling/importance_sampling_ratio/max": 1.001035451889038, "sampling/importance_sampling_ratio/mean": 0.9367056488990784, "sampling/importance_sampling_ratio/min": 0.02567516267299652, "sampling/sampling_logp_difference/max": 2.4718050956726074, "sampling/sampling_logp_difference/mean": 0.02740670181810856, "step": 2571, "step_time": 4.453307320000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22421947540715337, "epoch": 0.02572, "grad_norm": 0.008279126137495041, "kl": 0.6283896416425705, "learning_rate": 9.997023992031614e-06, "loss": -0.0705, "step": 2572, "step_time": 2.4424870259899762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 102.21875, "completions/mean_terminated_length": 102.21875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5804889681749046, "epoch": 0.02573, "frac_reward_zero_std": 0.5, "grad_norm": 0.004689424764364958, "kl": 0.5091192200779915, "learning_rate": 9.997021644870223e-06, "loss": -0.0369, "num_tokens": 21871205.0, "reward": 0.5834134817123413, "reward_std": 0.008838832378387451, "rewards/rollout_reward_func/mean": 0.5834134817123413, "rewards/rollout_reward_func/std": 0.20896227657794952, "sampling/importance_sampling_ratio/max": 1.0024806261062622, "sampling/importance_sampling_ratio/mean": 0.9350413680076599, "sampling/importance_sampling_ratio/min": 0.0008967566536739469, "sampling/sampling_logp_difference/max": 2.0623514652252197, "sampling/sampling_logp_difference/mean": 0.10217860341072083, "step": 2573, "step_time": 3.743995114004065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5804580063559115, "epoch": 0.02574, "grad_norm": 0.004980729427188635, "kl": 0.511802189052105, "learning_rate": 9.99701929678397e-06, "loss": -0.0369, "step": 2574, "step_time": 2.005923362012254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 129.28125, "completions/mean_terminated_length": 129.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.39859566325321794, "epoch": 0.02575, "frac_reward_zero_std": 0.75, "grad_norm": 0.0048449416644871235, "kl": 0.6858264319598675, "learning_rate": 9.997016947772852e-06, "loss": -0.0078, "num_tokens": 21886166.0, "reward": 0.8149712085723877, "reward_std": 0.0673111230134964, "rewards/rollout_reward_func/mean": 0.8149712085723877, "rewards/rollout_reward_func/std": 0.3898696005344391, "sampling/importance_sampling_ratio/max": 0.9999810457229614, "sampling/importance_sampling_ratio/mean": 0.965308666229248, "sampling/importance_sampling_ratio/min": 0.0011658088769763708, "sampling/sampling_logp_difference/max": 1.9830167293548584, "sampling/sampling_logp_difference/mean": 0.04166145995259285, "step": 2575, "step_time": 4.10092044799967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39785093907266855, "epoch": 0.02576, "grad_norm": 0.004834089428186417, "kl": 0.6863474808633327, "learning_rate": 9.997014597836872e-06, "loss": -0.0078, "step": 2576, "step_time": 2.5203830099926563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 188.03125, "completions/mean_terminated_length": 188.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24452911177650094, "epoch": 0.02577, "frac_reward_zero_std": 0.75, "grad_norm": 0.006886584684252739, "kl": 0.4465089961886406, "learning_rate": 9.997012246976028e-06, "loss": -0.0268, "num_tokens": 21902919.0, "reward": 0.5842981338500977, "reward_std": 0.029508108273148537, "rewards/rollout_reward_func/mean": 0.5842981338500977, "rewards/rollout_reward_func/std": 0.28366321325302124, "sampling/importance_sampling_ratio/max": 0.9967246055603027, "sampling/importance_sampling_ratio/mean": 0.9638720750808716, "sampling/importance_sampling_ratio/min": 0.025876201689243317, "sampling/sampling_logp_difference/max": 1.6101698875427246, "sampling/sampling_logp_difference/mean": 0.026030199602246284, "step": 2577, "step_time": 4.953748273983365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2463702172972262, "epoch": 0.02578, "grad_norm": 0.007057597395032644, "kl": 0.4426877200603485, "learning_rate": 9.997009895190323e-06, "loss": -0.0268, "step": 2578, "step_time": 2.053265485999873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.053689382039010525, "epoch": 0.02579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005359250353649259, "kl": 0.3944665640592575, "learning_rate": 9.997007542479758e-06, "loss": 0.0016, "num_tokens": 21920615.0, "reward": 0.9502307176589966, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9502307176589966, "rewards/rollout_reward_func/std": 0.32807812094688416, "sampling/importance_sampling_ratio/max": 0.9986534118652344, "sampling/importance_sampling_ratio/mean": 0.9946275949478149, "sampling/importance_sampling_ratio/min": 0.9903400540351868, "sampling/sampling_logp_difference/max": 0.005029712803661823, "sampling/sampling_logp_difference/mean": 0.0009382455609738827, "step": 2579, "step_time": 4.366274173997226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05345040513202548, "epoch": 0.0258, "grad_norm": 0.0005282877245917916, "kl": 0.39450592920184135, "learning_rate": 9.99700518884433e-06, "loss": 0.0016, "step": 2580, "step_time": 2.0423190540095675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3956876527518034, "epoch": 0.02581, "frac_reward_zero_std": 0.5, "grad_norm": 0.013254703022539616, "kl": 0.4737202674150467, "learning_rate": 9.997002834284042e-06, "loss": -0.0553, "num_tokens": 21937199.0, "reward": 0.7500048279762268, "reward_std": 0.034036312252283096, "rewards/rollout_reward_func/mean": 0.7500048279762268, "rewards/rollout_reward_func/std": 0.19932889938354492, "sampling/importance_sampling_ratio/max": 1.0014400482177734, "sampling/importance_sampling_ratio/mean": 0.9345115423202515, "sampling/importance_sampling_ratio/min": 0.005765162408351898, "sampling/sampling_logp_difference/max": 1.66304349899292, "sampling/sampling_logp_difference/mean": 0.05088166519999504, "step": 2581, "step_time": 4.449350021015562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39792492613196373, "epoch": 0.02582, "grad_norm": 0.01212962158024311, "kl": 0.48522963374853134, "learning_rate": 9.997000478798895e-06, "loss": -0.0553, "step": 2582, "step_time": 2.517891810995934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5084023023955524, "epoch": 0.02583, "frac_reward_zero_std": 0.5, "grad_norm": 0.04216751456260681, "kl": 0.5704907029867172, "learning_rate": 9.99699812238889e-06, "loss": -0.0266, "num_tokens": 21954283.0, "reward": 0.6329711675643921, "reward_std": 0.01920214667916298, "rewards/rollout_reward_func/mean": 0.6329711675643921, "rewards/rollout_reward_func/std": 0.166029155254364, "sampling/importance_sampling_ratio/max": 0.9982395172119141, "sampling/importance_sampling_ratio/mean": 0.903860330581665, "sampling/importance_sampling_ratio/min": 0.0021311298478394747, "sampling/sampling_logp_difference/max": 1.8322914838790894, "sampling/sampling_logp_difference/mean": 0.06525430828332901, "step": 2583, "step_time": 4.813913417994627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.520525359082967, "epoch": 0.02584, "grad_norm": 0.03852273151278496, "kl": 0.5718678310513496, "learning_rate": 9.996995765054025e-06, "loss": -0.0266, "step": 2584, "step_time": 2.068975935995695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.8785260883159935, "epoch": 0.02585, "frac_reward_zero_std": 0.25, "grad_norm": 0.01005402859300375, "kl": 0.5665176212787628, "learning_rate": 9.996993406794304e-06, "loss": -0.0676, "num_tokens": 21971491.0, "reward": 0.8475817441940308, "reward_std": 0.05432598665356636, "rewards/rollout_reward_func/mean": 0.8475817441940308, "rewards/rollout_reward_func/std": 0.4523457884788513, "sampling/importance_sampling_ratio/max": 0.9986035227775574, "sampling/importance_sampling_ratio/mean": 0.8715898990631104, "sampling/importance_sampling_ratio/min": 4.0843471538209997e-07, "sampling/sampling_logp_difference/max": 2.1724796295166016, "sampling/sampling_logp_difference/mean": 0.15907332301139832, "step": 2585, "step_time": 4.45915920301195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.882673401851207, "epoch": 0.02586, "grad_norm": 0.010600592941045761, "kl": 0.5727724321186543, "learning_rate": 9.996991047609723e-06, "loss": -0.0676, "step": 2586, "step_time": 2.074654207994172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05111181130632758, "epoch": 0.02587, "frac_reward_zero_std": 1.0, "grad_norm": 0.00040850104414857924, "kl": 0.43697506934404373, "learning_rate": 9.996988687500287e-06, "loss": 0.0013, "num_tokens": 21987131.0, "reward": 0.8941538333892822, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8941538333892822, "rewards/rollout_reward_func/std": 0.36653512716293335, "sampling/importance_sampling_ratio/max": 0.9990972876548767, "sampling/importance_sampling_ratio/mean": 0.99699866771698, "sampling/importance_sampling_ratio/min": 0.9949939846992493, "sampling/sampling_logp_difference/max": 0.003422277048230171, "sampling/sampling_logp_difference/mean": 0.0006861158180981874, "step": 2587, "step_time": 4.471879515993351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0509025901556015, "epoch": 0.02588, "grad_norm": 0.00040412467205896974, "kl": 0.4369889907538891, "learning_rate": 9.996986326465996e-06, "loss": 0.0013, "step": 2588, "step_time": 1.9769980659984867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.04532085359096527, "epoch": 0.02589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004813561972696334, "kl": 0.39081529527902603, "learning_rate": 9.996983964506848e-06, "loss": 0.0016, "num_tokens": 22004843.0, "reward": 1.100115418434143, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.100115418434143, "rewards/rollout_reward_func/std": 0.15165241062641144, "sampling/importance_sampling_ratio/max": 0.9984144568443298, "sampling/importance_sampling_ratio/mean": 0.9950510263442993, "sampling/importance_sampling_ratio/min": 0.9911834597587585, "sampling/sampling_logp_difference/max": 0.005799995735287666, "sampling/sampling_logp_difference/mean": 0.0008336383616551757, "step": 2589, "step_time": 4.79756635701051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04528716905042529, "epoch": 0.0259, "grad_norm": 0.0004883252549916506, "kl": 0.3908033035695553, "learning_rate": 9.996981601622846e-06, "loss": 0.0016, "step": 2590, "step_time": 2.0360878110077465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8244939539581537, "epoch": 0.02591, "frac_reward_zero_std": 0.25, "grad_norm": 0.009535095654428005, "kl": 0.7203380763530731, "learning_rate": 9.996979237813989e-06, "loss": -0.065, "num_tokens": 22019637.0, "reward": 0.7526971101760864, "reward_std": 0.0982198417186737, "rewards/rollout_reward_func/mean": 0.7526971101760864, "rewards/rollout_reward_func/std": 0.2980044484138489, "sampling/importance_sampling_ratio/max": 1.001418113708496, "sampling/importance_sampling_ratio/mean": 0.9028761982917786, "sampling/importance_sampling_ratio/min": 5.923511126049164e-10, "sampling/sampling_logp_difference/max": 10.427316665649414, "sampling/sampling_logp_difference/mean": 0.2415379285812378, "step": 2591, "step_time": 4.054135738995683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8226715633645654, "epoch": 0.02592, "grad_norm": 0.009143629111349583, "kl": 0.7166844680905342, "learning_rate": 9.996976873080278e-06, "loss": -0.065, "step": 2592, "step_time": 2.0244540730054723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.40168095100671053, "epoch": 0.02593, "frac_reward_zero_std": 0.25, "grad_norm": 0.026906242594122887, "kl": 0.5099778175354004, "learning_rate": 9.996974507421716e-06, "loss": -0.0253, "num_tokens": 22036701.0, "reward": 1.157442331314087, "reward_std": 0.07595957815647125, "rewards/rollout_reward_func/mean": 1.157442331314087, "rewards/rollout_reward_func/std": 0.18859286606311798, "sampling/importance_sampling_ratio/max": 0.9987617135047913, "sampling/importance_sampling_ratio/mean": 0.9080528020858765, "sampling/importance_sampling_ratio/min": 0.0012437072582542896, "sampling/sampling_logp_difference/max": 2.054431438446045, "sampling/sampling_logp_difference/mean": 0.06334276497364044, "step": 2593, "step_time": 4.703210890998889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4015681934542954, "epoch": 0.02594, "grad_norm": 0.025835633277893066, "kl": 0.5174403674900532, "learning_rate": 9.996972140838299e-06, "loss": -0.0254, "step": 2594, "step_time": 2.041581737001252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4248202000744641, "epoch": 0.02595, "frac_reward_zero_std": 0.75, "grad_norm": 0.008639369159936905, "kl": 0.3747796379029751, "learning_rate": 9.996969773330032e-06, "loss": 0.0396, "num_tokens": 22052765.0, "reward": 0.543466329574585, "reward_std": 0.004038667771965265, "rewards/rollout_reward_func/mean": 0.543466329574585, "rewards/rollout_reward_func/std": 0.18356364965438843, "sampling/importance_sampling_ratio/max": 1.0005134344100952, "sampling/importance_sampling_ratio/mean": 0.9656564593315125, "sampling/importance_sampling_ratio/min": 5.765115412310167e-19, "sampling/sampling_logp_difference/max": 3.0183277130126953, "sampling/sampling_logp_difference/mean": 0.16742894053459167, "step": 2595, "step_time": 4.81017376499949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4244162682443857, "epoch": 0.02596, "grad_norm": 0.008588260971009731, "kl": 0.37491900473833084, "learning_rate": 9.996967404896911e-06, "loss": 0.0396, "step": 2596, "step_time": 2.0025661200124887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 84.5, "completions/mean_terminated_length": 84.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6240271562710404, "epoch": 0.02597, "frac_reward_zero_std": 0.5, "grad_norm": 0.0195964016020298, "kl": 0.6649646833539009, "learning_rate": 9.996965035538941e-06, "loss": 0.0298, "num_tokens": 22066325.0, "reward": 0.5885336399078369, "reward_std": 0.03610324487090111, "rewards/rollout_reward_func/mean": 0.5885336399078369, "rewards/rollout_reward_func/std": 0.18043439090251923, "sampling/importance_sampling_ratio/max": 0.9998607039451599, "sampling/importance_sampling_ratio/mean": 0.9328559637069702, "sampling/importance_sampling_ratio/min": 6.53200622764416e-05, "sampling/sampling_logp_difference/max": 2.2955212593078613, "sampling/sampling_logp_difference/mean": 0.12155464291572571, "step": 2597, "step_time": 3.818269050985691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6254756185226142, "epoch": 0.02598, "grad_norm": 0.019318977370858192, "kl": 0.6599715612828732, "learning_rate": 9.99696266525612e-06, "loss": 0.0298, "step": 2598, "step_time": 2.041255882002588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 129.4375, "completions/mean_terminated_length": 129.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.5257347542792559, "epoch": 0.02599, "frac_reward_zero_std": 0.75, "grad_norm": 0.005533427931368351, "kl": 0.4864122346043587, "learning_rate": 9.99696029404845e-06, "loss": -0.0149, "num_tokens": 22081235.0, "reward": 0.6431249380111694, "reward_std": 0.03545885533094406, "rewards/rollout_reward_func/mean": 0.6431249380111694, "rewards/rollout_reward_func/std": 0.190095454454422, "sampling/importance_sampling_ratio/max": 1.0015078783035278, "sampling/importance_sampling_ratio/mean": 0.9366213083267212, "sampling/importance_sampling_ratio/min": 8.12059261079412e-06, "sampling/sampling_logp_difference/max": 1.4438226222991943, "sampling/sampling_logp_difference/mean": 0.10131965577602386, "step": 2599, "step_time": 4.4768658470056835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5231574801728129, "epoch": 0.026, "grad_norm": 0.005486803129315376, "kl": 0.48510997369885445, "learning_rate": 9.99695792191593e-06, "loss": -0.0149, "step": 2600, "step_time": 2.027893793012481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 70.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0882285749539733, "epoch": 0.02601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005329091800376773, "kl": 0.5837553068995476, "learning_rate": 9.996955548858564e-06, "loss": 0.0012, "num_tokens": 22094195.0, "reward": 0.625, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.625, "rewards/rollout_reward_func/std": 0.16719962656497955, "sampling/importance_sampling_ratio/max": 1.0006234645843506, "sampling/importance_sampling_ratio/mean": 0.9951155185699463, "sampling/importance_sampling_ratio/min": 0.9876593947410583, "sampling/sampling_logp_difference/max": 0.011191543191671371, "sampling/sampling_logp_difference/mean": 0.0016943843802437186, "step": 2601, "step_time": 4.031315779997385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09037715755403042, "epoch": 0.02602, "grad_norm": 0.0005535407108254731, "kl": 0.5832861997187138, "learning_rate": 9.996953174876348e-06, "loss": 0.0012, "step": 2602, "step_time": 1.9704041660079383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 105.1875, "completions/mean_terminated_length": 105.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20233626198023558, "epoch": 0.02603, "frac_reward_zero_std": 0.75, "grad_norm": 0.02316387929022312, "kl": 0.4542331397533417, "learning_rate": 9.996950799969284e-06, "loss": 0.0183, "num_tokens": 22108441.0, "reward": 0.5683172941207886, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.5683172941207886, "rewards/rollout_reward_func/std": 0.2671269476413727, "sampling/importance_sampling_ratio/max": 0.9993820190429688, "sampling/importance_sampling_ratio/mean": 0.9666061401367188, "sampling/importance_sampling_ratio/min": 0.04921532794833183, "sampling/sampling_logp_difference/max": 1.322191596031189, "sampling/sampling_logp_difference/mean": 0.023824241012334824, "step": 2603, "step_time": 3.740318265998212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20531571842730045, "epoch": 0.02604, "grad_norm": 0.023128224536776543, "kl": 0.4571482762694359, "learning_rate": 9.996948424137373e-06, "loss": 0.0183, "step": 2604, "step_time": 2.00421804600046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07379714958369732, "epoch": 0.02605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006531854160130024, "kl": 0.4403902590274811, "learning_rate": 9.996946047380619e-06, "loss": 0.0013, "num_tokens": 22123697.0, "reward": 0.5226923227310181, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5226923227310181, "rewards/rollout_reward_func/std": 0.09291035681962967, "sampling/importance_sampling_ratio/max": 0.9996099472045898, "sampling/importance_sampling_ratio/mean": 0.9938598871231079, "sampling/importance_sampling_ratio/min": 0.9883388876914978, "sampling/sampling_logp_difference/max": 0.006594838574528694, "sampling/sampling_logp_difference/mean": 0.0013342206366360188, "step": 2605, "step_time": 4.453490356005204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07707298547029495, "epoch": 0.02606, "grad_norm": 0.0006670531583949924, "kl": 0.43976742029190063, "learning_rate": 9.996943669699017e-06, "loss": 0.0013, "step": 2606, "step_time": 1.9467972320271656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 143.84375, "completions/mean_terminated_length": 143.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6752731846645474, "epoch": 0.02607, "frac_reward_zero_std": 0.5, "grad_norm": 0.015401477925479412, "kl": 0.5677847042679787, "learning_rate": 9.99694129109257e-06, "loss": -0.0287, "num_tokens": 22139036.0, "reward": 0.6930288672447205, "reward_std": 0.06284848600625992, "rewards/rollout_reward_func/mean": 0.6930288672447205, "rewards/rollout_reward_func/std": 0.2845245599746704, "sampling/importance_sampling_ratio/max": 0.9972454309463501, "sampling/importance_sampling_ratio/mean": 0.8760654926300049, "sampling/importance_sampling_ratio/min": 0.03530041128396988, "sampling/sampling_logp_difference/max": 1.71299409866333, "sampling/sampling_logp_difference/mean": 0.08607650548219681, "step": 2607, "step_time": 4.572814635001123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6791931195184588, "epoch": 0.02608, "grad_norm": 0.015141462907195091, "kl": 0.555442564189434, "learning_rate": 9.99693891156128e-06, "loss": -0.0287, "step": 2608, "step_time": 2.0073620050097816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3807446528226137, "epoch": 0.02609, "frac_reward_zero_std": 0.5, "grad_norm": 0.03281637281179428, "kl": 0.9392063245177269, "learning_rate": 9.996936531105144e-06, "loss": -0.0094, "num_tokens": 22153978.0, "reward": 0.6308653950691223, "reward_std": 0.05847228318452835, "rewards/rollout_reward_func/mean": 0.6308653950691223, "rewards/rollout_reward_func/std": 0.14766332507133484, "sampling/importance_sampling_ratio/max": 0.9973583221435547, "sampling/importance_sampling_ratio/mean": 0.9398069977760315, "sampling/importance_sampling_ratio/min": 0.0355420857667923, "sampling/sampling_logp_difference/max": 1.721282720565796, "sampling/sampling_logp_difference/mean": 0.038294240832328796, "step": 2609, "step_time": 3.9634184210008243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3846561573445797, "epoch": 0.0261, "grad_norm": 0.03148285299539566, "kl": 0.9588735401630402, "learning_rate": 9.996934149724167e-06, "loss": -0.0094, "step": 2610, "step_time": 2.030152642997564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.65625, "completions/mean_terminated_length": 216.65625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.6335189188830554, "epoch": 0.02611, "frac_reward_zero_std": 0.5, "grad_norm": 0.030337145552039146, "kl": 0.4740973711013794, "learning_rate": 9.996931767418348e-06, "loss": 0.0006, "num_tokens": 22171647.0, "reward": 0.6675240397453308, "reward_std": 0.021428536623716354, "rewards/rollout_reward_func/mean": 0.6675240397453308, "rewards/rollout_reward_func/std": 0.20286080241203308, "sampling/importance_sampling_ratio/max": 1.0005558729171753, "sampling/importance_sampling_ratio/mean": 0.903053879737854, "sampling/importance_sampling_ratio/min": 0.0005897653754800558, "sampling/sampling_logp_difference/max": 1.4677785634994507, "sampling/sampling_logp_difference/mean": 0.0864817276597023, "step": 2611, "step_time": 4.940009732992621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6318839923478663, "epoch": 0.02612, "grad_norm": 0.03242726996541023, "kl": 0.46775850653648376, "learning_rate": 9.996929384187685e-06, "loss": 0.0007, "step": 2612, "step_time": 2.0484250159934163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.21875, "completions/mean_terminated_length": 195.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.48098273016512394, "epoch": 0.02613, "frac_reward_zero_std": 0.25, "grad_norm": 0.021173248067498207, "kl": 0.5291308052837849, "learning_rate": 9.99692700003218e-06, "loss": -0.0822, "num_tokens": 22188630.0, "reward": 0.41333654522895813, "reward_std": 0.15733125805854797, "rewards/rollout_reward_func/mean": 0.41333654522895813, "rewards/rollout_reward_func/std": 0.29623833298683167, "sampling/importance_sampling_ratio/max": 0.9987358450889587, "sampling/importance_sampling_ratio/mean": 0.8997549414634705, "sampling/importance_sampling_ratio/min": 0.0017353992443531752, "sampling/sampling_logp_difference/max": 2.148747682571411, "sampling/sampling_logp_difference/mean": 0.0686006098985672, "step": 2613, "step_time": 4.888273926000693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48468220606446266, "epoch": 0.02614, "grad_norm": 0.02060435339808464, "kl": 0.5340553484857082, "learning_rate": 9.996924614951837e-06, "loss": -0.0823, "step": 2614, "step_time": 2.048241123004118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 245.40625, "completions/mean_terminated_length": 245.40625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.24530857102945447, "epoch": 0.02615, "frac_reward_zero_std": 0.75, "grad_norm": 0.007592888083308935, "kl": 0.39847465604543686, "learning_rate": 9.996922228946652e-06, "loss": -0.0271, "num_tokens": 22207275.0, "reward": 0.9622500538825989, "reward_std": 0.01115052867680788, "rewards/rollout_reward_func/mean": 0.9622500538825989, "rewards/rollout_reward_func/std": 0.342330664396286, "sampling/importance_sampling_ratio/max": 0.9978851079940796, "sampling/importance_sampling_ratio/mean": 0.9627286195755005, "sampling/importance_sampling_ratio/min": 0.0008078372338786721, "sampling/sampling_logp_difference/max": 1.8587768077850342, "sampling/sampling_logp_difference/mean": 0.03268004581332207, "step": 2615, "step_time": 4.3466850239856285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24851732654497027, "epoch": 0.02616, "grad_norm": 0.006844923831522465, "kl": 0.3992592953145504, "learning_rate": 9.996919842016626e-06, "loss": -0.0271, "step": 2616, "step_time": 2.4849439009994967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 99.8125, "completions/mean_terminated_length": 99.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24926592409610748, "epoch": 0.02617, "frac_reward_zero_std": 0.75, "grad_norm": 0.018890898674726486, "kl": 0.4400184042751789, "learning_rate": 9.996917454161764e-06, "loss": -0.0167, "num_tokens": 22221261.0, "reward": 0.48028847575187683, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.48028847575187683, "rewards/rollout_reward_func/std": 0.11222196370363235, "sampling/importance_sampling_ratio/max": 0.9996982216835022, "sampling/importance_sampling_ratio/mean": 0.9668958783149719, "sampling/importance_sampling_ratio/min": 0.06898420304059982, "sampling/sampling_logp_difference/max": 1.4231042861938477, "sampling/sampling_logp_difference/mean": 0.02302144281566143, "step": 2617, "step_time": 3.6330464589991607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2548137800768018, "epoch": 0.02618, "grad_norm": 0.0189988873898983, "kl": 0.4411264397203922, "learning_rate": 9.99691506538206e-06, "loss": -0.0168, "step": 2618, "step_time": 1.9790125439976691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08450467977672815, "epoch": 0.02619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006244687829166651, "kl": 0.5015834420919418, "learning_rate": 9.996912675677522e-06, "loss": 0.0013, "num_tokens": 22236069.0, "reward": 0.8144230842590332, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8144230842590332, "rewards/rollout_reward_func/std": 0.25548410415649414, "sampling/importance_sampling_ratio/max": 0.9976761937141418, "sampling/importance_sampling_ratio/mean": 0.9912232160568237, "sampling/importance_sampling_ratio/min": 0.984311580657959, "sampling/sampling_logp_difference/max": 0.014502767473459244, "sampling/sampling_logp_difference/mean": 0.002136940136551857, "step": 2619, "step_time": 4.362414683011593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08211649395525455, "epoch": 0.0262, "grad_norm": 0.0006031480152159929, "kl": 0.5021004341542721, "learning_rate": 9.996910285048144e-06, "loss": 0.0013, "step": 2620, "step_time": 2.0137969519855687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.0746296402066946, "epoch": 0.02621, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006590568809770048, "kl": 0.5442937687039375, "learning_rate": 9.996907893493929e-06, "loss": 0.0017, "num_tokens": 22251325.0, "reward": 0.623846173286438, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.623846173286438, "rewards/rollout_reward_func/std": 0.10217859596014023, "sampling/importance_sampling_ratio/max": 0.9992780089378357, "sampling/importance_sampling_ratio/mean": 0.9940279722213745, "sampling/importance_sampling_ratio/min": 0.9886635541915894, "sampling/sampling_logp_difference/max": 0.007287194952368736, "sampling/sampling_logp_difference/mean": 0.0014265832724049687, "step": 2621, "step_time": 3.908230780994927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07269404456019402, "epoch": 0.02622, "grad_norm": 0.0006481387536041439, "kl": 0.5446363687515259, "learning_rate": 9.99690550101488e-06, "loss": 0.0017, "step": 2622, "step_time": 2.449339251004858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.061524533201009035, "epoch": 0.02623, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045956936082802713, "kl": 0.41313032805919647, "learning_rate": 9.996903107610993e-06, "loss": 0.0012, "num_tokens": 22266093.0, "reward": 0.9040384888648987, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9040384888648987, "rewards/rollout_reward_func/std": 0.2170269638299942, "sampling/importance_sampling_ratio/max": 1.0021322965621948, "sampling/importance_sampling_ratio/mean": 0.9955472946166992, "sampling/importance_sampling_ratio/min": 0.986466646194458, "sampling/sampling_logp_difference/max": 0.010234715417027473, "sampling/sampling_logp_difference/mean": 0.00125885009765625, "step": 2623, "step_time": 3.903186935989652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06086124200373888, "epoch": 0.02624, "grad_norm": 0.0004533917526714504, "kl": 0.41323475167155266, "learning_rate": 9.996900713282273e-06, "loss": 0.0012, "step": 2624, "step_time": 2.008742872014409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 52.0625, "completions/mean_terminated_length": 52.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5369001273065805, "epoch": 0.02625, "frac_reward_zero_std": 0.5, "grad_norm": 0.008627655915915966, "kl": 0.8521233275532722, "learning_rate": 9.996898318028719e-06, "loss": -0.0267, "num_tokens": 22278647.0, "reward": 0.7588461637496948, "reward_std": 0.031547825783491135, "rewards/rollout_reward_func/mean": 0.7588461637496948, "rewards/rollout_reward_func/std": 0.04984418302774429, "sampling/importance_sampling_ratio/max": 0.9990536570549011, "sampling/importance_sampling_ratio/mean": 0.9303842782974243, "sampling/importance_sampling_ratio/min": 0.008358651772141457, "sampling/sampling_logp_difference/max": 2.103696584701538, "sampling/sampling_logp_difference/mean": 0.09750859439373016, "step": 2625, "step_time": 3.9062008390246774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5300543662160635, "epoch": 0.02626, "grad_norm": 0.007792919874191284, "kl": 0.8579233661293983, "learning_rate": 9.99689592185033e-06, "loss": -0.0267, "step": 2626, "step_time": 1.984643871022854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8770614909008145, "epoch": 0.02627, "frac_reward_zero_std": 0.5, "grad_norm": 0.01588408648967743, "kl": 0.5556573122739792, "learning_rate": 9.996893524747107e-06, "loss": -0.0128, "num_tokens": 22294167.0, "reward": 0.9002091288566589, "reward_std": 0.04093829542398453, "rewards/rollout_reward_func/mean": 0.9002091288566589, "rewards/rollout_reward_func/std": 0.12776267528533936, "sampling/importance_sampling_ratio/max": 0.997522234916687, "sampling/importance_sampling_ratio/mean": 0.8999254703521729, "sampling/importance_sampling_ratio/min": 7.045248154857866e-17, "sampling/sampling_logp_difference/max": 2.9901065826416016, "sampling/sampling_logp_difference/mean": 0.2529377043247223, "step": 2627, "step_time": 4.272778278995247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8791748066432774, "epoch": 0.02628, "grad_norm": 0.01663743518292904, "kl": 0.5587641783058643, "learning_rate": 9.996891126719052e-06, "loss": -0.0128, "step": 2628, "step_time": 2.504383436011267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 130.15625, "completions/mean_terminated_length": 130.15625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.8118293615989387, "epoch": 0.02629, "frac_reward_zero_std": 0.5, "grad_norm": 0.009947962127625942, "kl": 0.5897703282535076, "learning_rate": 9.996888727766164e-06, "loss": -0.0456, "num_tokens": 22309124.0, "reward": 0.6353029012680054, "reward_std": 0.032866865396499634, "rewards/rollout_reward_func/mean": 0.6353029012680054, "rewards/rollout_reward_func/std": 0.07023109495639801, "sampling/importance_sampling_ratio/max": 0.9979639649391174, "sampling/importance_sampling_ratio/mean": 0.9327329993247986, "sampling/importance_sampling_ratio/min": 5.341512601706582e-18, "sampling/sampling_logp_difference/max": 3.5998358726501465, "sampling/sampling_logp_difference/mean": 0.3574947118759155, "step": 2629, "step_time": 4.147515788987221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8124162461608648, "epoch": 0.0263, "grad_norm": 0.009975691325962543, "kl": 0.5934752225875854, "learning_rate": 9.996886327888447e-06, "loss": -0.0456, "step": 2630, "step_time": 2.5050785789935617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15521251689642668, "epoch": 0.02631, "frac_reward_zero_std": 0.75, "grad_norm": 0.027459105476737022, "kl": 0.6823059022426605, "learning_rate": 9.996883927085898e-06, "loss": 0.0191, "num_tokens": 22323720.0, "reward": 0.6806923151016235, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.6806923151016235, "rewards/rollout_reward_func/std": 0.2042783796787262, "sampling/importance_sampling_ratio/max": 0.9979903697967529, "sampling/importance_sampling_ratio/mean": 0.9644807577133179, "sampling/importance_sampling_ratio/min": 0.06297822296619415, "sampling/sampling_logp_difference/max": 2.239732503890991, "sampling/sampling_logp_difference/mean": 0.015826907008886337, "step": 2631, "step_time": 4.088016526009596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15582910645753145, "epoch": 0.02632, "grad_norm": 0.02803446166217327, "kl": 0.6824908256530762, "learning_rate": 9.99688152535852e-06, "loss": 0.0191, "step": 2632, "step_time": 2.002357163000852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 182.09677124023438, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.756985034327954, "epoch": 0.02633, "frac_reward_zero_std": 0.5, "grad_norm": 0.010441021993756294, "kl": 0.47572537139058113, "learning_rate": 9.99687912270631e-06, "loss": 0.0115, "num_tokens": 22340288.0, "reward": 0.8939504623413086, "reward_std": 0.0217258520424366, "rewards/rollout_reward_func/mean": 0.8939504623413086, "rewards/rollout_reward_func/std": 0.4072233736515045, "sampling/importance_sampling_ratio/max": 1.0016767978668213, "sampling/importance_sampling_ratio/mean": 0.9336668848991394, "sampling/importance_sampling_ratio/min": 7.094833655253516e-28, "sampling/sampling_logp_difference/max": 15.984912872314453, "sampling/sampling_logp_difference/mean": 0.38216400146484375, "step": 2633, "step_time": 4.365369776001899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7570370743051171, "epoch": 0.02634, "grad_norm": 0.010572131723165512, "kl": 0.4731074497103691, "learning_rate": 9.996876719129273e-06, "loss": 0.0115, "step": 2634, "step_time": 2.539098021989048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.06177482008934021, "epoch": 0.02635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005377031047828496, "kl": 0.4716300331056118, "learning_rate": 9.996874314627408e-06, "loss": 0.0016, "num_tokens": 22356376.0, "reward": 0.9203076958656311, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9203076958656311, "rewards/rollout_reward_func/std": 0.34147951006889343, "sampling/importance_sampling_ratio/max": 0.9992443323135376, "sampling/importance_sampling_ratio/mean": 0.9957976341247559, "sampling/importance_sampling_ratio/min": 0.9925461411476135, "sampling/sampling_logp_difference/max": 0.004803644493222237, "sampling/sampling_logp_difference/mean": 0.0010185872670263052, "step": 2635, "step_time": 4.101997454999946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06272201938554645, "epoch": 0.02636, "grad_norm": 0.0005465629510581493, "kl": 0.4714535251259804, "learning_rate": 9.996871909200713e-06, "loss": 0.0016, "step": 2636, "step_time": 2.530815372003417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 195.71875, "completions/mean_terminated_length": 195.71875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.32724315766245127, "epoch": 0.02637, "frac_reward_zero_std": 0.75, "grad_norm": 0.0044899689964950085, "kl": 0.546218253672123, "learning_rate": 9.996869502849193e-06, "loss": -0.0267, "num_tokens": 22373375.0, "reward": 0.7910769581794739, "reward_std": 0.06255175918340683, "rewards/rollout_reward_func/mean": 0.7910769581794739, "rewards/rollout_reward_func/std": 0.3587552309036255, "sampling/importance_sampling_ratio/max": 0.9969635605812073, "sampling/importance_sampling_ratio/mean": 0.9633147120475769, "sampling/importance_sampling_ratio/min": 8.364965664586776e-11, "sampling/sampling_logp_difference/max": 11.296513557434082, "sampling/sampling_logp_difference/mean": 0.1158103495836258, "step": 2637, "step_time": 4.590070207006647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32765580993145704, "epoch": 0.02638, "grad_norm": 0.004637204110622406, "kl": 0.5455540977418423, "learning_rate": 9.996867095572844e-06, "loss": -0.0267, "step": 2638, "step_time": 2.044715014999383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 216.8125, "completions/mean_terminated_length": 216.8125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.546880035661161, "epoch": 0.02639, "frac_reward_zero_std": 0.5, "grad_norm": 0.04419036582112312, "kl": 0.38535450771450996, "learning_rate": 9.99686468737167e-06, "loss": 0.0078, "num_tokens": 22391049.0, "reward": 1.0778558254241943, "reward_std": 0.03712311014533043, "rewards/rollout_reward_func/mean": 1.0778558254241943, "rewards/rollout_reward_func/std": 0.15393084287643433, "sampling/importance_sampling_ratio/max": 0.9979209899902344, "sampling/importance_sampling_ratio/mean": 0.9348522424697876, "sampling/importance_sampling_ratio/min": 6.659256295449363e-22, "sampling/sampling_logp_difference/max": 4.302419662475586, "sampling/sampling_logp_difference/mean": 0.2303999364376068, "step": 2639, "step_time": 4.414777677993698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5425466052256525, "epoch": 0.0264, "grad_norm": 0.048171572387218475, "kl": 0.3788522332906723, "learning_rate": 9.996862278245671e-06, "loss": 0.0077, "step": 2640, "step_time": 2.058858912998403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2017825241200626, "epoch": 0.02641, "frac_reward_zero_std": 0.75, "grad_norm": 0.006393441930413246, "kl": 0.45293426513671875, "learning_rate": 9.996859868194846e-06, "loss": -0.0268, "num_tokens": 22407925.0, "reward": 0.8225433230400085, "reward_std": 0.01978539302945137, "rewards/rollout_reward_func/mean": 0.8225433230400085, "rewards/rollout_reward_func/std": 0.246819406747818, "sampling/importance_sampling_ratio/max": 0.9981826543807983, "sampling/importance_sampling_ratio/mean": 0.9630680084228516, "sampling/importance_sampling_ratio/min": 0.004504589829593897, "sampling/sampling_logp_difference/max": 2.746792793273926, "sampling/sampling_logp_difference/mean": 0.02926035039126873, "step": 2641, "step_time": 4.211277620008332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1983149121515453, "epoch": 0.02642, "grad_norm": 0.006070268340408802, "kl": 0.45227091386914253, "learning_rate": 9.9968574572192e-06, "loss": -0.0268, "step": 2642, "step_time": 2.5078604230075143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.060978473629802465, "epoch": 0.02643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006276079220697284, "kl": 0.39577533304691315, "learning_rate": 9.996855045318727e-06, "loss": 0.0016, "num_tokens": 22425677.0, "reward": 1.0140769481658936, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0140769481658936, "rewards/rollout_reward_func/std": 0.35900962352752686, "sampling/importance_sampling_ratio/max": 1.0021511316299438, "sampling/importance_sampling_ratio/mean": 0.9943324327468872, "sampling/importance_sampling_ratio/min": 0.984749972820282, "sampling/sampling_logp_difference/max": 0.008218308910727501, "sampling/sampling_logp_difference/mean": 0.0009529876988381147, "step": 2643, "step_time": 4.532463932991959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.061894853599369526, "epoch": 0.02644, "grad_norm": 0.0006439914577640593, "kl": 0.3955981768667698, "learning_rate": 9.996852632493431e-06, "loss": 0.0016, "step": 2644, "step_time": 2.035393088008277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2540976987220347, "epoch": 0.02645, "frac_reward_zero_std": 0.75, "grad_norm": 0.012513983994722366, "kl": 0.5074859783053398, "learning_rate": 9.996850218743313e-06, "loss": 0.0203, "num_tokens": 22441109.0, "reward": 0.6924038529396057, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.6924038529396057, "rewards/rollout_reward_func/std": 0.2638901472091675, "sampling/importance_sampling_ratio/max": 0.9983065724372864, "sampling/importance_sampling_ratio/mean": 0.9642988443374634, "sampling/importance_sampling_ratio/min": 0.024954188615083694, "sampling/sampling_logp_difference/max": 1.6912899017333984, "sampling/sampling_logp_difference/mean": 0.025666648522019386, "step": 2645, "step_time": 4.332247363010538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2530408836901188, "epoch": 0.02646, "grad_norm": 0.012500996701419353, "kl": 0.5067532397806644, "learning_rate": 9.996847804068374e-06, "loss": 0.0203, "step": 2646, "step_time": 1.975260184990475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.06753660552203655, "epoch": 0.02647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005413297330960631, "kl": 0.4798414930701256, "learning_rate": 9.996845388468612e-06, "loss": 0.0014, "num_tokens": 22456549.0, "reward": 0.9023076891899109, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9023076891899109, "rewards/rollout_reward_func/std": 0.223520889878273, "sampling/importance_sampling_ratio/max": 0.9994750022888184, "sampling/importance_sampling_ratio/mean": 0.995449960231781, "sampling/importance_sampling_ratio/min": 0.9901734590530396, "sampling/sampling_logp_difference/max": 0.004789307713508606, "sampling/sampling_logp_difference/mean": 0.0010254322551190853, "step": 2647, "step_time": 4.060133569997561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06871623639017344, "epoch": 0.02648, "grad_norm": 0.0005463158595375717, "kl": 0.47962453216314316, "learning_rate": 9.996842971944032e-06, "loss": 0.0014, "step": 2648, "step_time": 2.44824556200183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.32399784214794636, "epoch": 0.02649, "frac_reward_zero_std": 0.75, "grad_norm": 0.005933037959039211, "kl": 0.4081382602453232, "learning_rate": 9.996840554494628e-06, "loss": -0.0274, "num_tokens": 22472885.0, "reward": 0.6198750138282776, "reward_std": 0.008838837966322899, "rewards/rollout_reward_func/mean": 0.6198750138282776, "rewards/rollout_reward_func/std": 0.319979190826416, "sampling/importance_sampling_ratio/max": 0.9987586140632629, "sampling/importance_sampling_ratio/mean": 0.9634057283401489, "sampling/importance_sampling_ratio/min": 0.00036159323644824326, "sampling/sampling_logp_difference/max": 2.019591808319092, "sampling/sampling_logp_difference/mean": 0.049384672194719315, "step": 2649, "step_time": 4.147495245015307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32133277133107185, "epoch": 0.0265, "grad_norm": 0.006083415821194649, "kl": 0.4078233577311039, "learning_rate": 9.996838136120408e-06, "loss": -0.0274, "step": 2650, "step_time": 2.034700745003647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.49223399441689253, "epoch": 0.02651, "frac_reward_zero_std": 0.5, "grad_norm": 0.009480472654104233, "kl": 0.4988168776035309, "learning_rate": 9.996835716821369e-06, "loss": -0.0079, "num_tokens": 22487437.0, "reward": 0.6336058378219604, "reward_std": 0.2062847912311554, "rewards/rollout_reward_func/mean": 0.6336058378219604, "rewards/rollout_reward_func/std": 0.4138326346874237, "sampling/importance_sampling_ratio/max": 1.0004076957702637, "sampling/importance_sampling_ratio/mean": 0.9317299127578735, "sampling/importance_sampling_ratio/min": 0.002657435368746519, "sampling/sampling_logp_difference/max": 1.983101487159729, "sampling/sampling_logp_difference/mean": 0.06946573406457901, "step": 2651, "step_time": 4.459854989006999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49286184180527925, "epoch": 0.02652, "grad_norm": 0.009361207485198975, "kl": 0.49791907146573067, "learning_rate": 9.996833296597508e-06, "loss": -0.0079, "step": 2652, "step_time": 1.9871417639806168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 139.28125, "completions/mean_terminated_length": 139.28125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2238926663994789, "epoch": 0.02653, "frac_reward_zero_std": 0.75, "grad_norm": 0.015872936695814133, "kl": 0.5760550126433372, "learning_rate": 9.996830875448833e-06, "loss": -0.0263, "num_tokens": 22502606.0, "reward": 0.8122596144676208, "reward_std": 0.029508108273148537, "rewards/rollout_reward_func/mean": 0.8122596144676208, "rewards/rollout_reward_func/std": 0.14847779273986816, "sampling/importance_sampling_ratio/max": 1.0000354051589966, "sampling/importance_sampling_ratio/mean": 0.9639207124710083, "sampling/importance_sampling_ratio/min": 0.03847048431634903, "sampling/sampling_logp_difference/max": 1.4632484912872314, "sampling/sampling_logp_difference/mean": 0.023763010278344154, "step": 2653, "step_time": 4.405162085000484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2251152591779828, "epoch": 0.02654, "grad_norm": 0.015927471220493317, "kl": 0.5761536434292793, "learning_rate": 9.99682845337534e-06, "loss": -0.0263, "step": 2654, "step_time": 2.0196930800011614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 241.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.36364029813557863, "epoch": 0.02655, "frac_reward_zero_std": 0.5, "grad_norm": 0.011538224294781685, "kl": 0.4146093688905239, "learning_rate": 9.996826030377029e-06, "loss": -0.0651, "num_tokens": 22521074.0, "reward": 0.8752644062042236, "reward_std": 0.03906765207648277, "rewards/rollout_reward_func/mean": 0.8752644062042236, "rewards/rollout_reward_func/std": 0.14959745109081268, "sampling/importance_sampling_ratio/max": 1.0015193223953247, "sampling/importance_sampling_ratio/mean": 0.9317338466644287, "sampling/importance_sampling_ratio/min": 0.0016183824045583606, "sampling/sampling_logp_difference/max": 2.613020896911621, "sampling/sampling_logp_difference/mean": 0.05640176683664322, "step": 2655, "step_time": 4.469159005988331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36515672877430916, "epoch": 0.02656, "grad_norm": 0.010931815952062607, "kl": 0.41329460963606834, "learning_rate": 9.996823606453903e-06, "loss": -0.0652, "step": 2656, "step_time": 2.0572877149897977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.20039255917072296, "epoch": 0.02657, "frac_reward_zero_std": 0.75, "grad_norm": 0.005149307195097208, "kl": 0.4663431793451309, "learning_rate": 9.996821181605963e-06, "loss": -0.0173, "num_tokens": 22536364.0, "reward": 0.781125009059906, "reward_std": 0.009110799990594387, "rewards/rollout_reward_func/mean": 0.781125009059906, "rewards/rollout_reward_func/std": 0.19729968905448914, "sampling/importance_sampling_ratio/max": 1.000900149345398, "sampling/importance_sampling_ratio/mean": 0.9649006128311157, "sampling/importance_sampling_ratio/min": 0.026556940749287605, "sampling/sampling_logp_difference/max": 1.6722943782806396, "sampling/sampling_logp_difference/mean": 0.02080225758254528, "step": 2657, "step_time": 4.692480671998055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20017645321786404, "epoch": 0.02658, "grad_norm": 0.005327809136360884, "kl": 0.4639250859618187, "learning_rate": 9.996818755833206e-06, "loss": -0.0173, "step": 2658, "step_time": 2.0584745699961786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5335296308621764, "epoch": 0.02659, "frac_reward_zero_std": 0.5, "grad_norm": 0.013555015437304974, "kl": 0.6364800930023193, "learning_rate": 9.996816329135637e-06, "loss": 0.0019, "num_tokens": 22551224.0, "reward": 0.6880432367324829, "reward_std": 0.018126407638192177, "rewards/rollout_reward_func/mean": 0.6880432367324829, "rewards/rollout_reward_func/std": 0.08262824267148972, "sampling/importance_sampling_ratio/max": 1.000196099281311, "sampling/importance_sampling_ratio/mean": 0.9319915771484375, "sampling/importance_sampling_ratio/min": 0.001044788514263928, "sampling/sampling_logp_difference/max": 1.7813235521316528, "sampling/sampling_logp_difference/mean": 0.09105565398931503, "step": 2659, "step_time": 4.40194775499549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5321973487734795, "epoch": 0.0266, "grad_norm": 0.014017157256603241, "kl": 0.6443082168698311, "learning_rate": 9.996813901513254e-06, "loss": 0.0019, "step": 2660, "step_time": 2.0456042250079918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 99.75, "completions/mean_terminated_length": 99.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07740197889506817, "epoch": 0.02661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005274989525787532, "kl": 0.4599064216017723, "learning_rate": 9.996811472966058e-06, "loss": 0.0011, "num_tokens": 22565208.0, "reward": 0.6980769634246826, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6980769634246826, "rewards/rollout_reward_func/std": 0.1325596272945404, "sampling/importance_sampling_ratio/max": 1.0007528066635132, "sampling/importance_sampling_ratio/mean": 0.9944232702255249, "sampling/importance_sampling_ratio/min": 0.989531934261322, "sampling/sampling_logp_difference/max": 0.005956029519438744, "sampling/sampling_logp_difference/mean": 0.0015302820829674602, "step": 2661, "step_time": 3.5484920339949895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07734445948153734, "epoch": 0.02662, "grad_norm": 0.000528843782376498, "kl": 0.45991597324609756, "learning_rate": 9.99680904349405e-06, "loss": 0.0011, "step": 2662, "step_time": 2.40482366100332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 108.4375, "completions/mean_terminated_length": 108.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5638990113511682, "epoch": 0.02663, "frac_reward_zero_std": 0.5, "grad_norm": 0.006586179602891207, "kl": 0.6662759073078632, "learning_rate": 9.996806613097228e-06, "loss": -0.036, "num_tokens": 22579566.0, "reward": 0.7894542813301086, "reward_std": 0.2466374784708023, "rewards/rollout_reward_func/mean": 0.7894542813301086, "rewards/rollout_reward_func/std": 0.5097038745880127, "sampling/importance_sampling_ratio/max": 0.9991158843040466, "sampling/importance_sampling_ratio/mean": 0.9329713582992554, "sampling/importance_sampling_ratio/min": 0.00022730077034793794, "sampling/sampling_logp_difference/max": 1.8188583850860596, "sampling/sampling_logp_difference/mean": 0.10527672618627548, "step": 2663, "step_time": 3.9690830139879836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.563131389208138, "epoch": 0.02664, "grad_norm": 0.006995593197643757, "kl": 0.6546067446470261, "learning_rate": 9.996804181775596e-06, "loss": -0.036, "step": 2664, "step_time": 2.037164056004258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 165.90625, "completions/mean_terminated_length": 165.90625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3217832129448652, "epoch": 0.02665, "frac_reward_zero_std": 0.75, "grad_norm": 0.003209190210327506, "kl": 0.4645274430513382, "learning_rate": 9.996801749529153e-06, "loss": -0.0175, "num_tokens": 22595499.0, "reward": 0.5561730861663818, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.5561730861663818, "rewards/rollout_reward_func/std": 0.12936171889305115, "sampling/importance_sampling_ratio/max": 0.9995116591453552, "sampling/importance_sampling_ratio/mean": 0.9640644788742065, "sampling/importance_sampling_ratio/min": 0.0033424324356019497, "sampling/sampling_logp_difference/max": 1.5861656665802002, "sampling/sampling_logp_difference/mean": 0.0312919095158577, "step": 2665, "step_time": 4.5526293580041965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3228919431567192, "epoch": 0.02666, "grad_norm": 0.0031649896409362555, "kl": 0.46998219564557076, "learning_rate": 9.996799316357901e-06, "loss": -0.0175, "step": 2666, "step_time": 2.007653702996322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 144.6875, "completions/mean_terminated_length": 144.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23783902637660503, "epoch": 0.02667, "frac_reward_zero_std": 0.75, "grad_norm": 0.012397930026054382, "kl": 0.5297735333442688, "learning_rate": 9.996796882261842e-06, "loss": -0.0168, "num_tokens": 22610865.0, "reward": 0.6826153993606567, "reward_std": 0.024476774036884308, "rewards/rollout_reward_func/mean": 0.6826153993606567, "rewards/rollout_reward_func/std": 0.14047834277153015, "sampling/importance_sampling_ratio/max": 1.0022432804107666, "sampling/importance_sampling_ratio/mean": 0.9652490615844727, "sampling/importance_sampling_ratio/min": 0.043573979288339615, "sampling/sampling_logp_difference/max": 1.617689847946167, "sampling/sampling_logp_difference/mean": 0.025641947984695435, "step": 2667, "step_time": 4.235427755993442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23840058967471123, "epoch": 0.02668, "grad_norm": 0.011773992329835892, "kl": 0.5331183224916458, "learning_rate": 9.996794447240971e-06, "loss": -0.0169, "step": 2668, "step_time": 2.480083005008055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 192.34375, "completions/mean_terminated_length": 192.34375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24445686815306544, "epoch": 0.02669, "frac_reward_zero_std": 0.75, "grad_norm": 0.031229309737682343, "kl": 0.4320248067378998, "learning_rate": 9.996792011295291e-06, "loss": -0.0293, "num_tokens": 22627756.0, "reward": 0.9185624718666077, "reward_std": 0.014676112681627274, "rewards/rollout_reward_func/mean": 0.9185624718666077, "rewards/rollout_reward_func/std": 0.23958390951156616, "sampling/importance_sampling_ratio/max": 0.9979482293128967, "sampling/importance_sampling_ratio/mean": 0.940453290939331, "sampling/importance_sampling_ratio/min": 0.01238967850804329, "sampling/sampling_logp_difference/max": 1.5685957670211792, "sampling/sampling_logp_difference/mean": 0.027863282710313797, "step": 2669, "step_time": 4.44274939999741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24231963930651546, "epoch": 0.0267, "grad_norm": 0.03357522562146187, "kl": 0.43161851167678833, "learning_rate": 9.996789574424806e-06, "loss": -0.0293, "step": 2670, "step_time": 2.0559726919964305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5902386317029595, "epoch": 0.02671, "frac_reward_zero_std": 0.5, "grad_norm": 0.00867509562522173, "kl": 0.592425212264061, "learning_rate": 9.996787136629513e-06, "loss": -0.036, "num_tokens": 22642626.0, "reward": 0.7041490077972412, "reward_std": 0.043745435774326324, "rewards/rollout_reward_func/mean": 0.7041490077972412, "rewards/rollout_reward_func/std": 0.13092979788780212, "sampling/importance_sampling_ratio/max": 1.0014508962631226, "sampling/importance_sampling_ratio/mean": 0.9352465867996216, "sampling/importance_sampling_ratio/min": 7.202819883502659e-14, "sampling/sampling_logp_difference/max": 4.0753960609436035, "sampling/sampling_logp_difference/mean": 0.22302883863449097, "step": 2671, "step_time": 4.366701292012294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5910800658166409, "epoch": 0.02672, "grad_norm": 0.008557550609111786, "kl": 0.5961564406752586, "learning_rate": 9.996784697909414e-06, "loss": -0.036, "step": 2672, "step_time": 2.015378806005174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 103.0, "completions/max_terminated_length": 103.0, "completions/mean_length": 92.09375, "completions/mean_terminated_length": 92.09375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.9308705665171146, "epoch": 0.02673, "frac_reward_zero_std": 0.25, "grad_norm": 0.013482466340065002, "kl": 0.7641698569059372, "learning_rate": 9.99678225826451e-06, "loss": -0.0541, "num_tokens": 22656197.0, "reward": 0.5826923251152039, "reward_std": 0.027196411043405533, "rewards/rollout_reward_func/mean": 0.5826923251152039, "rewards/rollout_reward_func/std": 0.17605243623256683, "sampling/importance_sampling_ratio/max": 0.9996529817581177, "sampling/importance_sampling_ratio/mean": 0.9048470258712769, "sampling/importance_sampling_ratio/min": 2.6723824762044713e-24, "sampling/sampling_logp_difference/max": 9.594253540039062, "sampling/sampling_logp_difference/mean": 0.3907526135444641, "step": 2673, "step_time": 3.7626828409993323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9338519666343927, "epoch": 0.02674, "grad_norm": 0.013853211887180805, "kl": 0.7516141012310982, "learning_rate": 9.996779817694799e-06, "loss": -0.0541, "step": 2674, "step_time": 2.4308197779973852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 152.53125, "completions/mean_terminated_length": 152.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6832407549954951, "epoch": 0.02675, "frac_reward_zero_std": 0.25, "grad_norm": 0.009777368046343327, "kl": 0.6065841615200043, "learning_rate": 9.996777376200286e-06, "loss": -0.0264, "num_tokens": 22671926.0, "reward": 0.7371634840965271, "reward_std": 0.0431063137948513, "rewards/rollout_reward_func/mean": 0.7371634840965271, "rewards/rollout_reward_func/std": 0.3337218463420868, "sampling/importance_sampling_ratio/max": 1.0016547441482544, "sampling/importance_sampling_ratio/mean": 0.903431236743927, "sampling/importance_sampling_ratio/min": 0.000404213962610811, "sampling/sampling_logp_difference/max": 1.9950629472732544, "sampling/sampling_logp_difference/mean": 0.10121209919452667, "step": 2675, "step_time": 3.952535329008242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.687612381298095, "epoch": 0.02676, "grad_norm": 0.009381602518260479, "kl": 0.6013820879161358, "learning_rate": 9.996774933780967e-06, "loss": -0.0264, "step": 2676, "step_time": 2.008809636005026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34015461849048734, "epoch": 0.02677, "frac_reward_zero_std": 0.75, "grad_norm": 0.0059844981878995895, "kl": 0.5498669147491455, "learning_rate": 9.996772490436846e-06, "loss": -0.0077, "num_tokens": 22688392.0, "reward": 0.8209519386291504, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.8209519386291504, "rewards/rollout_reward_func/std": 0.4033139646053314, "sampling/importance_sampling_ratio/max": 1.0084056854248047, "sampling/importance_sampling_ratio/mean": 0.9674928188323975, "sampling/importance_sampling_ratio/min": 0.016738226637244225, "sampling/sampling_logp_difference/max": 1.990975260734558, "sampling/sampling_logp_difference/mean": 0.04343094676733017, "step": 2677, "step_time": 5.020338593007182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34153653960675, "epoch": 0.02678, "grad_norm": 0.006207246799021959, "kl": 0.5473158955574036, "learning_rate": 9.99677004616792e-06, "loss": -0.0077, "step": 2678, "step_time": 2.003937463989132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 125.5, "completions/mean_terminated_length": 125.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05990229221060872, "epoch": 0.02679, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042741064680740237, "kl": 0.5328888446092606, "learning_rate": 9.996767600974195e-06, "loss": 0.0015, "num_tokens": 22703144.0, "reward": 0.863653838634491, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.863653838634491, "rewards/rollout_reward_func/std": 0.17428970336914062, "sampling/importance_sampling_ratio/max": 0.9997617602348328, "sampling/importance_sampling_ratio/mean": 0.9966065287590027, "sampling/importance_sampling_ratio/min": 0.9933505654335022, "sampling/sampling_logp_difference/max": 0.003487084060907364, "sampling/sampling_logp_difference/mean": 0.000896607874892652, "step": 2679, "step_time": 4.025800640993111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0581972636282444, "epoch": 0.0268, "grad_norm": 0.00041852492722682655, "kl": 0.5331636555492878, "learning_rate": 9.996765154855666e-06, "loss": 0.0015, "step": 2680, "step_time": 2.442665016009414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.524248419329524, "epoch": 0.02681, "frac_reward_zero_std": 0.5, "grad_norm": 0.01790509559214115, "kl": 0.8344783186912537, "learning_rate": 9.996762707812337e-06, "loss": -0.0469, "num_tokens": 22718508.0, "reward": 0.7747005224227905, "reward_std": 0.0848880335688591, "rewards/rollout_reward_func/mean": 0.7747005224227905, "rewards/rollout_reward_func/std": 0.3000222444534302, "sampling/importance_sampling_ratio/max": 0.9988771677017212, "sampling/importance_sampling_ratio/mean": 0.9080945253372192, "sampling/importance_sampling_ratio/min": 0.0011577224358916283, "sampling/sampling_logp_difference/max": 1.8858106136322021, "sampling/sampling_logp_difference/mean": 0.08535987883806229, "step": 2681, "step_time": 4.424560679013666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5246228454634547, "epoch": 0.02682, "grad_norm": 0.017136534675955772, "kl": 0.8390968516469002, "learning_rate": 9.996760259844209e-06, "loss": -0.0469, "step": 2682, "step_time": 2.509051149994775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2549042166210711, "epoch": 0.02683, "frac_reward_zero_std": 0.75, "grad_norm": 0.015054656192660332, "kl": 0.5106775276362896, "learning_rate": 9.996757810951281e-06, "loss": 0.0201, "num_tokens": 22733182.0, "reward": 0.5608654022216797, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.5608654022216797, "rewards/rollout_reward_func/std": 0.2135022133588791, "sampling/importance_sampling_ratio/max": 1.0000773668289185, "sampling/importance_sampling_ratio/mean": 0.9663428068161011, "sampling/importance_sampling_ratio/min": 0.023529864847660065, "sampling/sampling_logp_difference/max": 1.6386408805847168, "sampling/sampling_logp_difference/mean": 0.03211858123540878, "step": 2683, "step_time": 3.9312919479925768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2563890074379742, "epoch": 0.02684, "grad_norm": 0.014647158794105053, "kl": 0.5009382963180542, "learning_rate": 9.996755361133552e-06, "loss": 0.0201, "step": 2684, "step_time": 2.0216608890114003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21113791642710567, "epoch": 0.02685, "frac_reward_zero_std": 0.75, "grad_norm": 0.014843268319964409, "kl": 0.550067026168108, "learning_rate": 9.996752910391026e-06, "loss": 0.0196, "num_tokens": 22748730.0, "reward": 0.7653557658195496, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.7653557658195496, "rewards/rollout_reward_func/std": 0.358338326215744, "sampling/importance_sampling_ratio/max": 1.0011184215545654, "sampling/importance_sampling_ratio/mean": 0.9663636684417725, "sampling/importance_sampling_ratio/min": 0.021413274109363556, "sampling/sampling_logp_difference/max": 1.8221280574798584, "sampling/sampling_logp_difference/mean": 0.030886627733707428, "step": 2685, "step_time": 4.145394106999447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21326830005273223, "epoch": 0.02686, "grad_norm": 0.015142525546252728, "kl": 0.5535202100872993, "learning_rate": 9.9967504587237e-06, "loss": 0.0197, "step": 2686, "step_time": 2.4616262650088174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 235.78125, "completions/mean_terminated_length": 235.78125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17923660390079021, "epoch": 0.02687, "frac_reward_zero_std": 0.75, "grad_norm": 0.017723634839057922, "kl": 0.4460650123655796, "learning_rate": 9.996748006131578e-06, "loss": 0.0403, "num_tokens": 22766955.0, "reward": 0.510937511920929, "reward_std": 0.012877500616014004, "rewards/rollout_reward_func/mean": 0.510937511920929, "rewards/rollout_reward_func/std": 0.1644875556230545, "sampling/importance_sampling_ratio/max": 1.0083281993865967, "sampling/importance_sampling_ratio/mean": 0.9664578437805176, "sampling/importance_sampling_ratio/min": 0.004331531934440136, "sampling/sampling_logp_difference/max": 2.7257823944091797, "sampling/sampling_logp_difference/mean": 0.025020450353622437, "step": 2687, "step_time": 4.378614527988248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18132088659331203, "epoch": 0.02688, "grad_norm": 0.01804487220942974, "kl": 0.4453341141343117, "learning_rate": 9.996745552614661e-06, "loss": 0.0403, "step": 2688, "step_time": 2.495335024992528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.14313773810863495, "epoch": 0.02689, "frac_reward_zero_std": 0.5, "grad_norm": 0.34715184569358826, "kl": 0.6152595169842243, "learning_rate": 9.996743098172948e-06, "loss": -0.0122, "num_tokens": 22783357.0, "reward": 0.8400336503982544, "reward_std": 0.16838659346103668, "rewards/rollout_reward_func/mean": 0.8400336503982544, "rewards/rollout_reward_func/std": 0.49873560667037964, "sampling/importance_sampling_ratio/max": 1.0091601610183716, "sampling/importance_sampling_ratio/mean": 0.9514156579971313, "sampling/importance_sampling_ratio/min": 0.09316682815551758, "sampling/sampling_logp_difference/max": 2.4286906719207764, "sampling/sampling_logp_difference/mean": 0.018098587170243263, "step": 2689, "step_time": 4.24785480400169 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.13992365077137947, "epoch": 0.0269, "grad_norm": 0.15481802821159363, "kl": 0.5782671719789505, "learning_rate": 9.996740642806437e-06, "loss": -0.0137, "step": 2690, "step_time": 2.0639594750027754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3110027043148875, "epoch": 0.02691, "frac_reward_zero_std": 0.75, "grad_norm": 0.004250999074429274, "kl": 0.49798303470015526, "learning_rate": 9.996738186515133e-06, "loss": -0.0178, "num_tokens": 22798013.0, "reward": 0.45768123865127563, "reward_std": 0.020700551569461823, "rewards/rollout_reward_func/mean": 0.45768123865127563, "rewards/rollout_reward_func/std": 0.1468804031610489, "sampling/importance_sampling_ratio/max": 1.0010062456130981, "sampling/importance_sampling_ratio/mean": 0.965617299079895, "sampling/importance_sampling_ratio/min": 0.00012098383740521967, "sampling/sampling_logp_difference/max": 2.4760584831237793, "sampling/sampling_logp_difference/mean": 0.054628632962703705, "step": 2691, "step_time": 4.01974383200286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31177784409374, "epoch": 0.02692, "grad_norm": 0.004113879054784775, "kl": 0.49507106840610504, "learning_rate": 9.996735729299035e-06, "loss": -0.0179, "step": 2692, "step_time": 2.459085453992884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.06495766434818506, "epoch": 0.02693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006220475770533085, "kl": 0.5199658609926701, "learning_rate": 9.99673327115814e-06, "loss": 0.0017, "num_tokens": 22814261.0, "reward": 0.5303846001625061, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5303846001625061, "rewards/rollout_reward_func/std": 0.24036969244480133, "sampling/importance_sampling_ratio/max": 1.000016212463379, "sampling/importance_sampling_ratio/mean": 0.995177686214447, "sampling/importance_sampling_ratio/min": 0.988845705986023, "sampling/sampling_logp_difference/max": 0.00895511731505394, "sampling/sampling_logp_difference/mean": 0.0011043415870517492, "step": 2693, "step_time": 4.009771102995728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.068315165117383, "epoch": 0.02694, "grad_norm": 0.0007373208645731211, "kl": 0.5192366242408752, "learning_rate": 9.996730812092456e-06, "loss": 0.0017, "step": 2694, "step_time": 2.4060903540012077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 153.34375, "completions/mean_terminated_length": 153.34375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7093234853819013, "epoch": 0.02695, "frac_reward_zero_std": 0.5, "grad_norm": 0.007540390361100435, "kl": 0.5198792815208435, "learning_rate": 9.996728352101978e-06, "loss": -0.046, "num_tokens": 22829904.0, "reward": 0.7881730794906616, "reward_std": 0.055752649903297424, "rewards/rollout_reward_func/mean": 0.7881730794906616, "rewards/rollout_reward_func/std": 0.2783237099647522, "sampling/importance_sampling_ratio/max": 0.9970198273658752, "sampling/importance_sampling_ratio/mean": 0.9306706190109253, "sampling/importance_sampling_ratio/min": 1.6060720753064347e-18, "sampling/sampling_logp_difference/max": 3.396347999572754, "sampling/sampling_logp_difference/mean": 0.24965953826904297, "step": 2695, "step_time": 4.200580045013339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7072152700275183, "epoch": 0.02696, "grad_norm": 0.008136429823935032, "kl": 0.5180570967495441, "learning_rate": 9.99672589118671e-06, "loss": -0.046, "step": 2696, "step_time": 2.0554390620091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 69.25, "completions/mean_terminated_length": 69.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07866628095507622, "epoch": 0.02697, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046641984954476357, "kl": 0.6989715322852135, "learning_rate": 9.996723429346647e-06, "loss": 0.0015, "num_tokens": 22842832.0, "reward": 0.6673076748847961, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6673076748847961, "rewards/rollout_reward_func/std": 0.1658240407705307, "sampling/importance_sampling_ratio/max": 0.99973464012146, "sampling/importance_sampling_ratio/mean": 0.9951845407485962, "sampling/importance_sampling_ratio/min": 0.9912819862365723, "sampling/sampling_logp_difference/max": 0.0074104368686676025, "sampling/sampling_logp_difference/mean": 0.0014920772518962622, "step": 2697, "step_time": 4.051876139004889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08030449226498604, "epoch": 0.02698, "grad_norm": 0.0004767725185956806, "kl": 0.6986871659755707, "learning_rate": 9.996720966581795e-06, "loss": 0.0015, "step": 2698, "step_time": 1.9800242410128703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.22181745525449514, "epoch": 0.02699, "frac_reward_zero_std": 0.5, "grad_norm": 0.6175949573516846, "kl": 0.445529293268919, "learning_rate": 9.996718502892154e-06, "loss": -0.0322, "num_tokens": 22859040.0, "reward": 0.6671779155731201, "reward_std": 0.01450929045677185, "rewards/rollout_reward_func/mean": 0.6671779155731201, "rewards/rollout_reward_func/std": 0.20253878831863403, "sampling/importance_sampling_ratio/max": 1.0031256675720215, "sampling/importance_sampling_ratio/mean": 0.9512840509414673, "sampling/importance_sampling_ratio/min": 0.018824027851223946, "sampling/sampling_logp_difference/max": 1.9031139612197876, "sampling/sampling_logp_difference/mean": 0.029660996049642563, "step": 2699, "step_time": 4.265824512993277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.23125752341002226, "epoch": 0.027, "grad_norm": 0.010871228761970997, "kl": 0.4613778777420521, "learning_rate": 9.996716038277723e-06, "loss": -0.0341, "step": 2700, "step_time": 2.4295021560174064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 145.03125, "completions/mean_terminated_length": 145.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36665447149425745, "epoch": 0.02701, "frac_reward_zero_std": 0.75, "grad_norm": 0.007060991134494543, "kl": 0.41763637587428093, "learning_rate": 9.996713572738503e-06, "loss": -0.0274, "num_tokens": 22874537.0, "reward": 0.5177115201950073, "reward_std": 0.01115052867680788, "rewards/rollout_reward_func/mean": 0.5177115201950073, "rewards/rollout_reward_func/std": 0.10191415250301361, "sampling/importance_sampling_ratio/max": 0.9980820417404175, "sampling/importance_sampling_ratio/mean": 0.9579682350158691, "sampling/importance_sampling_ratio/min": 0.00042913545621559024, "sampling/sampling_logp_difference/max": 2.0851047039031982, "sampling/sampling_logp_difference/mean": 0.052231431007385254, "step": 2701, "step_time": 3.9503460159976385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35584334284067154, "epoch": 0.02702, "grad_norm": 0.005816637072712183, "kl": 0.4257913716137409, "learning_rate": 9.996711106274495e-06, "loss": -0.0274, "step": 2702, "step_time": 2.027693972995621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.24542652117088437, "epoch": 0.02703, "frac_reward_zero_std": 0.75, "grad_norm": 0.014630456455051899, "kl": 0.5238379463553429, "learning_rate": 9.9967086388857e-06, "loss": -0.0267, "num_tokens": 22890093.0, "reward": 0.9268269538879395, "reward_std": 0.006799111608415842, "rewards/rollout_reward_func/mean": 0.9268269538879395, "rewards/rollout_reward_func/std": 0.21367380023002625, "sampling/importance_sampling_ratio/max": 0.9991259574890137, "sampling/importance_sampling_ratio/mean": 0.9642804861068726, "sampling/importance_sampling_ratio/min": 0.008717359974980354, "sampling/sampling_logp_difference/max": 1.567431926727295, "sampling/sampling_logp_difference/mean": 0.02718648500740528, "step": 2703, "step_time": 4.575289022002835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2493078992702067, "epoch": 0.02704, "grad_norm": 0.010281823575496674, "kl": 0.5241713672876358, "learning_rate": 9.996706170572117e-06, "loss": -0.0268, "step": 2704, "step_time": 1.9988069820028613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.1110850116237998, "epoch": 0.02705, "frac_reward_zero_std": 0.75, "grad_norm": 0.04940864071249962, "kl": 0.5270413607358932, "learning_rate": 9.996703701333748e-06, "loss": -0.0285, "num_tokens": 22906281.0, "reward": 1.052398920059204, "reward_std": 0.010946566238999367, "rewards/rollout_reward_func/mean": 1.052398920059204, "rewards/rollout_reward_func/std": 0.24974755942821503, "sampling/importance_sampling_ratio/max": 0.9985567927360535, "sampling/importance_sampling_ratio/mean": 0.9705297946929932, "sampling/importance_sampling_ratio/min": 0.2132202535867691, "sampling/sampling_logp_difference/max": 1.4704164266586304, "sampling/sampling_logp_difference/mean": 0.008987931534647942, "step": 2705, "step_time": 4.460660605989688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11120141157880425, "epoch": 0.02706, "grad_norm": 0.04473694786429405, "kl": 0.5319979675114155, "learning_rate": 9.996701231170595e-06, "loss": -0.0286, "step": 2706, "step_time": 2.499406403003377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 78.40625, "completions/mean_terminated_length": 78.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9361594198271632, "epoch": 0.02707, "frac_reward_zero_std": 0.25, "grad_norm": 0.07507386058568954, "kl": 0.86802738904953, "learning_rate": 9.996698760082655e-06, "loss": -0.0374, "num_tokens": 22919678.0, "reward": 0.6926971077919006, "reward_std": 0.045169897377491, "rewards/rollout_reward_func/mean": 0.6926971077919006, "rewards/rollout_reward_func/std": 0.15147249400615692, "sampling/importance_sampling_ratio/max": 0.9995819926261902, "sampling/importance_sampling_ratio/mean": 0.8787546753883362, "sampling/importance_sampling_ratio/min": 3.267431661984995e-18, "sampling/sampling_logp_difference/max": 3.872347831726074, "sampling/sampling_logp_difference/mean": 0.38672593235969543, "step": 2707, "step_time": 3.8964407020030194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.9443324720486999, "epoch": 0.02708, "grad_norm": 0.015325671061873436, "kl": 0.8981945514678955, "learning_rate": 9.996696288069931e-06, "loss": -0.0377, "step": 2708, "step_time": 2.031740021004225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 152.15625, "completions/mean_terminated_length": 152.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6004901574924588, "epoch": 0.02709, "frac_reward_zero_std": 0.75, "grad_norm": 0.02360963448882103, "kl": 0.6351150348782539, "learning_rate": 9.996693815132424e-06, "loss": -0.0169, "num_tokens": 22935227.0, "reward": 0.5178846120834351, "reward_std": 0.06255175173282623, "rewards/rollout_reward_func/mean": 0.5178846120834351, "rewards/rollout_reward_func/std": 0.21337229013442993, "sampling/importance_sampling_ratio/max": 1.004520058631897, "sampling/importance_sampling_ratio/mean": 0.9320551156997681, "sampling/importance_sampling_ratio/min": 9.888931782275033e-15, "sampling/sampling_logp_difference/max": 13.356659889221191, "sampling/sampling_logp_difference/mean": 0.24692651629447937, "step": 2709, "step_time": 5.126496724013123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6058622892014682, "epoch": 0.0271, "grad_norm": 0.024174630641937256, "kl": 0.6424708738923073, "learning_rate": 9.996691341270131e-06, "loss": -0.0169, "step": 2710, "step_time": 2.0862683700106572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35409480752423406, "epoch": 0.02711, "frac_reward_zero_std": 0.75, "grad_norm": 0.009755972772836685, "kl": 0.43640242144465446, "learning_rate": 9.996688866483057e-06, "loss": 0.0207, "num_tokens": 22951379.0, "reward": 0.9495673179626465, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.9495673179626465, "rewards/rollout_reward_func/std": 0.320895791053772, "sampling/importance_sampling_ratio/max": 0.9999569654464722, "sampling/importance_sampling_ratio/mean": 0.965405285358429, "sampling/importance_sampling_ratio/min": 5.57311977900099e-05, "sampling/sampling_logp_difference/max": 2.2179083824157715, "sampling/sampling_logp_difference/mean": 0.06053405627608299, "step": 2711, "step_time": 4.4134282960003475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35384020395576954, "epoch": 0.02712, "grad_norm": 0.009552915580570698, "kl": 0.43241118267178535, "learning_rate": 9.996686390771202e-06, "loss": 0.0207, "step": 2712, "step_time": 2.4971388979975018 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07872644439339638, "epoch": 0.02713, "frac_reward_zero_std": 0.75, "grad_norm": 1.3998488187789917, "kl": 0.4993758909404278, "learning_rate": 9.996683914134564e-06, "loss": -0.0134, "num_tokens": 22966879.0, "reward": 0.7841634750366211, "reward_std": 0.018747860565781593, "rewards/rollout_reward_func/mean": 0.7841634750366211, "rewards/rollout_reward_func/std": 0.26918408274650574, "sampling/importance_sampling_ratio/max": 1.1140284538269043, "sampling/importance_sampling_ratio/mean": 0.9871234893798828, "sampling/importance_sampling_ratio/min": 0.8653939962387085, "sampling/sampling_logp_difference/max": 0.1140143871307373, "sampling/sampling_logp_difference/mean": 0.004105774220079184, "step": 2713, "step_time": 4.343015640995873 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.06666666828095913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07708333525806665, "entropy": 0.3108019004575908, "epoch": 0.02714, "grad_norm": 0.03616519644856453, "kl": 0.5235159359872341, "learning_rate": 9.996681436573146e-06, "loss": -0.0162, "step": 2714, "step_time": 2.0585558859966113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07757194060832262, "epoch": 0.02715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005952457431703806, "kl": 0.5985148921608925, "learning_rate": 9.996678958086948e-06, "loss": 0.0016, "num_tokens": 22981367.0, "reward": 0.7289614677429199, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7289614677429199, "rewards/rollout_reward_func/std": 0.22641389071941376, "sampling/importance_sampling_ratio/max": 1.000695824623108, "sampling/importance_sampling_ratio/mean": 0.9946799874305725, "sampling/importance_sampling_ratio/min": 0.988376259803772, "sampling/sampling_logp_difference/max": 0.009852923452854156, "sampling/sampling_logp_difference/mean": 0.001249802648089826, "step": 2715, "step_time": 4.711666717004846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07998078782111406, "epoch": 0.02716, "grad_norm": 0.0006174939917400479, "kl": 0.5980330854654312, "learning_rate": 9.996676478675969e-06, "loss": 0.0016, "step": 2716, "step_time": 2.0572161600139225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 94.25, "completions/mean_terminated_length": 94.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09513426572084427, "epoch": 0.02717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007240201230160892, "kl": 0.6979748979210854, "learning_rate": 9.996673998340211e-06, "loss": 0.0016, "num_tokens": 22995183.0, "reward": 0.906653881072998, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.906653881072998, "rewards/rollout_reward_func/std": 0.25536900758743286, "sampling/importance_sampling_ratio/max": 0.9999824166297913, "sampling/importance_sampling_ratio/mean": 0.9900432825088501, "sampling/importance_sampling_ratio/min": 0.9816091656684875, "sampling/sampling_logp_difference/max": 0.017386961728334427, "sampling/sampling_logp_difference/mean": 0.0025905335787683725, "step": 2717, "step_time": 4.010542236981564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09713564533740282, "epoch": 0.02718, "grad_norm": 0.000723240606021136, "kl": 0.6976684555411339, "learning_rate": 9.996671517079676e-06, "loss": 0.0016, "step": 2718, "step_time": 2.479627472006541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 95.03125, "completions/mean_terminated_length": 95.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8647950924932957, "epoch": 0.02719, "frac_reward_zero_std": 0.5, "grad_norm": 0.007601352408528328, "kl": 0.660183984786272, "learning_rate": 9.996669034894363e-06, "loss": -0.0408, "num_tokens": 23009080.0, "reward": 0.5564903616905212, "reward_std": 0.15167000889778137, "rewards/rollout_reward_func/mean": 0.5564903616905212, "rewards/rollout_reward_func/std": 0.3083479702472687, "sampling/importance_sampling_ratio/max": 1.0012933015823364, "sampling/importance_sampling_ratio/mean": 0.9024208784103394, "sampling/importance_sampling_ratio/min": 0.00035698252031579614, "sampling/sampling_logp_difference/max": 2.2560951709747314, "sampling/sampling_logp_difference/mean": 0.15328681468963623, "step": 2719, "step_time": 3.9035920909882407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8653111383318901, "epoch": 0.0272, "grad_norm": 0.007712604943662882, "kl": 0.6601298190653324, "learning_rate": 9.996666551784273e-06, "loss": -0.0408, "step": 2720, "step_time": 2.0206509020063095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08910503797233105, "epoch": 0.02721, "frac_reward_zero_std": 1.0, "grad_norm": 0.000668404798489064, "kl": 0.5343032963573933, "learning_rate": 9.996664067749407e-06, "loss": 0.0014, "num_tokens": 23023576.0, "reward": 0.7438461780548096, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7438461780548096, "rewards/rollout_reward_func/std": 0.22257956862449646, "sampling/importance_sampling_ratio/max": 1.000862717628479, "sampling/importance_sampling_ratio/mean": 0.9935644865036011, "sampling/importance_sampling_ratio/min": 0.9855075478553772, "sampling/sampling_logp_difference/max": 0.0134788379073143, "sampling/sampling_logp_difference/mean": 0.0015795291401445866, "step": 2721, "step_time": 4.339279270003317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08952723164111376, "epoch": 0.02722, "grad_norm": 0.0006658196216449142, "kl": 0.5342062897980213, "learning_rate": 9.996661582789763e-06, "loss": 0.0014, "step": 2722, "step_time": 2.016419419007434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 126.84375, "completions/mean_terminated_length": 126.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2750026360154152, "epoch": 0.02723, "frac_reward_zero_std": 0.75, "grad_norm": 0.004831962753087282, "kl": 0.46573272347450256, "learning_rate": 9.996659096905345e-06, "loss": -0.0177, "num_tokens": 23038403.0, "reward": 0.8704326748847961, "reward_std": 0.00475937407463789, "rewards/rollout_reward_func/mean": 0.8704326748847961, "rewards/rollout_reward_func/std": 0.2545451521873474, "sampling/importance_sampling_ratio/max": 0.999335765838623, "sampling/importance_sampling_ratio/mean": 0.9643872976303101, "sampling/importance_sampling_ratio/min": 1.2132206350656816e-09, "sampling/sampling_logp_difference/max": 13.32921028137207, "sampling/sampling_logp_difference/mean": 0.12322243303060532, "step": 2723, "step_time": 3.908421415988414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2732934122905135, "epoch": 0.02724, "grad_norm": 0.004971690475940704, "kl": 0.4648440293967724, "learning_rate": 9.996656610096151e-06, "loss": -0.0178, "step": 2724, "step_time": 1.9901501499989536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2782971393316984, "epoch": 0.02725, "frac_reward_zero_std": 0.5, "grad_norm": 0.0162170622497797, "kl": 0.6362428404390812, "learning_rate": 9.996654122362184e-06, "loss": 0.0221, "num_tokens": 23054657.0, "reward": 0.907249927520752, "reward_std": 0.02828427217900753, "rewards/rollout_reward_func/mean": 0.907249927520752, "rewards/rollout_reward_func/std": 0.3061581552028656, "sampling/importance_sampling_ratio/max": 0.997320294380188, "sampling/importance_sampling_ratio/mean": 0.9343242049217224, "sampling/importance_sampling_ratio/min": 0.005922320764511824, "sampling/sampling_logp_difference/max": 2.4591920375823975, "sampling/sampling_logp_difference/mean": 0.04410480335354805, "step": 2725, "step_time": 4.697469863996957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2797149885445833, "epoch": 0.02726, "grad_norm": 0.016115393489599228, "kl": 0.6505373269319534, "learning_rate": 9.996651633703442e-06, "loss": 0.0221, "step": 2726, "step_time": 2.5280524819972925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.8775538336485624, "epoch": 0.02727, "frac_reward_zero_std": 0.5, "grad_norm": 0.006321484223008156, "kl": 0.49634654074907303, "learning_rate": 9.99664914411993e-06, "loss": -0.0317, "num_tokens": 23069759.0, "reward": 0.6858173608779907, "reward_std": 0.015939006581902504, "rewards/rollout_reward_func/mean": 0.6858173608779907, "rewards/rollout_reward_func/std": 0.34776362776756287, "sampling/importance_sampling_ratio/max": 1.00009024143219, "sampling/importance_sampling_ratio/mean": 0.9002168774604797, "sampling/importance_sampling_ratio/min": 1.2584509079260897e-07, "sampling/sampling_logp_difference/max": 3.4910948276519775, "sampling/sampling_logp_difference/mean": 0.19699563086032867, "step": 2727, "step_time": 3.9585614209936466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8769594272598624, "epoch": 0.02728, "grad_norm": 0.006390704307705164, "kl": 0.4974488914012909, "learning_rate": 9.996646653611643e-06, "loss": -0.0317, "step": 2728, "step_time": 2.0107930229933118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.06612668931484222, "epoch": 0.02729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008000316447578371, "kl": 0.4321987219154835, "learning_rate": 9.996644162178585e-06, "loss": 0.0018, "num_tokens": 23087343.0, "reward": 1.0216538906097412, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0216538906097412, "rewards/rollout_reward_func/std": 0.09604562819004059, "sampling/importance_sampling_ratio/max": 0.9988280534744263, "sampling/importance_sampling_ratio/mean": 0.9929448366165161, "sampling/importance_sampling_ratio/min": 0.9861659407615662, "sampling/sampling_logp_difference/max": 0.009282346814870834, "sampling/sampling_logp_difference/mean": 0.001134171849116683, "step": 2729, "step_time": 4.3466493869927945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06710078753530979, "epoch": 0.0273, "grad_norm": 0.0008346653776243329, "kl": 0.43199431151151657, "learning_rate": 9.996641669820757e-06, "loss": 0.0018, "step": 2730, "step_time": 2.038345193002897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.22536705154925585, "epoch": 0.02731, "frac_reward_zero_std": 0.75, "grad_norm": 0.015961378812789917, "kl": 0.4248191602528095, "learning_rate": 9.996639176538158e-06, "loss": 0.0203, "num_tokens": 23105085.0, "reward": 0.6609038710594177, "reward_std": 0.004079463891685009, "rewards/rollout_reward_func/mean": 0.6609038710594177, "rewards/rollout_reward_func/std": 0.32166793942451477, "sampling/importance_sampling_ratio/max": 0.9976879954338074, "sampling/importance_sampling_ratio/mean": 0.962496280670166, "sampling/importance_sampling_ratio/min": 0.019022194668650627, "sampling/sampling_logp_difference/max": 1.665987253189087, "sampling/sampling_logp_difference/mean": 0.02137950249016285, "step": 2731, "step_time": 4.849965046982106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22612209059298038, "epoch": 0.02732, "grad_norm": 0.015841469168663025, "kl": 0.4224916957318783, "learning_rate": 9.99663668233079e-06, "loss": 0.0203, "step": 2732, "step_time": 2.498911735987349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.46109592635184526, "epoch": 0.02733, "frac_reward_zero_std": 0.75, "grad_norm": 0.008322000503540039, "kl": 0.46654947102069855, "learning_rate": 9.996634187198654e-06, "loss": -0.0272, "num_tokens": 23120645.0, "reward": 0.6585096120834351, "reward_std": 0.06187184900045395, "rewards/rollout_reward_func/mean": 0.6585096120834351, "rewards/rollout_reward_func/std": 0.23520874977111816, "sampling/importance_sampling_ratio/max": 0.9971314668655396, "sampling/importance_sampling_ratio/mean": 0.9601944088935852, "sampling/importance_sampling_ratio/min": 7.617593361652908e-11, "sampling/sampling_logp_difference/max": 1.998679757118225, "sampling/sampling_logp_difference/mean": 0.11215490102767944, "step": 2733, "step_time": 4.6872090060060145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4615572318434715, "epoch": 0.02734, "grad_norm": 0.008498350158333778, "kl": 0.4659401997923851, "learning_rate": 9.996631691141746e-06, "loss": -0.0272, "step": 2734, "step_time": 2.1081227619943093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9698970559984446, "epoch": 0.02735, "frac_reward_zero_std": 0.25, "grad_norm": 0.03734579309821129, "kl": 0.4960869029164314, "learning_rate": 9.996629194160071e-06, "loss": -0.0972, "num_tokens": 23137642.0, "reward": 0.6374509930610657, "reward_std": 0.08091948926448822, "rewards/rollout_reward_func/mean": 0.6374509930610657, "rewards/rollout_reward_func/std": 0.2886785864830017, "sampling/importance_sampling_ratio/max": 1.003943681716919, "sampling/importance_sampling_ratio/mean": 0.8430637121200562, "sampling/importance_sampling_ratio/min": 5.07296471496943e-09, "sampling/sampling_logp_difference/max": 2.948639154434204, "sampling/sampling_logp_difference/mean": 0.20305770635604858, "step": 2735, "step_time": 4.610324233995925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9679451826959848, "epoch": 0.02736, "grad_norm": 0.036196932196617126, "kl": 0.5001996830105782, "learning_rate": 9.99662669625363e-06, "loss": -0.0972, "step": 2736, "step_time": 2.561949923001521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 92.5, "completions/mean_terminated_length": 92.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10404209420084953, "epoch": 0.02737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007195822545327246, "kl": 0.6314177960157394, "learning_rate": 9.99662419742242e-06, "loss": 0.0014, "num_tokens": 23151314.0, "reward": 0.7123076915740967, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7123076915740967, "rewards/rollout_reward_func/std": 0.13556495308876038, "sampling/importance_sampling_ratio/max": 0.9987347722053528, "sampling/importance_sampling_ratio/mean": 0.9902284145355225, "sampling/importance_sampling_ratio/min": 0.975382387638092, "sampling/sampling_logp_difference/max": 0.023930471390485764, "sampling/sampling_logp_difference/mean": 0.0029527421575039625, "step": 2737, "step_time": 4.831515999001567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10183268878608942, "epoch": 0.02738, "grad_norm": 0.000717115355655551, "kl": 0.6317832171916962, "learning_rate": 9.996621697666447e-06, "loss": 0.0014, "step": 2738, "step_time": 2.0039966350086615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 173.34375, "completions/mean_terminated_length": 173.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24036816880106926, "epoch": 0.02739, "frac_reward_zero_std": 0.75, "grad_norm": 0.0048406668938696384, "kl": 0.5148421600461006, "learning_rate": 9.996619196985707e-06, "loss": -0.027, "num_tokens": 23167685.0, "reward": 1.0379037857055664, "reward_std": 0.01767767034471035, "rewards/rollout_reward_func/mean": 1.0379037857055664, "rewards/rollout_reward_func/std": 0.26579931378364563, "sampling/importance_sampling_ratio/max": 0.9953587651252747, "sampling/importance_sampling_ratio/mean": 0.958168625831604, "sampling/importance_sampling_ratio/min": 0.00029730433016084135, "sampling/sampling_logp_difference/max": 2.747527837753296, "sampling/sampling_logp_difference/mean": 0.04297034442424774, "step": 2739, "step_time": 4.239766769998823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23863322753459215, "epoch": 0.0274, "grad_norm": 0.004239426925778389, "kl": 0.5182023830711842, "learning_rate": 9.996616695380201e-06, "loss": -0.027, "step": 2740, "step_time": 2.0573384190138313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.47662563249468803, "epoch": 0.02741, "frac_reward_zero_std": 0.75, "grad_norm": 0.007978297770023346, "kl": 0.5874114856123924, "learning_rate": 9.996614192849932e-06, "loss": -0.0228, "num_tokens": 23182793.0, "reward": 0.650048017501831, "reward_std": 0.05192752555012703, "rewards/rollout_reward_func/mean": 0.650048017501831, "rewards/rollout_reward_func/std": 0.27592068910598755, "sampling/importance_sampling_ratio/max": 0.9964809417724609, "sampling/importance_sampling_ratio/mean": 0.9315751791000366, "sampling/importance_sampling_ratio/min": 3.265400300733745e-05, "sampling/sampling_logp_difference/max": 2.010277032852173, "sampling/sampling_logp_difference/mean": 0.09034973382949829, "step": 2741, "step_time": 4.202150149008958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.47532852552831173, "epoch": 0.02742, "grad_norm": 0.007399345748126507, "kl": 0.5810109749436378, "learning_rate": 9.9966116893949e-06, "loss": -0.0228, "step": 2742, "step_time": 2.5322792060032953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 166.03125, "completions/mean_terminated_length": 166.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3630700735375285, "epoch": 0.02743, "frac_reward_zero_std": 0.75, "grad_norm": 0.0049996282905340195, "kl": 0.539530873298645, "learning_rate": 9.996609185015104e-06, "loss": -0.0363, "num_tokens": 23198842.0, "reward": 0.7100528478622437, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.7100528478622437, "rewards/rollout_reward_func/std": 0.37970104813575745, "sampling/importance_sampling_ratio/max": 0.9977351427078247, "sampling/importance_sampling_ratio/mean": 0.9619035720825195, "sampling/importance_sampling_ratio/min": 1.0137973021294953e-14, "sampling/sampling_logp_difference/max": 4.518579483032227, "sampling/sampling_logp_difference/mean": 0.13500219583511353, "step": 2743, "step_time": 4.985789773003489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3611904289573431, "epoch": 0.02744, "grad_norm": 0.004962428472936153, "kl": 0.5398792773485184, "learning_rate": 9.996606679710545e-06, "loss": -0.0363, "step": 2744, "step_time": 2.059523680982238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1260472247377038, "epoch": 0.02745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0085613913834095, "kl": 0.5165872313082218, "learning_rate": 9.996604173481224e-06, "loss": 0.0016, "num_tokens": 23214242.0, "reward": 0.7978076934814453, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7978076934814453, "rewards/rollout_reward_func/std": 0.08251132816076279, "sampling/importance_sampling_ratio/max": 0.9992620944976807, "sampling/importance_sampling_ratio/mean": 0.9706873893737793, "sampling/importance_sampling_ratio/min": 0.19764026999473572, "sampling/sampling_logp_difference/max": 1.618950366973877, "sampling/sampling_logp_difference/mean": 0.011120567098259926, "step": 2745, "step_time": 4.078903566994995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12442674767225981, "epoch": 0.02746, "grad_norm": 0.007974343374371529, "kl": 0.5107441507279873, "learning_rate": 9.996601666327144e-06, "loss": 0.0016, "step": 2746, "step_time": 2.0213483549960074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 229.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.599380330182612, "epoch": 0.02747, "frac_reward_zero_std": 0.0, "grad_norm": 0.021473413333296776, "kl": 0.474961344152689, "learning_rate": 9.996599158248302e-06, "loss": -0.1288, "num_tokens": 23232263.0, "reward": 0.5080304145812988, "reward_std": 0.1661136895418167, "rewards/rollout_reward_func/mean": 0.5080304145812988, "rewards/rollout_reward_func/std": 0.2984205484390259, "sampling/importance_sampling_ratio/max": 1.001548171043396, "sampling/importance_sampling_ratio/mean": 0.7795069217681885, "sampling/importance_sampling_ratio/min": 4.210434822127834e-28, "sampling/sampling_logp_difference/max": 12.255487442016602, "sampling/sampling_logp_difference/mean": 0.4690150022506714, "step": 2747, "step_time": 5.1026041209916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.601916333194822, "epoch": 0.02748, "grad_norm": 0.020891748368740082, "kl": 0.47796642407774925, "learning_rate": 9.9965966492447e-06, "loss": -0.1288, "step": 2748, "step_time": 2.5418390089980676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.060828257352113724, "epoch": 0.02749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005243453779257834, "kl": 0.4581909291446209, "learning_rate": 9.996594139316339e-06, "loss": 0.0015, "num_tokens": 23248391.0, "reward": 0.8189615607261658, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8189615607261658, "rewards/rollout_reward_func/std": 0.24384985864162445, "sampling/importance_sampling_ratio/max": 1.0006619691848755, "sampling/importance_sampling_ratio/mean": 0.9937411546707153, "sampling/importance_sampling_ratio/min": 0.988688588142395, "sampling/sampling_logp_difference/max": 0.007189614698290825, "sampling/sampling_logp_difference/mean": 0.0012178460601717234, "step": 2749, "step_time": 4.9932675330055645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05984366638585925, "epoch": 0.0275, "grad_norm": 0.0005071309860795736, "kl": 0.45835762098431587, "learning_rate": 9.99659162846322e-06, "loss": 0.0015, "step": 2750, "step_time": 2.033550162006577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 155.5625, "completions/mean_terminated_length": 155.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.40546508645638824, "epoch": 0.02751, "frac_reward_zero_std": 0.75, "grad_norm": 0.007536254357546568, "kl": 0.5249487198889256, "learning_rate": 9.996589116685341e-06, "loss": -0.0093, "num_tokens": 23264137.0, "reward": 1.0352884531021118, "reward_std": 0.040920693427324295, "rewards/rollout_reward_func/mean": 1.0352884531021118, "rewards/rollout_reward_func/std": 0.22657731175422668, "sampling/importance_sampling_ratio/max": 0.9981834888458252, "sampling/importance_sampling_ratio/mean": 0.9315243363380432, "sampling/importance_sampling_ratio/min": 0.009298260323703289, "sampling/sampling_logp_difference/max": 2.055396795272827, "sampling/sampling_logp_difference/mean": 0.07294970750808716, "step": 2751, "step_time": 4.1356585010071285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4019841141998768, "epoch": 0.02752, "grad_norm": 0.007380613591521978, "kl": 0.5224223397672176, "learning_rate": 9.996586603982706e-06, "loss": -0.0093, "step": 2752, "step_time": 2.0454337159972056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.31407918594777584, "epoch": 0.02753, "frac_reward_zero_std": 0.5, "grad_norm": 0.02249237708747387, "kl": 0.6658420860767365, "learning_rate": 9.996584090355315e-06, "loss": -0.0537, "num_tokens": 23282661.0, "reward": 0.5738509893417358, "reward_std": 0.07309916615486145, "rewards/rollout_reward_func/mean": 0.5738509893417358, "rewards/rollout_reward_func/std": 0.3036749064922333, "sampling/importance_sampling_ratio/max": 0.9969779849052429, "sampling/importance_sampling_ratio/mean": 0.9051089882850647, "sampling/importance_sampling_ratio/min": 0.01663106121122837, "sampling/sampling_logp_difference/max": 2.34787654876709, "sampling/sampling_logp_difference/mean": 0.047049254179000854, "step": 2753, "step_time": 4.304089504003059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3112522307783365, "epoch": 0.02754, "grad_norm": 0.023589657619595528, "kl": 0.6871524527668953, "learning_rate": 9.996581575803167e-06, "loss": -0.0537, "step": 2754, "step_time": 2.456611078014248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.35379034047946334, "epoch": 0.02755, "frac_reward_zero_std": 0.5, "grad_norm": 0.0106445187702775, "kl": 0.6701091900467873, "learning_rate": 9.996579060326262e-06, "loss": -0.064, "num_tokens": 23298491.0, "reward": 0.7620720863342285, "reward_std": 0.16852258145809174, "rewards/rollout_reward_func/mean": 0.7620720863342285, "rewards/rollout_reward_func/std": 0.3487745523452759, "sampling/importance_sampling_ratio/max": 0.9998890161514282, "sampling/importance_sampling_ratio/mean": 0.933754026889801, "sampling/importance_sampling_ratio/min": 0.00536617124453187, "sampling/sampling_logp_difference/max": 2.327233076095581, "sampling/sampling_logp_difference/mean": 0.052804362028837204, "step": 2755, "step_time": 4.531128029004321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.35434545343741775, "epoch": 0.02756, "grad_norm": 0.009948663413524628, "kl": 0.6683131605386734, "learning_rate": 9.996576543924604e-06, "loss": -0.064, "step": 2756, "step_time": 2.018731235002633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05875801108777523, "epoch": 0.02757, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005299447802826762, "kl": 0.5061671808362007, "learning_rate": 9.99657402659819e-06, "loss": 0.0017, "num_tokens": 23314691.0, "reward": 0.8825384378433228, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8825384378433228, "rewards/rollout_reward_func/std": 0.23360347747802734, "sampling/importance_sampling_ratio/max": 0.9978798031806946, "sampling/importance_sampling_ratio/mean": 0.9932498931884766, "sampling/importance_sampling_ratio/min": 0.9879845976829529, "sampling/sampling_logp_difference/max": 0.00574466772377491, "sampling/sampling_logp_difference/mean": 0.00124573172070086, "step": 2757, "step_time": 4.1138457840061164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.057710100430995226, "epoch": 0.02758, "grad_norm": 0.0005170674412511289, "kl": 0.5063646286725998, "learning_rate": 9.996571508347024e-06, "loss": 0.0017, "step": 2758, "step_time": 2.017812402002164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0640823170542717, "epoch": 0.02759, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005059639224782586, "kl": 0.5536349192261696, "learning_rate": 9.996568989171104e-06, "loss": 0.0016, "num_tokens": 23330179.0, "reward": 0.9093461036682129, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9093461036682129, "rewards/rollout_reward_func/std": 0.10208728909492493, "sampling/importance_sampling_ratio/max": 0.999358057975769, "sampling/importance_sampling_ratio/mean": 0.9956806898117065, "sampling/importance_sampling_ratio/min": 0.9877163767814636, "sampling/sampling_logp_difference/max": 0.011419035494327545, "sampling/sampling_logp_difference/mean": 0.0010530586587265134, "step": 2759, "step_time": 4.207114148004621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06375561514869332, "epoch": 0.0276, "grad_norm": 0.0005053143831901252, "kl": 0.5536380596458912, "learning_rate": 9.99656646907043e-06, "loss": 0.0016, "step": 2760, "step_time": 2.5057036820071517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 116.46875, "completions/mean_terminated_length": 116.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13617740757763386, "epoch": 0.02761, "frac_reward_zero_std": 0.75, "grad_norm": 0.038043733686208725, "kl": 1.1482234001159668, "learning_rate": 9.996563948045006e-06, "loss": -0.0242, "num_tokens": 23344618.0, "reward": 0.8813942074775696, "reward_std": 0.2022053450345993, "rewards/rollout_reward_func/mean": 0.8813942074775696, "rewards/rollout_reward_func/std": 0.4115385413169861, "sampling/importance_sampling_ratio/max": 0.9995397329330444, "sampling/importance_sampling_ratio/mean": 0.9657300710678101, "sampling/importance_sampling_ratio/min": 0.07696688920259476, "sampling/sampling_logp_difference/max": 2.68532133102417, "sampling/sampling_logp_difference/mean": 0.01933843269944191, "step": 2761, "step_time": 4.351228699008061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13619262166321278, "epoch": 0.02762, "grad_norm": 0.04280881583690643, "kl": 1.1853059381246567, "learning_rate": 9.99656142609483e-06, "loss": -0.0241, "step": 2762, "step_time": 2.0373768729987205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 73.75, "completions/mean_terminated_length": 72.83870697021484, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07124929036945105, "epoch": 0.02763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022922419011592865, "kl": 0.6504089161753654, "learning_rate": 9.996558903219904e-06, "loss": 0.0014, "num_tokens": 23357834.0, "reward": 0.7046153545379639, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7046153545379639, "rewards/rollout_reward_func/std": 0.062317632138729095, "sampling/importance_sampling_ratio/max": 0.9993164539337158, "sampling/importance_sampling_ratio/mean": 0.995104968547821, "sampling/importance_sampling_ratio/min": 0.9877119660377502, "sampling/sampling_logp_difference/max": 0.01137450709939003, "sampling/sampling_logp_difference/mean": 0.0014878961956128478, "step": 2763, "step_time": 3.640723344993603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07047978229820728, "epoch": 0.02764, "grad_norm": 0.0021011142525821924, "kl": 0.648107573390007, "learning_rate": 9.996556379420228e-06, "loss": 0.0014, "step": 2764, "step_time": 2.0021406119922176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 108.74193572998047, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6923036156222224, "epoch": 0.02765, "frac_reward_zero_std": 0.5, "grad_norm": 0.023177819326519966, "kl": 0.6870120838284492, "learning_rate": 9.996553854695804e-06, "loss": -0.017, "num_tokens": 23372226.0, "reward": 0.5954567193984985, "reward_std": 0.06058001518249512, "rewards/rollout_reward_func/mean": 0.5954567193984985, "rewards/rollout_reward_func/std": 0.15764930844306946, "sampling/importance_sampling_ratio/max": 0.9985465407371521, "sampling/importance_sampling_ratio/mean": 0.9299458265304565, "sampling/importance_sampling_ratio/min": 5.282451809307531e-08, "sampling/sampling_logp_difference/max": 3.8403708934783936, "sampling/sampling_logp_difference/mean": 0.19337904453277588, "step": 2765, "step_time": 4.084905675998016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6924918787553906, "epoch": 0.02766, "grad_norm": 0.025016795843839645, "kl": 0.6815624721348286, "learning_rate": 9.99655132904663e-06, "loss": -0.0171, "step": 2766, "step_time": 2.9983804690127727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 98.03125, "completions/mean_terminated_length": 98.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25919321086257696, "epoch": 0.02767, "frac_reward_zero_std": 0.75, "grad_norm": 0.015830105170607567, "kl": 0.6835415065288544, "learning_rate": 9.996548802472705e-06, "loss": 0.0201, "num_tokens": 23386187.0, "reward": 0.6534134149551392, "reward_std": 0.0020397298503667116, "rewards/rollout_reward_func/mean": 0.6534134149551392, "rewards/rollout_reward_func/std": 0.09716125577688217, "sampling/importance_sampling_ratio/max": 0.9987673163414001, "sampling/importance_sampling_ratio/mean": 0.964083194732666, "sampling/importance_sampling_ratio/min": 0.02845603972673416, "sampling/sampling_logp_difference/max": 1.7930822372436523, "sampling/sampling_logp_difference/mean": 0.029219523072242737, "step": 2767, "step_time": 3.754941720995703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26196951093152165, "epoch": 0.02768, "grad_norm": 0.015688294544816017, "kl": 0.6773556619882584, "learning_rate": 9.996546274974038e-06, "loss": 0.0201, "step": 2768, "step_time": 2.003653681000287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.176636534743011, "epoch": 0.02769, "frac_reward_zero_std": 0.75, "grad_norm": 0.004419836215674877, "kl": 0.5117769576609135, "learning_rate": 9.99654374655062e-06, "loss": -0.0268, "num_tokens": 23402867.0, "reward": 0.6441202163696289, "reward_std": 0.010429826565086842, "rewards/rollout_reward_func/mean": 0.6441202163696289, "rewards/rollout_reward_func/std": 0.10921290516853333, "sampling/importance_sampling_ratio/max": 0.9997621178627014, "sampling/importance_sampling_ratio/mean": 0.9647377729415894, "sampling/importance_sampling_ratio/min": 0.005435412283986807, "sampling/sampling_logp_difference/max": 2.3491311073303223, "sampling/sampling_logp_difference/mean": 0.023672152310609818, "step": 2769, "step_time": 4.271605783003906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17860253900289536, "epoch": 0.0277, "grad_norm": 0.004465069156140089, "kl": 0.5119581259787083, "learning_rate": 9.996541217202458e-06, "loss": -0.0268, "step": 2770, "step_time": 2.0084779389944742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.07917211297899485, "epoch": 0.02771, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007062627119012177, "kl": 0.4441528469324112, "learning_rate": 9.996538686929548e-06, "loss": 0.0014, "num_tokens": 23418091.0, "reward": 0.6816538572311401, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6816538572311401, "rewards/rollout_reward_func/std": 0.3873514235019684, "sampling/importance_sampling_ratio/max": 1.0023908615112305, "sampling/importance_sampling_ratio/mean": 0.9945849180221558, "sampling/importance_sampling_ratio/min": 0.9847766757011414, "sampling/sampling_logp_difference/max": 0.005745073780417442, "sampling/sampling_logp_difference/mean": 0.0012673498131334782, "step": 2771, "step_time": 4.605045852011244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07999254018068314, "epoch": 0.02772, "grad_norm": 0.0007206589798443019, "kl": 0.4440174624323845, "learning_rate": 9.996536155731895e-06, "loss": 0.0014, "step": 2772, "step_time": 2.4433280939992983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 101.6875, "completions/mean_terminated_length": 101.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3207649649120867, "epoch": 0.02773, "frac_reward_zero_std": 0.75, "grad_norm": 0.008671666495501995, "kl": 0.6726695075631142, "learning_rate": 9.996533623609497e-06, "loss": -0.0177, "num_tokens": 23432201.0, "reward": 0.8009614944458008, "reward_std": 0.02338891476392746, "rewards/rollout_reward_func/mean": 0.8009614944458008, "rewards/rollout_reward_func/std": 0.07278576493263245, "sampling/importance_sampling_ratio/max": 1.0071114301681519, "sampling/importance_sampling_ratio/mean": 0.9652175307273865, "sampling/importance_sampling_ratio/min": 0.0002563710731919855, "sampling/sampling_logp_difference/max": 2.168642997741699, "sampling/sampling_logp_difference/mean": 0.052241481840610504, "step": 2773, "step_time": 4.00834995500918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31980572547763586, "epoch": 0.02774, "grad_norm": 0.008691731840372086, "kl": 0.672718346118927, "learning_rate": 9.996531090562353e-06, "loss": -0.0177, "step": 2774, "step_time": 2.019274537989986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06265119835734367, "epoch": 0.02775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006117411539889872, "kl": 0.45481910929083824, "learning_rate": 9.99652855659047e-06, "loss": 0.0017, "num_tokens": 23448985.0, "reward": 0.5170384645462036, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5170384645462036, "rewards/rollout_reward_func/std": 0.15707902610301971, "sampling/importance_sampling_ratio/max": 0.9988680481910706, "sampling/importance_sampling_ratio/mean": 0.9945786595344543, "sampling/importance_sampling_ratio/min": 0.9889883995056152, "sampling/sampling_logp_difference/max": 0.005764147266745567, "sampling/sampling_logp_difference/mean": 0.0010030288249254227, "step": 2775, "step_time": 4.234280875993136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06320873321965337, "epoch": 0.02776, "grad_norm": 0.000620393780991435, "kl": 0.45471883192658424, "learning_rate": 9.996526021693841e-06, "loss": 0.0017, "step": 2776, "step_time": 2.050579796989041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.90322875976562, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 1.0689132595434785, "epoch": 0.02777, "frac_reward_zero_std": 0.5, "grad_norm": 0.008503029122948647, "kl": 0.5430374965071678, "learning_rate": 9.996523485872474e-06, "loss": -0.0474, "num_tokens": 23463855.0, "reward": 0.5278365612030029, "reward_std": 0.06666567176580429, "rewards/rollout_reward_func/mean": 0.5278365612030029, "rewards/rollout_reward_func/std": 0.1721908301115036, "sampling/importance_sampling_ratio/max": 0.9988596439361572, "sampling/importance_sampling_ratio/mean": 0.8715177774429321, "sampling/importance_sampling_ratio/min": 1.050163645280322e-12, "sampling/sampling_logp_difference/max": 3.098391056060791, "sampling/sampling_logp_difference/mean": 0.31689098477363586, "step": 2777, "step_time": 4.717056496003352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0720494124107063, "epoch": 0.02778, "grad_norm": 0.008446501567959785, "kl": 0.5393920503556728, "learning_rate": 9.996520949126363e-06, "loss": -0.0474, "step": 2778, "step_time": 2.5272267579930485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 148.65625, "completions/mean_terminated_length": 148.65625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.7118686912581325, "epoch": 0.02779, "frac_reward_zero_std": 0.25, "grad_norm": 0.014522457495331764, "kl": 0.7159332595765591, "learning_rate": 9.996518411455511e-06, "loss": -0.0054, "num_tokens": 23479292.0, "reward": 0.6741346120834351, "reward_std": 0.07479013502597809, "rewards/rollout_reward_func/mean": 0.6741346120834351, "rewards/rollout_reward_func/std": 0.31936410069465637, "sampling/importance_sampling_ratio/max": 0.9972346425056458, "sampling/importance_sampling_ratio/mean": 0.9044738411903381, "sampling/importance_sampling_ratio/min": 6.838968500677353e-17, "sampling/sampling_logp_difference/max": 3.6644701957702637, "sampling/sampling_logp_difference/mean": 0.23738762736320496, "step": 2779, "step_time": 4.071092397010943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7085194000974298, "epoch": 0.0278, "grad_norm": 0.014697976410388947, "kl": 0.7231007665395737, "learning_rate": 9.996515872859921e-06, "loss": -0.0053, "step": 2780, "step_time": 2.051072643000225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.25141929974779487, "epoch": 0.02781, "frac_reward_zero_std": 0.75, "grad_norm": 0.005517718847841024, "kl": 0.5033051259815693, "learning_rate": 9.99651333333959e-06, "loss": -0.0175, "num_tokens": 23495056.0, "reward": 0.6937692165374756, "reward_std": 0.00815892405807972, "rewards/rollout_reward_func/mean": 0.6937692165374756, "rewards/rollout_reward_func/std": 0.1346578150987625, "sampling/importance_sampling_ratio/max": 0.9990138411521912, "sampling/importance_sampling_ratio/mean": 0.9651215076446533, "sampling/importance_sampling_ratio/min": 0.0008501664851792157, "sampling/sampling_logp_difference/max": 1.8282618522644043, "sampling/sampling_logp_difference/mean": 0.04432044178247452, "step": 2781, "step_time": 4.154972256998008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24687337270006537, "epoch": 0.02782, "grad_norm": 0.006029742304235697, "kl": 0.5016896948218346, "learning_rate": 9.99651079289452e-06, "loss": -0.0176, "step": 2782, "step_time": 2.080852988001425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.21875, "completions/mean_terminated_length": 173.21875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4412573711015284, "epoch": 0.02783, "frac_reward_zero_std": 0.75, "grad_norm": 0.004162488970905542, "kl": 0.4743279069662094, "learning_rate": 9.996508251524713e-06, "loss": -0.0175, "num_tokens": 23511311.0, "reward": 0.6021826863288879, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.6021826863288879, "rewards/rollout_reward_func/std": 0.34389954805374146, "sampling/importance_sampling_ratio/max": 0.9987913966178894, "sampling/importance_sampling_ratio/mean": 0.9630550146102905, "sampling/importance_sampling_ratio/min": 2.4954337073896456e-16, "sampling/sampling_logp_difference/max": 2.9196200370788574, "sampling/sampling_logp_difference/mean": 0.16139496862888336, "step": 2783, "step_time": 5.236705480005185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.443091890309006, "epoch": 0.02784, "grad_norm": 0.00358836492523551, "kl": 0.4743296764791012, "learning_rate": 9.996505709230169e-06, "loss": -0.0175, "step": 2784, "step_time": 2.0337025160042685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08659407682716846, "epoch": 0.02785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006294918130151927, "kl": 0.5677384659647942, "learning_rate": 9.996503166010887e-06, "loss": 0.0015, "num_tokens": 23526615.0, "reward": 0.5162692070007324, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5162692070007324, "rewards/rollout_reward_func/std": 0.15557335317134857, "sampling/importance_sampling_ratio/max": 0.999068558216095, "sampling/importance_sampling_ratio/mean": 0.9931650161743164, "sampling/importance_sampling_ratio/min": 0.9891804456710815, "sampling/sampling_logp_difference/max": 0.009732376784086227, "sampling/sampling_logp_difference/mean": 0.0018295806366950274, "step": 2785, "step_time": 4.402621703011391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08619137108325958, "epoch": 0.02786, "grad_norm": 0.000614201242569834, "kl": 0.5678902640938759, "learning_rate": 9.99650062186687e-06, "loss": 0.0015, "step": 2786, "step_time": 2.0068302049985505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 209.59375, "completions/mean_terminated_length": 213.09677124023438, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.703540088608861, "epoch": 0.02787, "frac_reward_zero_std": 0.25, "grad_norm": 0.019824769347906113, "kl": 0.6392698027193546, "learning_rate": 9.996498076798115e-06, "loss": -0.0783, "num_tokens": 23543946.0, "reward": 0.8541322350502014, "reward_std": 0.2858922779560089, "rewards/rollout_reward_func/mean": 0.8541322350502014, "rewards/rollout_reward_func/std": 0.5599451065063477, "sampling/importance_sampling_ratio/max": 0.9991092085838318, "sampling/importance_sampling_ratio/mean": 0.8725512027740479, "sampling/importance_sampling_ratio/min": 7.98730681678887e-16, "sampling/sampling_logp_difference/max": 18.604482650756836, "sampling/sampling_logp_difference/mean": 0.29320305585861206, "step": 2787, "step_time": 4.35165728701395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7026001787744462, "epoch": 0.02788, "grad_norm": 0.01925291307270527, "kl": 0.6446103528141975, "learning_rate": 9.996495530804626e-06, "loss": -0.0783, "step": 2788, "step_time": 2.0294300819878117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 91.875, "completions/mean_terminated_length": 91.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22423836961388588, "epoch": 0.02789, "frac_reward_zero_std": 0.75, "grad_norm": 0.03164379671216011, "kl": 0.8207079842686653, "learning_rate": 9.996492983886405e-06, "loss": -0.0428, "num_tokens": 23557742.0, "reward": 0.6965288519859314, "reward_std": 0.018747854977846146, "rewards/rollout_reward_func/mean": 0.6965288519859314, "rewards/rollout_reward_func/std": 0.1433359831571579, "sampling/importance_sampling_ratio/max": 1.0003032684326172, "sampling/importance_sampling_ratio/mean": 0.9399183988571167, "sampling/importance_sampling_ratio/min": 0.1403990089893341, "sampling/sampling_logp_difference/max": 1.958348035812378, "sampling/sampling_logp_difference/mean": 0.02795414999127388, "step": 2789, "step_time": 4.896355599987146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22221263870596886, "epoch": 0.0279, "grad_norm": 0.028298761695623398, "kl": 0.8273889422416687, "learning_rate": 9.996490436043448e-06, "loss": -0.0428, "step": 2790, "step_time": 2.0424675160102197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3084403797984123, "epoch": 0.02791, "frac_reward_zero_std": 0.5, "grad_norm": 0.010594130493700504, "kl": 0.49913446232676506, "learning_rate": 9.996487887275758e-06, "loss": -0.0551, "num_tokens": 23573154.0, "reward": 0.5653509497642517, "reward_std": 0.024028034880757332, "rewards/rollout_reward_func/mean": 0.5653509497642517, "rewards/rollout_reward_func/std": 0.3283768892288208, "sampling/importance_sampling_ratio/max": 1.0016998052597046, "sampling/importance_sampling_ratio/mean": 0.936350405216217, "sampling/importance_sampling_ratio/min": 0.014359906315803528, "sampling/sampling_logp_difference/max": 1.875894546508789, "sampling/sampling_logp_difference/mean": 0.052847929298877716, "step": 2791, "step_time": 4.008486036982504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3078757729381323, "epoch": 0.02792, "grad_norm": 0.0106826052069664, "kl": 0.5006291456520557, "learning_rate": 9.996485337583335e-06, "loss": -0.0551, "step": 2792, "step_time": 2.0428048670000862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.056536976248025894, "epoch": 0.02793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005284610670059919, "kl": 0.4279548041522503, "learning_rate": 9.996482786966182e-06, "loss": 0.0016, "num_tokens": 23590234.0, "reward": 0.7814614772796631, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7814614772796631, "rewards/rollout_reward_func/std": 0.34289246797561646, "sampling/importance_sampling_ratio/max": 0.9992837905883789, "sampling/importance_sampling_ratio/mean": 0.9948991537094116, "sampling/importance_sampling_ratio/min": 0.9872716069221497, "sampling/sampling_logp_difference/max": 0.005108889192342758, "sampling/sampling_logp_difference/mean": 0.0009433198138140142, "step": 2793, "step_time": 4.207065937989682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.055726546328514814, "epoch": 0.02794, "grad_norm": 0.0005124677554704249, "kl": 0.4280865490436554, "learning_rate": 9.996480235424297e-06, "loss": 0.0016, "step": 2794, "step_time": 2.0399428349992377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2364132017828524, "epoch": 0.02795, "frac_reward_zero_std": 0.75, "grad_norm": 0.01044307928532362, "kl": 0.5205507911741734, "learning_rate": 9.99647768295768e-06, "loss": -0.0171, "num_tokens": 23605292.0, "reward": 0.7435576915740967, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.7435576915740967, "rewards/rollout_reward_func/std": 0.2664715051651001, "sampling/importance_sampling_ratio/max": 0.999048113822937, "sampling/importance_sampling_ratio/mean": 0.9668649435043335, "sampling/importance_sampling_ratio/min": 0.03737848624587059, "sampling/sampling_logp_difference/max": 1.5546342134475708, "sampling/sampling_logp_difference/mean": 0.026439819484949112, "step": 2795, "step_time": 4.418928764003795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23612493928521872, "epoch": 0.02796, "grad_norm": 0.010245755314826965, "kl": 0.5209254696965218, "learning_rate": 9.996475129566335e-06, "loss": -0.0171, "step": 2796, "step_time": 2.034791971986124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05593054601922631, "epoch": 0.02797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004135800991207361, "kl": 0.42919355630874634, "learning_rate": 9.99647257525026e-06, "loss": 0.0012, "num_tokens": 23619764.0, "reward": 0.6457692384719849, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6457692384719849, "rewards/rollout_reward_func/std": 0.17605070769786835, "sampling/importance_sampling_ratio/max": 1.000980019569397, "sampling/importance_sampling_ratio/mean": 0.997679591178894, "sampling/importance_sampling_ratio/min": 0.992744505405426, "sampling/sampling_logp_difference/max": 0.003042822703719139, "sampling/sampling_logp_difference/mean": 0.0007405225187540054, "step": 2797, "step_time": 3.7929134400110343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05513191595673561, "epoch": 0.02798, "grad_norm": 0.0004010669363196939, "kl": 0.42932456731796265, "learning_rate": 9.996470020009457e-06, "loss": 0.0012, "step": 2798, "step_time": 2.0064988110025297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15800944669172168, "epoch": 0.02799, "frac_reward_zero_std": 0.75, "grad_norm": 0.011906604282557964, "kl": 0.577845748513937, "learning_rate": 9.996467463843924e-06, "loss": -0.017, "num_tokens": 23635282.0, "reward": 0.6041346192359924, "reward_std": 0.1416933238506317, "rewards/rollout_reward_func/mean": 0.6041346192359924, "rewards/rollout_reward_func/std": 0.43181025981903076, "sampling/importance_sampling_ratio/max": 0.9996726512908936, "sampling/importance_sampling_ratio/mean": 0.9679913520812988, "sampling/importance_sampling_ratio/min": 0.08011497557163239, "sampling/sampling_logp_difference/max": 2.523797035217285, "sampling/sampling_logp_difference/mean": 0.016913481056690216, "step": 2799, "step_time": 3.9890410549996886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15779994474723935, "epoch": 0.028, "grad_norm": 0.010252950713038445, "kl": 0.5940176285803318, "learning_rate": 9.996464906753666e-06, "loss": -0.017, "step": 2800, "step_time": 2.0033705850000842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 141.03125, "completions/mean_terminated_length": 141.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3984394078142941, "epoch": 0.02801, "frac_reward_zero_std": 0.5, "grad_norm": 0.010934270918369293, "kl": 0.6712823137640953, "learning_rate": 9.99646234873868e-06, "loss": -0.0649, "num_tokens": 23650563.0, "reward": 0.45867305994033813, "reward_std": 0.1324465423822403, "rewards/rollout_reward_func/mean": 0.45867305994033813, "rewards/rollout_reward_func/std": 0.3495119512081146, "sampling/importance_sampling_ratio/max": 1.0002706050872803, "sampling/importance_sampling_ratio/mean": 0.9337393045425415, "sampling/importance_sampling_ratio/min": 0.005385465454310179, "sampling/sampling_logp_difference/max": 2.3788228034973145, "sampling/sampling_logp_difference/mean": 0.06491401791572571, "step": 2801, "step_time": 4.583544567998615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39721532398834825, "epoch": 0.02802, "grad_norm": 0.010618617758154869, "kl": 0.683385081589222, "learning_rate": 9.996459789798966e-06, "loss": -0.0649, "step": 2802, "step_time": 2.0469450269956724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 99.6875, "completions/mean_terminated_length": 99.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23527527693659067, "epoch": 0.02803, "frac_reward_zero_std": 0.75, "grad_norm": 0.004039960913360119, "kl": 0.7011864110827446, "learning_rate": 9.99645722993453e-06, "loss": -0.0171, "num_tokens": 23664489.0, "reward": 0.6519230604171753, "reward_std": 0.027196412906050682, "rewards/rollout_reward_func/mean": 0.6519230604171753, "rewards/rollout_reward_func/std": 0.14970862865447998, "sampling/importance_sampling_ratio/max": 0.9989833235740662, "sampling/importance_sampling_ratio/mean": 0.9657567739486694, "sampling/importance_sampling_ratio/min": 0.012714355252683163, "sampling/sampling_logp_difference/max": 1.6974555253982544, "sampling/sampling_logp_difference/mean": 0.03591122850775719, "step": 2803, "step_time": 3.7263159030044335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23344439547508955, "epoch": 0.02804, "grad_norm": 0.0032079436350613832, "kl": 0.7026031985878944, "learning_rate": 9.996454669145367e-06, "loss": -0.0171, "step": 2804, "step_time": 1.9829013419875992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15222844388335943, "epoch": 0.02805, "frac_reward_zero_std": 0.5, "grad_norm": 0.01820010505616665, "kl": 0.6686959601938725, "learning_rate": 9.99645210743148e-06, "loss": -0.0508, "num_tokens": 23682837.0, "reward": 0.9858605861663818, "reward_std": 0.20360594987869263, "rewards/rollout_reward_func/mean": 0.9858605861663818, "rewards/rollout_reward_func/std": 0.49786633253097534, "sampling/importance_sampling_ratio/max": 0.9988412261009216, "sampling/importance_sampling_ratio/mean": 0.9401088953018188, "sampling/importance_sampling_ratio/min": 0.0776023119688034, "sampling/sampling_logp_difference/max": 2.6164345741271973, "sampling/sampling_logp_difference/mean": 0.024159012362360954, "step": 2805, "step_time": 4.59147578500415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15122087113559246, "epoch": 0.02806, "grad_norm": 0.01614519953727722, "kl": 0.6847001239657402, "learning_rate": 9.99644954479287e-06, "loss": -0.0508, "step": 2806, "step_time": 2.500377121003112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.045142633374780416, "epoch": 0.02807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003763781569432467, "kl": 0.42408836632966995, "learning_rate": 9.996446981229536e-06, "loss": 0.0016, "num_tokens": 23699917.0, "reward": 1.031461477279663, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.031461477279663, "rewards/rollout_reward_func/std": 0.35906293988227844, "sampling/importance_sampling_ratio/max": 0.9986950755119324, "sampling/importance_sampling_ratio/mean": 0.9959931373596191, "sampling/importance_sampling_ratio/min": 0.9941573143005371, "sampling/sampling_logp_difference/max": 0.003367062658071518, "sampling/sampling_logp_difference/mean": 0.0007465534145012498, "step": 2807, "step_time": 4.822332513998845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.043835433665663004, "epoch": 0.02808, "grad_norm": 0.0003645262913778424, "kl": 0.4242871552705765, "learning_rate": 9.996444416741479e-06, "loss": 0.0016, "step": 2808, "step_time": 2.020597363996785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 177.28125, "completions/mean_terminated_length": 177.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3038752730935812, "epoch": 0.02809, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036554529797285795, "kl": 0.4696646183729172, "learning_rate": 9.996441851328704e-06, "loss": -0.0176, "num_tokens": 23716382.0, "reward": 0.7486730813980103, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.7486730813980103, "rewards/rollout_reward_func/std": 0.27181482315063477, "sampling/importance_sampling_ratio/max": 1.000246524810791, "sampling/importance_sampling_ratio/mean": 0.9656665325164795, "sampling/importance_sampling_ratio/min": 0.0008941686828620732, "sampling/sampling_logp_difference/max": 1.8951778411865234, "sampling/sampling_logp_difference/mean": 0.053795840591192245, "step": 2809, "step_time": 4.366598167995107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30475507164373994, "epoch": 0.0281, "grad_norm": 0.0033188930246979, "kl": 0.4660811573266983, "learning_rate": 9.996439284991204e-06, "loss": -0.0177, "step": 2810, "step_time": 2.042010550998384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2194461217150092, "epoch": 0.02811, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036649336107075214, "kl": 0.4728129878640175, "learning_rate": 9.996436717728985e-06, "loss": -0.0175, "num_tokens": 23731726.0, "reward": 0.6892980337142944, "reward_std": 0.0088388342410326, "rewards/rollout_reward_func/mean": 0.6892980337142944, "rewards/rollout_reward_func/std": 0.24605877697467804, "sampling/importance_sampling_ratio/max": 1.0012271404266357, "sampling/importance_sampling_ratio/mean": 0.9662003517150879, "sampling/importance_sampling_ratio/min": 0.006756233982741833, "sampling/sampling_logp_difference/max": 2.2548301219940186, "sampling/sampling_logp_difference/mean": 0.03858412057161331, "step": 2811, "step_time": 4.248295096003858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2196121304295957, "epoch": 0.02812, "grad_norm": 0.0033718005288392305, "kl": 0.4765745773911476, "learning_rate": 9.996434149542047e-06, "loss": -0.0175, "step": 2812, "step_time": 2.984282801000518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 173.40625, "completions/mean_terminated_length": 173.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.25200352910906076, "epoch": 0.02813, "frac_reward_zero_std": 0.75, "grad_norm": 0.053253524005413055, "kl": 0.6684430092573166, "learning_rate": 9.996431580430389e-06, "loss": -0.0261, "num_tokens": 23748043.0, "reward": 1.0109808444976807, "reward_std": 0.06119193509221077, "rewards/rollout_reward_func/mean": 1.0109808444976807, "rewards/rollout_reward_func/std": 0.3270070552825928, "sampling/importance_sampling_ratio/max": 0.9982075691223145, "sampling/importance_sampling_ratio/mean": 0.9656017422676086, "sampling/importance_sampling_ratio/min": 1.110115817937185e-06, "sampling/sampling_logp_difference/max": 3.486457347869873, "sampling/sampling_logp_difference/mean": 0.06909884512424469, "step": 2813, "step_time": 4.3619716159955715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2517619039863348, "epoch": 0.02814, "grad_norm": 0.053958382457494736, "kl": 0.6686123162508011, "learning_rate": 9.996429010394011e-06, "loss": -0.0261, "step": 2814, "step_time": 2.0448819740049657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 127.71875, "completions/mean_terminated_length": 127.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3521071751601994, "epoch": 0.02815, "frac_reward_zero_std": 0.5, "grad_norm": 0.017071159556508064, "kl": 0.7057512477040291, "learning_rate": 9.996426439432916e-06, "loss": -0.054, "num_tokens": 23762898.0, "reward": 0.8656225800514221, "reward_std": 0.02217189222574234, "rewards/rollout_reward_func/mean": 0.8656225800514221, "rewards/rollout_reward_func/std": 0.2581659257411957, "sampling/importance_sampling_ratio/max": 0.9988995790481567, "sampling/importance_sampling_ratio/mean": 0.935073971748352, "sampling/importance_sampling_ratio/min": 0.0010892907157540321, "sampling/sampling_logp_difference/max": 2.289822578430176, "sampling/sampling_logp_difference/mean": 0.052381135523319244, "step": 2815, "step_time": 4.195505893992959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3561058333143592, "epoch": 0.02816, "grad_norm": 0.01879170909523964, "kl": 0.7137166820466518, "learning_rate": 9.996423867547105e-06, "loss": -0.054, "step": 2816, "step_time": 2.058135346000199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3499578912742436, "epoch": 0.02817, "frac_reward_zero_std": 0.75, "grad_norm": 0.004193603061139584, "kl": 0.47266729176044464, "learning_rate": 9.996421294736577e-06, "loss": -0.0178, "num_tokens": 23777878.0, "reward": 0.6243269443511963, "reward_std": 0.02583659254014492, "rewards/rollout_reward_func/mean": 0.6243269443511963, "rewards/rollout_reward_func/std": 0.17118896543979645, "sampling/importance_sampling_ratio/max": 1.0003340244293213, "sampling/importance_sampling_ratio/mean": 0.9665881395339966, "sampling/importance_sampling_ratio/min": 6.7925575422123075e-06, "sampling/sampling_logp_difference/max": 2.133876085281372, "sampling/sampling_logp_difference/mean": 0.08409338444471359, "step": 2817, "step_time": 4.068935071998567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3495155335403979, "epoch": 0.02818, "grad_norm": 0.004285204689949751, "kl": 0.4733085408806801, "learning_rate": 9.996418721001332e-06, "loss": -0.0178, "step": 2818, "step_time": 2.9483718919946114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.042809861712157726, "epoch": 0.02819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003344482393004, "kl": 0.43339579179883003, "learning_rate": 9.996416146341373e-06, "loss": 0.0015, "num_tokens": 23793926.0, "reward": 0.7864615321159363, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7864615321159363, "rewards/rollout_reward_func/std": 0.4089212417602539, "sampling/importance_sampling_ratio/max": 0.9989055395126343, "sampling/importance_sampling_ratio/mean": 0.9968743920326233, "sampling/importance_sampling_ratio/min": 0.9942987561225891, "sampling/sampling_logp_difference/max": 0.00257267989218235, "sampling/sampling_logp_difference/mean": 0.0006209478015080094, "step": 2819, "step_time": 4.204608965999796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04337209137156606, "epoch": 0.0282, "grad_norm": 0.00033870135666802526, "kl": 0.4333093389868736, "learning_rate": 9.996413570756697e-06, "loss": 0.0015, "step": 2820, "step_time": 2.0204740280096303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.049893131479620934, "epoch": 0.02821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003809745830949396, "kl": 0.46736178174614906, "learning_rate": 9.996410994247308e-06, "loss": 0.0013, "num_tokens": 23808662.0, "reward": 0.7874231338500977, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7874231338500977, "rewards/rollout_reward_func/std": 0.32482612133026123, "sampling/importance_sampling_ratio/max": 1.000072717666626, "sampling/importance_sampling_ratio/mean": 0.9963798522949219, "sampling/importance_sampling_ratio/min": 0.9909647107124329, "sampling/sampling_logp_difference/max": 0.0038833729922771454, "sampling/sampling_logp_difference/mean": 0.0008390133734792471, "step": 2821, "step_time": 3.91820231701422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.050805858336389065, "epoch": 0.02822, "grad_norm": 0.00039851179462857544, "kl": 0.46723875403404236, "learning_rate": 9.996408416813207e-06, "loss": 0.0013, "step": 2822, "step_time": 1.9748819469896262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0653012627735734, "epoch": 0.02823, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046009026118554175, "kl": 0.6083960384130478, "learning_rate": 9.996405838454392e-06, "loss": 0.0016, "num_tokens": 23823590.0, "reward": 0.8195384740829468, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8195384740829468, "rewards/rollout_reward_func/std": 0.3281361758708954, "sampling/importance_sampling_ratio/max": 0.99985271692276, "sampling/importance_sampling_ratio/mean": 0.9955936074256897, "sampling/importance_sampling_ratio/min": 0.9908435940742493, "sampling/sampling_logp_difference/max": 0.008007088676095009, "sampling/sampling_logp_difference/mean": 0.0011735591106116772, "step": 2823, "step_time": 3.888172940998629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06635478092357516, "epoch": 0.02824, "grad_norm": 0.00046004942851141095, "kl": 0.6082350388169289, "learning_rate": 9.996403259170863e-06, "loss": 0.0016, "step": 2824, "step_time": 2.9684754779955256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 210.46875, "completions/mean_terminated_length": 210.46875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.38544315146282315, "epoch": 0.02825, "frac_reward_zero_std": 0.5, "grad_norm": 0.008861396461725235, "kl": 0.4629935398697853, "learning_rate": 9.996400678962625e-06, "loss": -0.0649, "num_tokens": 23840949.0, "reward": 0.7809326648712158, "reward_std": 0.03358757868409157, "rewards/rollout_reward_func/mean": 0.7809326648712158, "rewards/rollout_reward_func/std": 0.2212732434272766, "sampling/importance_sampling_ratio/max": 1.001681923866272, "sampling/importance_sampling_ratio/mean": 0.9352678656578064, "sampling/importance_sampling_ratio/min": 0.0001777255383785814, "sampling/sampling_logp_difference/max": 2.0194830894470215, "sampling/sampling_logp_difference/mean": 0.05806565284729004, "step": 2825, "step_time": 4.303348324014223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38487242767587304, "epoch": 0.02826, "grad_norm": 0.008960823528468609, "kl": 0.46438823640346527, "learning_rate": 9.996398097829675e-06, "loss": -0.0649, "step": 2826, "step_time": 1.9820895959928748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.060499245300889015, "epoch": 0.02827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003634709573816508, "kl": 0.6416309252381325, "learning_rate": 9.996395515772015e-06, "loss": 0.0014, "num_tokens": 23855093.0, "reward": 0.993384599685669, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.993384599685669, "rewards/rollout_reward_func/std": 0.25335457921028137, "sampling/importance_sampling_ratio/max": 0.9998729825019836, "sampling/importance_sampling_ratio/mean": 0.9961724281311035, "sampling/importance_sampling_ratio/min": 0.9885297417640686, "sampling/sampling_logp_difference/max": 0.010169651359319687, "sampling/sampling_logp_difference/mean": 0.0013016369193792343, "step": 2827, "step_time": 3.777936640995904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06027860986068845, "epoch": 0.02828, "grad_norm": 0.0003632405714597553, "kl": 0.64167071133852, "learning_rate": 9.996392932789646e-06, "loss": 0.0014, "step": 2828, "step_time": 1.9834215819937526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 212.03125, "completions/mean_terminated_length": 212.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5640331287868321, "epoch": 0.02829, "frac_reward_zero_std": 0.5, "grad_norm": 0.013129866681993008, "kl": 0.4812444634735584, "learning_rate": 9.996390348882567e-06, "loss": -0.0747, "num_tokens": 23872558.0, "reward": 0.4694807827472687, "reward_std": 0.04454772546887398, "rewards/rollout_reward_func/mean": 0.4694807827472687, "rewards/rollout_reward_func/std": 0.23546670377254486, "sampling/importance_sampling_ratio/max": 1.0001342296600342, "sampling/importance_sampling_ratio/mean": 0.9340857863426208, "sampling/importance_sampling_ratio/min": 1.638594682162875e-08, "sampling/sampling_logp_difference/max": 3.4983489513397217, "sampling/sampling_logp_difference/mean": 0.1098008081316948, "step": 2829, "step_time": 4.436330202996032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5636595818214118, "epoch": 0.0283, "grad_norm": 0.013693771325051785, "kl": 0.491014514118433, "learning_rate": 9.996387764050779e-06, "loss": -0.0747, "step": 2830, "step_time": 2.5381486609985586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.09877544082701206, "epoch": 0.02831, "frac_reward_zero_std": 0.75, "grad_norm": 0.009164620190858841, "kl": 0.6051015518605709, "learning_rate": 9.996385178294283e-06, "loss": -0.0249, "num_tokens": 23888382.0, "reward": 0.5333173274993896, "reward_std": 0.11898431181907654, "rewards/rollout_reward_func/mean": 0.5333173274993896, "rewards/rollout_reward_func/std": 0.4623725414276123, "sampling/importance_sampling_ratio/max": 0.9994602203369141, "sampling/importance_sampling_ratio/mean": 0.9683762788772583, "sampling/importance_sampling_ratio/min": 0.06377873569726944, "sampling/sampling_logp_difference/max": 2.811610460281372, "sampling/sampling_logp_difference/mean": 0.01548202894628048, "step": 2831, "step_time": 3.9852200870009256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09897889383137226, "epoch": 0.02832, "grad_norm": 0.008924315683543682, "kl": 0.596725307404995, "learning_rate": 9.996382591613081e-06, "loss": -0.0249, "step": 2832, "step_time": 1.9894384740036912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.04261450283229351, "epoch": 0.02833, "frac_reward_zero_std": 1.0, "grad_norm": 0.00031906587537378073, "kl": 0.41202954202890396, "learning_rate": 9.996380004007174e-06, "loss": 0.0013, "num_tokens": 23903678.0, "reward": 0.7688461542129517, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7688461542129517, "rewards/rollout_reward_func/std": 0.21435095369815826, "sampling/importance_sampling_ratio/max": 0.9999280571937561, "sampling/importance_sampling_ratio/mean": 0.99723881483078, "sampling/importance_sampling_ratio/min": 0.9938422441482544, "sampling/sampling_logp_difference/max": 0.002542913891375065, "sampling/sampling_logp_difference/mean": 0.000640487065538764, "step": 2833, "step_time": 4.100197923995438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.042449085507541895, "epoch": 0.02834, "grad_norm": 0.0003230184083804488, "kl": 0.4120398201048374, "learning_rate": 9.996377415476559e-06, "loss": 0.0013, "step": 2834, "step_time": 1.996478705012123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5080962255597115, "epoch": 0.02835, "frac_reward_zero_std": 0.25, "grad_norm": 0.017267432063817978, "kl": 0.5748189948499203, "learning_rate": 9.996374826021238e-06, "loss": -0.0466, "num_tokens": 23920907.0, "reward": 0.6604615449905396, "reward_std": 0.053141795098781586, "rewards/rollout_reward_func/mean": 0.6604615449905396, "rewards/rollout_reward_func/std": 0.32086411118507385, "sampling/importance_sampling_ratio/max": 0.998276948928833, "sampling/importance_sampling_ratio/mean": 0.9032847285270691, "sampling/importance_sampling_ratio/min": 0.0005393979954533279, "sampling/sampling_logp_difference/max": 3.3327102661132812, "sampling/sampling_logp_difference/mean": 0.08009563386440277, "step": 2835, "step_time": 4.244166478005354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5086690196767449, "epoch": 0.02836, "grad_norm": 0.01814369671046734, "kl": 0.5812766812741756, "learning_rate": 9.996372235641215e-06, "loss": -0.0465, "step": 2836, "step_time": 2.9537607670063153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 127.1875, "completions/mean_terminated_length": 127.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.42714707273989916, "epoch": 0.02837, "frac_reward_zero_std": 0.5, "grad_norm": 0.011325634084641933, "kl": 0.552939023822546, "learning_rate": 9.996369644336487e-06, "loss": 0.0013, "num_tokens": 23935769.0, "reward": 0.5430288314819336, "reward_std": 0.019717402756214142, "rewards/rollout_reward_func/mean": 0.5430288314819336, "rewards/rollout_reward_func/std": 0.09048784524202347, "sampling/importance_sampling_ratio/max": 0.9997319579124451, "sampling/importance_sampling_ratio/mean": 0.9346368312835693, "sampling/importance_sampling_ratio/min": 0.004194346722215414, "sampling/sampling_logp_difference/max": 2.2780890464782715, "sampling/sampling_logp_difference/mean": 0.06674284487962723, "step": 2837, "step_time": 3.8938592940030503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42784026730805635, "epoch": 0.02838, "grad_norm": 0.010991547256708145, "kl": 0.5444842949509621, "learning_rate": 9.996367052107054e-06, "loss": 0.0013, "step": 2838, "step_time": 2.0084200269920984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.05188716296106577, "epoch": 0.02839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003759485553018749, "kl": 0.540370300412178, "learning_rate": 9.996364458952922e-06, "loss": 0.0015, "num_tokens": 23950361.0, "reward": 0.6699999570846558, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6699999570846558, "rewards/rollout_reward_func/std": 0.16742642223834991, "sampling/importance_sampling_ratio/max": 0.9994120001792908, "sampling/importance_sampling_ratio/mean": 0.9972800016403198, "sampling/importance_sampling_ratio/min": 0.9948959350585938, "sampling/sampling_logp_difference/max": 0.0033653657883405685, "sampling/sampling_logp_difference/mean": 0.000838071689940989, "step": 2839, "step_time": 3.8008928920098697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.053475662134587765, "epoch": 0.0284, "grad_norm": 0.0003924695774912834, "kl": 0.5400933437049389, "learning_rate": 9.996361864874085e-06, "loss": 0.0015, "step": 2840, "step_time": 1.9909987500213902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27510619442909956, "epoch": 0.02841, "frac_reward_zero_std": 0.75, "grad_norm": 0.01697443798184395, "kl": 0.474158700555563, "learning_rate": 9.996359269870548e-06, "loss": 0.0308, "num_tokens": 23967225.0, "reward": 0.8385913372039795, "reward_std": 0.012007215991616249, "rewards/rollout_reward_func/mean": 0.8385913372039795, "rewards/rollout_reward_func/std": 0.2691197991371155, "sampling/importance_sampling_ratio/max": 1.0008430480957031, "sampling/importance_sampling_ratio/mean": 0.9649621248245239, "sampling/importance_sampling_ratio/min": 2.4911257057169678e-08, "sampling/sampling_logp_difference/max": 3.703537940979004, "sampling/sampling_logp_difference/mean": 0.1002911776304245, "step": 2841, "step_time": 4.611303088997374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2763683055527508, "epoch": 0.02842, "grad_norm": 0.01596931181848049, "kl": 0.47046321257948875, "learning_rate": 9.99635667394231e-06, "loss": 0.0307, "step": 2842, "step_time": 3.0007490899879485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5260878410190344, "epoch": 0.02843, "frac_reward_zero_std": 0.5, "grad_norm": 0.011623906902968884, "kl": 0.6833384409546852, "learning_rate": 9.996354077089372e-06, "loss": -0.0454, "num_tokens": 23983629.0, "reward": 0.7088461518287659, "reward_std": 0.09409958869218826, "rewards/rollout_reward_func/mean": 0.7088461518287659, "rewards/rollout_reward_func/std": 0.3483513593673706, "sampling/importance_sampling_ratio/max": 0.997776985168457, "sampling/importance_sampling_ratio/mean": 0.9332921504974365, "sampling/importance_sampling_ratio/min": 1.1944974858124624e-06, "sampling/sampling_logp_difference/max": 1.8896827697753906, "sampling/sampling_logp_difference/mean": 0.10145695507526398, "step": 2843, "step_time": 4.6520208769943565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5274654333479702, "epoch": 0.02844, "grad_norm": 0.010691276751458645, "kl": 0.6686341613531113, "learning_rate": 9.996351479311734e-06, "loss": -0.0454, "step": 2844, "step_time": 2.055217585999344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4622981557622552, "epoch": 0.02845, "frac_reward_zero_std": 0.5, "grad_norm": 0.012669897638261318, "kl": 0.7823780439794064, "learning_rate": 9.996348880609398e-06, "loss": -0.0451, "num_tokens": 23999883.0, "reward": 0.7405240535736084, "reward_std": 0.0646594762802124, "rewards/rollout_reward_func/mean": 0.7405240535736084, "rewards/rollout_reward_func/std": 0.33282822370529175, "sampling/importance_sampling_ratio/max": 0.9994540810585022, "sampling/importance_sampling_ratio/mean": 0.9341864585876465, "sampling/importance_sampling_ratio/min": 0.003939760383218527, "sampling/sampling_logp_difference/max": 2.5440094470977783, "sampling/sampling_logp_difference/mean": 0.060466647148132324, "step": 2845, "step_time": 4.299506001014379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45882615027949214, "epoch": 0.02846, "grad_norm": 0.012259350158274174, "kl": 0.740753285586834, "learning_rate": 9.996346280982362e-06, "loss": -0.0451, "step": 2846, "step_time": 2.016781598002126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.36584756057709455, "epoch": 0.02847, "frac_reward_zero_std": 0.75, "grad_norm": 0.0037841827142983675, "kl": 0.45151565596461296, "learning_rate": 9.996343680430632e-06, "loss": -0.027, "num_tokens": 24015333.0, "reward": 0.8969711661338806, "reward_std": 0.002039732877165079, "rewards/rollout_reward_func/mean": 0.8969711661338806, "rewards/rollout_reward_func/std": 0.2694844603538513, "sampling/importance_sampling_ratio/max": 0.9994266033172607, "sampling/importance_sampling_ratio/mean": 0.9648618102073669, "sampling/importance_sampling_ratio/min": 5.193266705560973e-09, "sampling/sampling_logp_difference/max": 3.99603271484375, "sampling/sampling_logp_difference/mean": 0.09460482746362686, "step": 2847, "step_time": 4.538233890983975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36674765637144446, "epoch": 0.02848, "grad_norm": 0.003776279976591468, "kl": 0.4515700004994869, "learning_rate": 9.996341078954204e-06, "loss": -0.027, "step": 2848, "step_time": 2.5561395620170515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.09541625250130892, "epoch": 0.02849, "frac_reward_zero_std": 0.75, "grad_norm": 0.022547075524926186, "kl": 0.4214438572525978, "learning_rate": 9.996338476553078e-06, "loss": -0.0325, "num_tokens": 24033949.0, "reward": 0.8555865287780762, "reward_std": 0.11898431181907654, "rewards/rollout_reward_func/mean": 0.8555865287780762, "rewards/rollout_reward_func/std": 0.6323428750038147, "sampling/importance_sampling_ratio/max": 0.9986585974693298, "sampling/importance_sampling_ratio/mean": 0.9672399759292603, "sampling/importance_sampling_ratio/min": 0.10191044211387634, "sampling/sampling_logp_difference/max": 2.4586286544799805, "sampling/sampling_logp_difference/mean": 0.01109723374247551, "step": 2849, "step_time": 4.6455998760065995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09538244595751166, "epoch": 0.0285, "grad_norm": 0.019695285707712173, "kl": 0.42566724866628647, "learning_rate": 9.996335873227257e-06, "loss": -0.0326, "step": 2850, "step_time": 2.0204165059985826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6423941645771265, "epoch": 0.02851, "frac_reward_zero_std": 0.5, "grad_norm": 0.02305540256202221, "kl": 0.670601449906826, "learning_rate": 9.996333268976742e-06, "loss": -0.0424, "num_tokens": 24048631.0, "reward": 0.6651923656463623, "reward_std": 0.04079461842775345, "rewards/rollout_reward_func/mean": 0.6651923656463623, "rewards/rollout_reward_func/std": 0.26295217871665955, "sampling/importance_sampling_ratio/max": 0.998809278011322, "sampling/importance_sampling_ratio/mean": 0.905366837978363, "sampling/importance_sampling_ratio/min": 0.0005067368620075285, "sampling/sampling_logp_difference/max": 2.0609874725341797, "sampling/sampling_logp_difference/mean": 0.11728577315807343, "step": 2851, "step_time": 3.9092691390032996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6417262833565474, "epoch": 0.02852, "grad_norm": 0.02344045229256153, "kl": 0.6679545789957047, "learning_rate": 9.996330663801531e-06, "loss": -0.0424, "step": 2852, "step_time": 2.072614594006154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 153.6875, "completions/mean_terminated_length": 153.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.42611862532794476, "epoch": 0.02853, "frac_reward_zero_std": 0.5, "grad_norm": 0.008781159296631813, "kl": 0.7899556197226048, "learning_rate": 9.996328057701627e-06, "loss": -0.0338, "num_tokens": 24064373.0, "reward": 0.7197595834732056, "reward_std": 0.17201730608940125, "rewards/rollout_reward_func/mean": 0.7197595834732056, "rewards/rollout_reward_func/std": 0.42262890934944153, "sampling/importance_sampling_ratio/max": 0.9974513649940491, "sampling/importance_sampling_ratio/mean": 0.9336051940917969, "sampling/importance_sampling_ratio/min": 0.010259617120027542, "sampling/sampling_logp_difference/max": 2.6356358528137207, "sampling/sampling_logp_difference/mean": 0.05827789008617401, "step": 2853, "step_time": 4.585858300997643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4232726339250803, "epoch": 0.02854, "grad_norm": 0.008582782931625843, "kl": 0.8004347458481789, "learning_rate": 9.996325450677032e-06, "loss": -0.0338, "step": 2854, "step_time": 2.0279125639935955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 55.5625, "completions/mean_terminated_length": 57.290321350097656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3105762042105198, "epoch": 0.02855, "frac_reward_zero_std": 0.75, "grad_norm": 0.023217100650072098, "kl": 0.7302508428692818, "learning_rate": 9.996322842727743e-06, "loss": -0.008, "num_tokens": 24077063.0, "reward": 0.5632211565971375, "reward_std": 0.03331560641527176, "rewards/rollout_reward_func/mean": 0.5632211565971375, "rewards/rollout_reward_func/std": 0.20553268492221832, "sampling/importance_sampling_ratio/max": 0.9979605674743652, "sampling/importance_sampling_ratio/mean": 0.959542989730835, "sampling/importance_sampling_ratio/min": 0.008459669537842274, "sampling/sampling_logp_difference/max": 2.193178176879883, "sampling/sampling_logp_difference/mean": 0.05200972780585289, "step": 2855, "step_time": 3.450590933993226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3075454290956259, "epoch": 0.02856, "grad_norm": 0.023156458511948586, "kl": 0.7311054989695549, "learning_rate": 9.99632023385376e-06, "loss": -0.008, "step": 2856, "step_time": 2.004751792002935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 143.03125, "completions/mean_terminated_length": 143.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4512471240013838, "epoch": 0.02857, "frac_reward_zero_std": 0.5, "grad_norm": 0.008844990283250809, "kl": 0.6262630745768547, "learning_rate": 9.99631762405509e-06, "loss": -0.0456, "num_tokens": 24092464.0, "reward": 0.7584615349769592, "reward_std": 0.03154784068465233, "rewards/rollout_reward_func/mean": 0.7584615349769592, "rewards/rollout_reward_func/std": 0.09522439539432526, "sampling/importance_sampling_ratio/max": 0.9991978406906128, "sampling/importance_sampling_ratio/mean": 0.9346902370452881, "sampling/importance_sampling_ratio/min": 0.0014299413887783885, "sampling/sampling_logp_difference/max": 1.777132511138916, "sampling/sampling_logp_difference/mean": 0.06397385895252228, "step": 2857, "step_time": 3.877932754003268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45205450570210814, "epoch": 0.02858, "grad_norm": 0.008496955968439579, "kl": 0.6232693940401077, "learning_rate": 9.996315013331728e-06, "loss": -0.0456, "step": 2858, "step_time": 1.9505541430044104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 153.34375, "completions/mean_terminated_length": 153.34375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.9030338576994836, "epoch": 0.02859, "frac_reward_zero_std": 0.0, "grad_norm": 0.028856467455625534, "kl": 1.0386765636503696, "learning_rate": 9.996312401683675e-06, "loss": -0.0519, "num_tokens": 24108227.0, "reward": 0.854067325592041, "reward_std": 0.13976237177848816, "rewards/rollout_reward_func/mean": 0.854067325592041, "rewards/rollout_reward_func/std": 0.3576061725616455, "sampling/importance_sampling_ratio/max": 0.9992968440055847, "sampling/importance_sampling_ratio/mean": 0.8750829100608826, "sampling/importance_sampling_ratio/min": 8.307673482477185e-08, "sampling/sampling_logp_difference/max": 4.085742950439453, "sampling/sampling_logp_difference/mean": 0.18579642474651337, "step": 2859, "step_time": 4.432802970004559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9047076399438083, "epoch": 0.0286, "grad_norm": 0.02548728510737419, "kl": 1.0214149691164494, "learning_rate": 9.996309789110934e-06, "loss": -0.0519, "step": 2860, "step_time": 2.4786029959941516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.9126525144092739, "epoch": 0.02861, "frac_reward_zero_std": 0.0, "grad_norm": 0.012167985551059246, "kl": 0.8153339326381683, "learning_rate": 9.996307175613505e-06, "loss": -0.1132, "num_tokens": 24124865.0, "reward": 0.7117966413497925, "reward_std": 0.2212017923593521, "rewards/rollout_reward_func/mean": 0.7117966413497925, "rewards/rollout_reward_func/std": 0.49724850058555603, "sampling/importance_sampling_ratio/max": 0.9978927969932556, "sampling/importance_sampling_ratio/mean": 0.8433480262756348, "sampling/importance_sampling_ratio/min": 1.0078062516460928e-12, "sampling/sampling_logp_difference/max": 13.321612358093262, "sampling/sampling_logp_difference/mean": 0.23305359482765198, "step": 2861, "step_time": 4.264007555997523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9117533299140632, "epoch": 0.02862, "grad_norm": 0.012213381938636303, "kl": 0.8086026683449745, "learning_rate": 9.996304561191386e-06, "loss": -0.1132, "step": 2862, "step_time": 2.0674693810142344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 118.71875, "completions/mean_terminated_length": 118.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2196096107363701, "epoch": 0.02863, "frac_reward_zero_std": 0.75, "grad_norm": 0.006320393178611994, "kl": 0.6696459501981735, "learning_rate": 9.996301945844583e-06, "loss": -0.0366, "num_tokens": 24139608.0, "reward": 0.791495144367218, "reward_std": 0.21040505170822144, "rewards/rollout_reward_func/mean": 0.791495144367218, "rewards/rollout_reward_func/std": 0.4358612895011902, "sampling/importance_sampling_ratio/max": 1.0007386207580566, "sampling/importance_sampling_ratio/mean": 0.9636046886444092, "sampling/importance_sampling_ratio/min": 0.004565995652228594, "sampling/sampling_logp_difference/max": 2.325451612472534, "sampling/sampling_logp_difference/mean": 0.036712441593408585, "step": 2863, "step_time": 4.033570074992895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21902607940137386, "epoch": 0.02864, "grad_norm": 0.005976811982691288, "kl": 0.6611060760915279, "learning_rate": 9.996299329573091e-06, "loss": -0.0366, "step": 2864, "step_time": 2.05008785201062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.17788933543488383, "epoch": 0.02865, "frac_reward_zero_std": 0.75, "grad_norm": 0.004449869971722364, "kl": 0.5313479416072369, "learning_rate": 9.996296712376915e-06, "loss": -0.0171, "num_tokens": 24155912.0, "reward": 0.6654038429260254, "reward_std": 0.006799102760851383, "rewards/rollout_reward_func/mean": 0.6654038429260254, "rewards/rollout_reward_func/std": 0.14198341965675354, "sampling/importance_sampling_ratio/max": 1.00296950340271, "sampling/importance_sampling_ratio/mean": 0.9666757583618164, "sampling/importance_sampling_ratio/min": 0.018015962094068527, "sampling/sampling_logp_difference/max": 1.8894344568252563, "sampling/sampling_logp_difference/mean": 0.021560512483119965, "step": 2865, "step_time": 4.234020321993739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17956163780763745, "epoch": 0.02866, "grad_norm": 0.004313464276492596, "kl": 0.5320828557014465, "learning_rate": 9.99629409425605e-06, "loss": -0.0171, "step": 2866, "step_time": 2.5406103990026168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 173.78125, "completions/mean_terminated_length": 173.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.442434411495924, "epoch": 0.02867, "frac_reward_zero_std": 0.75, "grad_norm": 0.00500009348616004, "kl": 0.5826865583658218, "learning_rate": 9.996291475210505e-06, "loss": -0.0174, "num_tokens": 24172185.0, "reward": 0.881206750869751, "reward_std": 0.04347347095608711, "rewards/rollout_reward_func/mean": 0.881206750869751, "rewards/rollout_reward_func/std": 0.41634756326675415, "sampling/importance_sampling_ratio/max": 0.9998034238815308, "sampling/importance_sampling_ratio/mean": 0.9640476107597351, "sampling/importance_sampling_ratio/min": 7.28319282643497e-05, "sampling/sampling_logp_difference/max": 2.1013126373291016, "sampling/sampling_logp_difference/mean": 0.09538116306066513, "step": 2867, "step_time": 4.571272040986514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4420313024893403, "epoch": 0.02868, "grad_norm": 0.004989486187696457, "kl": 0.5825344994664192, "learning_rate": 9.996288855240273e-06, "loss": -0.0174, "step": 2868, "step_time": 2.0640547309958492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 130.84375, "completions/mean_terminated_length": 130.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.33349688071757555, "epoch": 0.02869, "frac_reward_zero_std": 0.75, "grad_norm": 0.006686633452773094, "kl": 0.6479741334915161, "learning_rate": 9.996286234345358e-06, "loss": -0.0175, "num_tokens": 24187196.0, "reward": 0.781653881072998, "reward_std": 0.038074977695941925, "rewards/rollout_reward_func/mean": 0.781653881072998, "rewards/rollout_reward_func/std": 0.08937087655067444, "sampling/importance_sampling_ratio/max": 0.9979807734489441, "sampling/importance_sampling_ratio/mean": 0.9627150297164917, "sampling/importance_sampling_ratio/min": 5.689682438969612e-05, "sampling/sampling_logp_difference/max": 1.8068342208862305, "sampling/sampling_logp_difference/mean": 0.05063556879758835, "step": 2869, "step_time": 4.401831932002096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33148053707554936, "epoch": 0.0287, "grad_norm": 0.006137220188975334, "kl": 0.6410280764102936, "learning_rate": 9.996283612525762e-06, "loss": -0.0175, "step": 2870, "step_time": 2.0822752930034767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 248.0625, "completions/mean_terminated_length": 248.0625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.3254775023087859, "epoch": 0.02871, "frac_reward_zero_std": 0.75, "grad_norm": 0.01655939221382141, "kl": 0.42367206141352654, "learning_rate": 9.996280989781482e-06, "loss": -0.0364, "num_tokens": 24205902.0, "reward": 0.9069072008132935, "reward_std": 0.010270723141729832, "rewards/rollout_reward_func/mean": 0.9069072008132935, "rewards/rollout_reward_func/std": 0.37518900632858276, "sampling/importance_sampling_ratio/max": 0.9988215565681458, "sampling/importance_sampling_ratio/mean": 0.9646823406219482, "sampling/importance_sampling_ratio/min": 3.745346494000225e-16, "sampling/sampling_logp_difference/max": 3.15041446685791, "sampling/sampling_logp_difference/mean": 0.12543685734272003, "step": 2871, "step_time": 5.791798043013841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.32542898366227746, "epoch": 0.02872, "grad_norm": 0.01676410809159279, "kl": 0.42388858273625374, "learning_rate": 9.996278366112522e-06, "loss": -0.0364, "step": 2872, "step_time": 2.0795848710040445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24609630834311247, "epoch": 0.02873, "frac_reward_zero_std": 0.75, "grad_norm": 0.0057786693796515465, "kl": 0.4028831571340561, "learning_rate": 9.99627574151888e-06, "loss": -0.0367, "num_tokens": 24223766.0, "reward": 0.8411442041397095, "reward_std": 0.008838832378387451, "rewards/rollout_reward_func/mean": 0.8411442041397095, "rewards/rollout_reward_func/std": 0.43372833728790283, "sampling/importance_sampling_ratio/max": 0.9993900060653687, "sampling/importance_sampling_ratio/mean": 0.9640260934829712, "sampling/importance_sampling_ratio/min": 0.000833811005577445, "sampling/sampling_logp_difference/max": 2.0090537071228027, "sampling/sampling_logp_difference/mean": 0.0382055826485157, "step": 2873, "step_time": 4.523331528005656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24522489728406072, "epoch": 0.02874, "grad_norm": 0.005637546069920063, "kl": 0.40254100039601326, "learning_rate": 9.996273116000558e-06, "loss": -0.0367, "step": 2874, "step_time": 2.076383049003198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7109483340755105, "epoch": 0.02875, "frac_reward_zero_std": 0.25, "grad_norm": 0.012052916921675205, "kl": 0.4967195428907871, "learning_rate": 9.996270489557558e-06, "loss": -0.0371, "num_tokens": 24240962.0, "reward": 0.7758365869522095, "reward_std": 0.05262507498264313, "rewards/rollout_reward_func/mean": 0.7758365869522095, "rewards/rollout_reward_func/std": 0.24786800146102905, "sampling/importance_sampling_ratio/max": 1.0007187128067017, "sampling/importance_sampling_ratio/mean": 0.9036271572113037, "sampling/importance_sampling_ratio/min": 9.042253376392747e-22, "sampling/sampling_logp_difference/max": 4.164237022399902, "sampling/sampling_logp_difference/mean": 0.23747225105762482, "step": 2875, "step_time": 4.445139184004802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7080762656405568, "epoch": 0.02876, "grad_norm": 0.012623663991689682, "kl": 0.49720096215605736, "learning_rate": 9.99626786218988e-06, "loss": -0.0372, "step": 2876, "step_time": 2.0330235509973136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 122.4375, "completions/mean_terminated_length": 122.4375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2136619407683611, "epoch": 0.02877, "frac_reward_zero_std": 0.75, "grad_norm": 0.02315022051334381, "kl": 0.46942660957574844, "learning_rate": 9.996265233897522e-06, "loss": 0.0192, "num_tokens": 24255616.0, "reward": 0.5002884864807129, "reward_std": 0.004079463891685009, "rewards/rollout_reward_func/mean": 0.5002884864807129, "rewards/rollout_reward_func/std": 0.1026880145072937, "sampling/importance_sampling_ratio/max": 0.9983274340629578, "sampling/importance_sampling_ratio/mean": 0.9649332761764526, "sampling/importance_sampling_ratio/min": 0.043081458657979965, "sampling/sampling_logp_difference/max": 1.4448728561401367, "sampling/sampling_logp_difference/mean": 0.026863064616918564, "step": 2877, "step_time": 4.429395676022978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21195145742967725, "epoch": 0.02878, "grad_norm": 0.02457941509783268, "kl": 0.46373045444488525, "learning_rate": 9.996262604680486e-06, "loss": 0.0191, "step": 2878, "step_time": 2.002518161003536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05139204952865839, "epoch": 0.02879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004150020831730217, "kl": 0.43324465677142143, "learning_rate": 9.996259974538774e-06, "loss": 0.0013, "num_tokens": 24270816.0, "reward": 0.9099999666213989, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9099999666213989, "rewards/rollout_reward_func/std": 0.17827491462230682, "sampling/importance_sampling_ratio/max": 0.9998012185096741, "sampling/importance_sampling_ratio/mean": 0.9961061477661133, "sampling/importance_sampling_ratio/min": 0.9931575655937195, "sampling/sampling_logp_difference/max": 0.003204755485057831, "sampling/sampling_logp_difference/mean": 0.0009185365634039044, "step": 2879, "step_time": 3.8902120369893964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.052149164490401745, "epoch": 0.0288, "grad_norm": 0.0004229525220580399, "kl": 0.4331078641116619, "learning_rate": 9.996257343472384e-06, "loss": 0.0013, "step": 2880, "step_time": 2.0091199500020593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 78.3125, "completions/mean_terminated_length": 78.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.27230835892260075, "epoch": 0.02881, "frac_reward_zero_std": 0.75, "grad_norm": 0.004377954173833132, "kl": 0.655513696372509, "learning_rate": 9.996254711481321e-06, "loss": -0.0174, "num_tokens": 24284234.0, "reward": 0.6488461494445801, "reward_std": 0.010878566652536392, "rewards/rollout_reward_func/mean": 0.6488461494445801, "rewards/rollout_reward_func/std": 0.12048573791980743, "sampling/importance_sampling_ratio/max": 1.0018887519836426, "sampling/importance_sampling_ratio/mean": 0.9648309946060181, "sampling/importance_sampling_ratio/min": 0.021305082365870476, "sampling/sampling_logp_difference/max": 1.6340608596801758, "sampling/sampling_logp_difference/mean": 0.03380006551742554, "step": 2881, "step_time": 3.611877606002963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2708445005118847, "epoch": 0.02882, "grad_norm": 0.004649122711271048, "kl": 0.654562309384346, "learning_rate": 9.996252078565582e-06, "loss": -0.0174, "step": 2882, "step_time": 1.9768626969962497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.7024286719970405, "epoch": 0.02883, "frac_reward_zero_std": 0.5, "grad_norm": 0.013098962604999542, "kl": 0.6981385052204132, "learning_rate": 9.996249444725167e-06, "loss": -0.0411, "num_tokens": 24299658.0, "reward": 0.783096194267273, "reward_std": 0.037360332906246185, "rewards/rollout_reward_func/mean": 0.783096194267273, "rewards/rollout_reward_func/std": 0.29513052105903625, "sampling/importance_sampling_ratio/max": 0.9990030527114868, "sampling/importance_sampling_ratio/mean": 0.8720132112503052, "sampling/importance_sampling_ratio/min": 2.232744211738691e-09, "sampling/sampling_logp_difference/max": 17.298595428466797, "sampling/sampling_logp_difference/mean": 0.1968960016965866, "step": 2883, "step_time": 5.025501309995889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7032903847284615, "epoch": 0.02884, "grad_norm": 0.013187015429139137, "kl": 0.6947740912437439, "learning_rate": 9.996246809960082e-06, "loss": -0.0411, "step": 2884, "step_time": 1.9803627830042387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05530566303059459, "epoch": 0.02885, "frac_reward_zero_std": 1.0, "grad_norm": 0.000517068721819669, "kl": 0.40156953036785126, "learning_rate": 9.996244174270322e-06, "loss": 0.0015, "num_tokens": 24316362.0, "reward": 1.1114616394042969, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.1114616394042969, "rewards/rollout_reward_func/std": 0.18725252151489258, "sampling/importance_sampling_ratio/max": 0.998883068561554, "sampling/importance_sampling_ratio/mean": 0.9938448667526245, "sampling/importance_sampling_ratio/min": 0.988508403301239, "sampling/sampling_logp_difference/max": 0.006459444761276245, "sampling/sampling_logp_difference/mean": 0.0010007575619965792, "step": 2885, "step_time": 4.394056048011407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05593921011313796, "epoch": 0.02886, "grad_norm": 0.0005194738041609526, "kl": 0.40145648270845413, "learning_rate": 9.996241537655891e-06, "loss": 0.0015, "step": 2886, "step_time": 2.0398831460042857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 125.65625, "completions/mean_terminated_length": 125.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2971767224371433, "epoch": 0.02887, "frac_reward_zero_std": 0.75, "grad_norm": 0.004824598785489798, "kl": 0.540940273553133, "learning_rate": 9.996238900116787e-06, "loss": -0.0176, "num_tokens": 24331207.0, "reward": 0.7059327363967896, "reward_std": 0.0020397312473505735, "rewards/rollout_reward_func/mean": 0.7059327363967896, "rewards/rollout_reward_func/std": 0.19299696385860443, "sampling/importance_sampling_ratio/max": 0.9995079040527344, "sampling/importance_sampling_ratio/mean": 0.9623638987541199, "sampling/importance_sampling_ratio/min": 0.006044051609933376, "sampling/sampling_logp_difference/max": 1.7801170349121094, "sampling/sampling_logp_difference/mean": 0.05379900336265564, "step": 2887, "step_time": 4.1917746720064315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30007103085517883, "epoch": 0.02888, "grad_norm": 0.004545946605503559, "kl": 0.547021996229887, "learning_rate": 9.996236261653014e-06, "loss": -0.0176, "step": 2888, "step_time": 2.5202320760145085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 215.71875, "completions/mean_terminated_length": 215.71875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4024423281662166, "epoch": 0.02889, "frac_reward_zero_std": 0.75, "grad_norm": 0.009149732999503613, "kl": 0.4707561917603016, "learning_rate": 9.996233622264567e-06, "loss": -0.0267, "num_tokens": 24348822.0, "reward": 0.8282451629638672, "reward_std": 0.041515324264764786, "rewards/rollout_reward_func/mean": 0.8282451629638672, "rewards/rollout_reward_func/std": 0.3616671562194824, "sampling/importance_sampling_ratio/max": 0.9991050362586975, "sampling/importance_sampling_ratio/mean": 0.9618757963180542, "sampling/importance_sampling_ratio/min": 1.3854752577407287e-13, "sampling/sampling_logp_difference/max": 2.610381841659546, "sampling/sampling_logp_difference/mean": 0.14054341614246368, "step": 2889, "step_time": 5.007491576987377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.40297297621145844, "epoch": 0.0289, "grad_norm": 0.012174738571047783, "kl": 0.4663223586976528, "learning_rate": 9.996230981951453e-06, "loss": -0.0267, "step": 2890, "step_time": 2.046878357992682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 169.40625, "completions/mean_terminated_length": 169.40625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.40019452292472124, "epoch": 0.02891, "frac_reward_zero_std": 0.5, "grad_norm": 0.011807159520685673, "kl": 0.8023916892707348, "learning_rate": 9.996228340713668e-06, "loss": -0.0536, "num_tokens": 24364979.0, "reward": 0.8693509101867676, "reward_std": 0.08940821886062622, "rewards/rollout_reward_func/mean": 0.8693509101867676, "rewards/rollout_reward_func/std": 0.26873254776000977, "sampling/importance_sampling_ratio/max": 0.9992485642433167, "sampling/importance_sampling_ratio/mean": 0.9091725945472717, "sampling/importance_sampling_ratio/min": 0.0008497567614540458, "sampling/sampling_logp_difference/max": 2.5708508491516113, "sampling/sampling_logp_difference/mean": 0.06430082023143768, "step": 2891, "step_time": 4.410978175001219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3987875781022012, "epoch": 0.02892, "grad_norm": 0.012615333311259747, "kl": 0.8095283545553684, "learning_rate": 9.996225698551217e-06, "loss": -0.0535, "step": 2892, "step_time": 2.039789705995645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06942929327487946, "epoch": 0.02893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006417546537704766, "kl": 0.45944297686219215, "learning_rate": 9.996223055464097e-06, "loss": 0.0017, "num_tokens": 24381779.0, "reward": 0.7286922931671143, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7286922931671143, "rewards/rollout_reward_func/std": 0.20955176651477814, "sampling/importance_sampling_ratio/max": 0.9991055727005005, "sampling/importance_sampling_ratio/mean": 0.9956384897232056, "sampling/importance_sampling_ratio/min": 0.9906861782073975, "sampling/sampling_logp_difference/max": 0.005889516323804855, "sampling/sampling_logp_difference/mean": 0.001017245464026928, "step": 2893, "step_time": 4.699876450984448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0695134848356247, "epoch": 0.02894, "grad_norm": 0.0006513161933980882, "kl": 0.459457840770483, "learning_rate": 9.99622041145231e-06, "loss": 0.0017, "step": 2894, "step_time": 2.462107773993921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 137.96875, "completions/mean_terminated_length": 137.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4665727987885475, "epoch": 0.02895, "frac_reward_zero_std": 0.5, "grad_norm": 0.009831349365413189, "kl": 0.48459820076823235, "learning_rate": 9.996217766515856e-06, "loss": -0.0178, "num_tokens": 24396906.0, "reward": 0.7330095767974854, "reward_std": 0.05926099419593811, "rewards/rollout_reward_func/mean": 0.7330095767974854, "rewards/rollout_reward_func/std": 0.4156895875930786, "sampling/importance_sampling_ratio/max": 1.0000666379928589, "sampling/importance_sampling_ratio/mean": 0.9350223541259766, "sampling/importance_sampling_ratio/min": 0.015348752960562706, "sampling/sampling_logp_difference/max": 1.805452823638916, "sampling/sampling_logp_difference/mean": 0.05578362196683884, "step": 2895, "step_time": 4.598559920996195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4682181291282177, "epoch": 0.02896, "grad_norm": 0.00903890747576952, "kl": 0.4734221622347832, "learning_rate": 9.996215120654738e-06, "loss": -0.0178, "step": 2896, "step_time": 2.028174437007692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4863018300384283, "epoch": 0.02897, "frac_reward_zero_std": 0.75, "grad_norm": 0.006366098299622536, "kl": 0.5087772719562054, "learning_rate": 9.996212473868954e-06, "loss": -0.0129, "num_tokens": 24413182.0, "reward": 0.7118798494338989, "reward_std": 0.01670553721487522, "rewards/rollout_reward_func/mean": 0.7118798494338989, "rewards/rollout_reward_func/std": 0.3153359591960907, "sampling/importance_sampling_ratio/max": 0.9970769882202148, "sampling/importance_sampling_ratio/mean": 0.9305998086929321, "sampling/importance_sampling_ratio/min": 0.0007802589680068195, "sampling/sampling_logp_difference/max": 1.5114554166793823, "sampling/sampling_logp_difference/mean": 0.06873522698879242, "step": 2897, "step_time": 4.208787354007654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.49509126134216785, "epoch": 0.02898, "grad_norm": 0.005684385076165199, "kl": 0.5020761638879776, "learning_rate": 9.996209826158505e-06, "loss": -0.013, "step": 2898, "step_time": 2.03242689200124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08042972721159458, "epoch": 0.02899, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008096603560261428, "kl": 0.5717602074146271, "learning_rate": 9.996207177523391e-06, "loss": 0.0015, "num_tokens": 24428022.0, "reward": 0.820961594581604, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.820961594581604, "rewards/rollout_reward_func/std": 0.26744258403778076, "sampling/importance_sampling_ratio/max": 0.997292697429657, "sampling/importance_sampling_ratio/mean": 0.9904860854148865, "sampling/importance_sampling_ratio/min": 0.982566237449646, "sampling/sampling_logp_difference/max": 0.015995390713214874, "sampling/sampling_logp_difference/mean": 0.002193054184317589, "step": 2899, "step_time": 3.961338663000788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08254987979307771, "epoch": 0.029, "grad_norm": 0.0009036498377099633, "kl": 0.5713602788746357, "learning_rate": 9.996204527963615e-06, "loss": 0.0015, "step": 2900, "step_time": 2.0384180009932606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 165.71875, "completions/mean_terminated_length": 165.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.550522718578577, "epoch": 0.02901, "frac_reward_zero_std": 0.5, "grad_norm": 0.008418257348239422, "kl": 0.5242051221430302, "learning_rate": 9.996201877479178e-06, "loss": -0.0079, "num_tokens": 24444037.0, "reward": 0.9119038581848145, "reward_std": 0.03179260343313217, "rewards/rollout_reward_func/mean": 0.9119038581848145, "rewards/rollout_reward_func/std": 0.23340579867362976, "sampling/importance_sampling_ratio/max": 0.9983248710632324, "sampling/importance_sampling_ratio/mean": 0.930103063583374, "sampling/importance_sampling_ratio/min": 0.00013122255040798336, "sampling/sampling_logp_difference/max": 2.455827474594116, "sampling/sampling_logp_difference/mean": 0.08097420632839203, "step": 2901, "step_time": 5.406941592991643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5498890299350023, "epoch": 0.02902, "grad_norm": 0.008251970633864403, "kl": 0.5238447599112988, "learning_rate": 9.996199226070078e-06, "loss": -0.0079, "step": 2902, "step_time": 2.021025202993769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 107.875, "completions/mean_terminated_length": 107.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.45759908482432365, "epoch": 0.02903, "frac_reward_zero_std": 0.75, "grad_norm": 0.00497563136741519, "kl": 0.65313920378685, "learning_rate": 9.996196573736315e-06, "loss": -0.0176, "num_tokens": 24458377.0, "reward": 1.018384575843811, "reward_std": 0.038074977695941925, "rewards/rollout_reward_func/mean": 1.018384575843811, "rewards/rollout_reward_func/std": 0.3002997040748596, "sampling/importance_sampling_ratio/max": 0.9976944327354431, "sampling/importance_sampling_ratio/mean": 0.9586135149002075, "sampling/importance_sampling_ratio/min": 8.641345417004231e-09, "sampling/sampling_logp_difference/max": 3.858912706375122, "sampling/sampling_logp_difference/mean": 0.133266881108284, "step": 2903, "step_time": 4.038317933984217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4574377005919814, "epoch": 0.02904, "grad_norm": 0.004986904561519623, "kl": 0.65262720733881, "learning_rate": 9.996193920477894e-06, "loss": -0.0176, "step": 2904, "step_time": 2.014930211000319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.07170993648469448, "epoch": 0.02905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006129672983661294, "kl": 0.4153336063027382, "learning_rate": 9.996191266294812e-06, "loss": 0.0013, "num_tokens": 24473969.0, "reward": 0.8626153469085693, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8626153469085693, "rewards/rollout_reward_func/std": 0.3794006407260895, "sampling/importance_sampling_ratio/max": 1.00114905834198, "sampling/importance_sampling_ratio/mean": 0.9946227073669434, "sampling/importance_sampling_ratio/min": 0.9865912795066833, "sampling/sampling_logp_difference/max": 0.009397558867931366, "sampling/sampling_logp_difference/mean": 0.0011592516675591469, "step": 2905, "step_time": 4.217423192982096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07190571678802371, "epoch": 0.02906, "grad_norm": 0.0006115440046414733, "kl": 0.4152711294591427, "learning_rate": 9.99618861118707e-06, "loss": 0.0013, "step": 2906, "step_time": 2.536215638014255 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 145.65625, "completions/mean_terminated_length": 145.65625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 1.3586742915213108, "epoch": 0.02907, "frac_reward_zero_std": 0.25, "grad_norm": 0.02142401412129402, "kl": 0.9138319194316864, "learning_rate": 9.996185955154672e-06, "loss": -0.0573, "num_tokens": 24489422.0, "reward": 0.6892284154891968, "reward_std": 0.13154923915863037, "rewards/rollout_reward_func/mean": 0.6892284154891968, "rewards/rollout_reward_func/std": 0.26290440559387207, "sampling/importance_sampling_ratio/max": 0.9980316162109375, "sampling/importance_sampling_ratio/mean": 0.7765412926673889, "sampling/importance_sampling_ratio/min": 4.609316400783214e-12, "sampling/sampling_logp_difference/max": 18.208253860473633, "sampling/sampling_logp_difference/mean": 0.4650009274482727, "step": 2907, "step_time": 4.096653967993916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3614050084725022, "epoch": 0.02908, "grad_norm": 0.01828850619494915, "kl": 0.8717442490160465, "learning_rate": 9.996183298197613e-06, "loss": -0.0574, "step": 2908, "step_time": 2.0481629239875474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3378429329022765, "epoch": 0.02909, "frac_reward_zero_std": 0.75, "grad_norm": 0.0054877265356481075, "kl": 0.40103762224316597, "learning_rate": 9.996180640315898e-06, "loss": -0.0178, "num_tokens": 24505510.0, "reward": 0.8570865392684937, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.8570865392684937, "rewards/rollout_reward_func/std": 0.36987540125846863, "sampling/importance_sampling_ratio/max": 0.9990856051445007, "sampling/importance_sampling_ratio/mean": 0.963075578212738, "sampling/importance_sampling_ratio/min": 0.0007570664747618139, "sampling/sampling_logp_difference/max": 1.9348201751708984, "sampling/sampling_logp_difference/mean": 0.05728612840175629, "step": 2909, "step_time": 4.342087345990876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33618772868067026, "epoch": 0.0291, "grad_norm": 0.0055521330796182156, "kl": 0.40058476105332375, "learning_rate": 9.996177981509527e-06, "loss": -0.0178, "step": 2910, "step_time": 2.0456594729985227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34000446647405624, "epoch": 0.02911, "frac_reward_zero_std": 0.5, "grad_norm": 0.030980098992586136, "kl": 0.771015077829361, "learning_rate": 9.9961753217785e-06, "loss": -0.0334, "num_tokens": 24518958.0, "reward": 0.6475625038146973, "reward_std": 0.04170570150017738, "rewards/rollout_reward_func/mean": 0.6475625038146973, "rewards/rollout_reward_func/std": 0.10014620423316956, "sampling/importance_sampling_ratio/max": 0.9991621375083923, "sampling/importance_sampling_ratio/mean": 0.9361786842346191, "sampling/importance_sampling_ratio/min": 0.02081894315779209, "sampling/sampling_logp_difference/max": 2.1486475467681885, "sampling/sampling_logp_difference/mean": 0.06068354845046997, "step": 2911, "step_time": 3.652249191996816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33877547923475504, "epoch": 0.02912, "grad_norm": 0.033448513597249985, "kl": 0.760346993803978, "learning_rate": 9.996172661122816e-06, "loss": -0.0334, "step": 2912, "step_time": 2.6704905060032615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.15625, "completions/mean_terminated_length": 165.15625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.22618097811937332, "epoch": 0.02913, "frac_reward_zero_std": 0.75, "grad_norm": 0.0061293975450098515, "kl": 0.4316949136555195, "learning_rate": 9.996169999542479e-06, "loss": -0.027, "num_tokens": 24534979.0, "reward": 0.7687981128692627, "reward_std": 0.029508113861083984, "rewards/rollout_reward_func/mean": 0.7687981128692627, "rewards/rollout_reward_func/std": 0.3179078996181488, "sampling/importance_sampling_ratio/max": 0.9982150197029114, "sampling/importance_sampling_ratio/mean": 0.9631873369216919, "sampling/importance_sampling_ratio/min": 0.021063106134533882, "sampling/sampling_logp_difference/max": 1.5681917667388916, "sampling/sampling_logp_difference/mean": 0.02174205519258976, "step": 2913, "step_time": 4.115571536007337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22539330553263426, "epoch": 0.02914, "grad_norm": 0.005985674448311329, "kl": 0.4318792223930359, "learning_rate": 9.996167337037489e-06, "loss": -0.027, "step": 2914, "step_time": 1.9738092199841049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 164.59375, "completions/mean_terminated_length": 164.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4901172947138548, "epoch": 0.02915, "frac_reward_zero_std": 0.75, "grad_norm": 0.012649140320718288, "kl": 0.5422052592039108, "learning_rate": 9.996164673607843e-06, "loss": -0.0363, "num_tokens": 24551070.0, "reward": 0.8374760150909424, "reward_std": 0.040427468717098236, "rewards/rollout_reward_func/mean": 0.8374760150909424, "rewards/rollout_reward_func/std": 0.3320681154727936, "sampling/importance_sampling_ratio/max": 0.992786705493927, "sampling/importance_sampling_ratio/mean": 0.9579951167106628, "sampling/importance_sampling_ratio/min": 7.835926147246766e-16, "sampling/sampling_logp_difference/max": 4.2700724601745605, "sampling/sampling_logp_difference/mean": 0.16715741157531738, "step": 2915, "step_time": 4.318385908001801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48699410539120436, "epoch": 0.02916, "grad_norm": 0.012957506813108921, "kl": 0.547166645526886, "learning_rate": 9.996162009253546e-06, "loss": -0.0363, "step": 2916, "step_time": 2.065306910983054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 153.03125, "completions/mean_terminated_length": 153.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3750056540593505, "epoch": 0.02917, "frac_reward_zero_std": 0.75, "grad_norm": 0.0058618029579520226, "kl": 0.4215439707040787, "learning_rate": 9.996159343974595e-06, "loss": -0.0178, "num_tokens": 24566703.0, "reward": 0.7178846597671509, "reward_std": 0.013598206453025341, "rewards/rollout_reward_func/mean": 0.7178846597671509, "rewards/rollout_reward_func/std": 0.3078385889530182, "sampling/importance_sampling_ratio/max": 0.9981894493103027, "sampling/importance_sampling_ratio/mean": 0.962748646736145, "sampling/importance_sampling_ratio/min": 2.3870898147038133e-08, "sampling/sampling_logp_difference/max": 4.192923545837402, "sampling/sampling_logp_difference/mean": 0.12143626064062119, "step": 2917, "step_time": 4.375279040999885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3743870765902102, "epoch": 0.02918, "grad_norm": 0.005794648081064224, "kl": 0.4208870120346546, "learning_rate": 9.996156677770994e-06, "loss": -0.0178, "step": 2918, "step_time": 3.0375386839950806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5535774976015091, "epoch": 0.02919, "frac_reward_zero_std": 0.5, "grad_norm": 0.010875788517296314, "kl": 0.4726518727838993, "learning_rate": 9.996154010642744e-06, "loss": -0.056, "num_tokens": 24582419.0, "reward": 0.8536956906318665, "reward_std": 0.052425168454647064, "rewards/rollout_reward_func/mean": 0.8536956906318665, "rewards/rollout_reward_func/std": 0.40424272418022156, "sampling/importance_sampling_ratio/max": 0.998201310634613, "sampling/importance_sampling_ratio/mean": 0.9309096336364746, "sampling/importance_sampling_ratio/min": 2.14719491964388e-07, "sampling/sampling_logp_difference/max": 3.342359781265259, "sampling/sampling_logp_difference/mean": 0.13722798228263855, "step": 2919, "step_time": 4.666773347991693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5490323305130005, "epoch": 0.0292, "grad_norm": 0.009513725526630878, "kl": 0.4626225344836712, "learning_rate": 9.996151342589842e-06, "loss": -0.056, "step": 2920, "step_time": 2.093052975986211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06626735720783472, "epoch": 0.02921, "frac_reward_zero_std": 1.0, "grad_norm": 0.000682974117808044, "kl": 0.4212917536497116, "learning_rate": 9.996148673612291e-06, "loss": 0.0017, "num_tokens": 24599771.0, "reward": 0.9210000038146973, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.9210000038146973, "rewards/rollout_reward_func/std": 0.32060936093330383, "sampling/importance_sampling_ratio/max": 1.0006424188613892, "sampling/importance_sampling_ratio/mean": 0.994592010974884, "sampling/importance_sampling_ratio/min": 0.9878864288330078, "sampling/sampling_logp_difference/max": 0.008031472563743591, "sampling/sampling_logp_difference/mean": 0.0011008932488039136, "step": 2921, "step_time": 4.505742086003011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06521870382130146, "epoch": 0.02922, "grad_norm": 0.0006558039458468556, "kl": 0.42149652168154716, "learning_rate": 9.99614600371009e-06, "loss": 0.0017, "step": 2922, "step_time": 2.0207075319922296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07291479734703898, "epoch": 0.02923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005273303831927478, "kl": 0.6140705868601799, "learning_rate": 9.996143332883242e-06, "loss": 0.0016, "num_tokens": 24614547.0, "reward": 0.671923041343689, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.671923041343689, "rewards/rollout_reward_func/std": 0.06390844285488129, "sampling/importance_sampling_ratio/max": 0.9986971020698547, "sampling/importance_sampling_ratio/mean": 0.9950833320617676, "sampling/importance_sampling_ratio/min": 0.9905144572257996, "sampling/sampling_logp_difference/max": 0.00815589353442192, "sampling/sampling_logp_difference/mean": 0.0012045640032738447, "step": 2923, "step_time": 3.916613116991357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07139785960316658, "epoch": 0.02924, "grad_norm": 0.000511127698700875, "kl": 0.6143860146403313, "learning_rate": 9.996140661131747e-06, "loss": 0.0016, "step": 2924, "step_time": 2.972622647008393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 220.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 172.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6728983586654067, "epoch": 0.02925, "frac_reward_zero_std": 0.5, "grad_norm": 0.01949191652238369, "kl": 0.5686670579016209, "learning_rate": 9.996137988455605e-06, "loss": -0.0457, "num_tokens": 24630867.0, "reward": 0.7058653831481934, "reward_std": 0.09002013504505157, "rewards/rollout_reward_func/mean": 0.7058653831481934, "rewards/rollout_reward_func/std": 0.23460808396339417, "sampling/importance_sampling_ratio/max": 0.9999948143959045, "sampling/importance_sampling_ratio/mean": 0.932876706123352, "sampling/importance_sampling_ratio/min": 2.950215225891136e-16, "sampling/sampling_logp_difference/max": 3.1996641159057617, "sampling/sampling_logp_difference/mean": 0.20916633307933807, "step": 2925, "step_time": 4.2100862019869965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.673509465996176, "epoch": 0.02926, "grad_norm": 0.01946944184601307, "kl": 0.5725330002605915, "learning_rate": 9.996135314854818e-06, "loss": -0.0457, "step": 2926, "step_time": 2.0399508539994713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 214.96875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3994407942518592, "epoch": 0.02927, "frac_reward_zero_std": 0.75, "grad_norm": 0.006764458958059549, "kl": 0.4303329065442085, "learning_rate": 9.996132640329383e-06, "loss": -0.034, "num_tokens": 24648370.0, "reward": 0.8348269462585449, "reward_std": 0.01706082746386528, "rewards/rollout_reward_func/mean": 0.8348269462585449, "rewards/rollout_reward_func/std": 0.19248047471046448, "sampling/importance_sampling_ratio/max": 0.9989413619041443, "sampling/importance_sampling_ratio/mean": 0.9327702522277832, "sampling/importance_sampling_ratio/min": 0.0005590390646830201, "sampling/sampling_logp_difference/max": 2.834977149963379, "sampling/sampling_logp_difference/mean": 0.05633952468633652, "step": 2927, "step_time": 4.2830361839951365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39795931661501527, "epoch": 0.02928, "grad_norm": 0.007069423329085112, "kl": 0.4352187365293503, "learning_rate": 9.996129964879305e-06, "loss": -0.034, "step": 2928, "step_time": 1.9670125229895348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.06865647155791521, "epoch": 0.02929, "frac_reward_zero_std": 1.0, "grad_norm": 0.000617105804849416, "kl": 0.4846276678144932, "learning_rate": 9.996127288504582e-06, "loss": 0.0017, "num_tokens": 24664298.0, "reward": 0.7746154069900513, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7746154069900513, "rewards/rollout_reward_func/std": 0.2426915317773819, "sampling/importance_sampling_ratio/max": 0.9987488985061646, "sampling/importance_sampling_ratio/mean": 0.9931789636611938, "sampling/importance_sampling_ratio/min": 0.9868988394737244, "sampling/sampling_logp_difference/max": 0.007781824097037315, "sampling/sampling_logp_difference/mean": 0.0013297493569552898, "step": 2929, "step_time": 3.985378480007057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06850490346550941, "epoch": 0.0293, "grad_norm": 0.0006192443543113768, "kl": 0.484665609896183, "learning_rate": 9.996124611205217e-06, "loss": 0.0017, "step": 2930, "step_time": 2.8916616199858254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 187.84375, "completions/mean_terminated_length": 187.84375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2555590281262994, "epoch": 0.02931, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036851544864475727, "kl": 0.415917094796896, "learning_rate": 9.996121932981209e-06, "loss": -0.027, "num_tokens": 24680989.0, "reward": 0.8298269510269165, "reward_std": 0.004079459700733423, "rewards/rollout_reward_func/mean": 0.8298269510269165, "rewards/rollout_reward_func/std": 0.3018702566623688, "sampling/importance_sampling_ratio/max": 0.9990037679672241, "sampling/importance_sampling_ratio/mean": 0.9637381434440613, "sampling/importance_sampling_ratio/min": 0.0054314895533025265, "sampling/sampling_logp_difference/max": 1.6634809970855713, "sampling/sampling_logp_difference/mean": 0.025298139080405235, "step": 2931, "step_time": 4.402486686994962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.25589864840731025, "epoch": 0.02932, "grad_norm": 0.0037486362271010876, "kl": 0.4157010205090046, "learning_rate": 9.996119253832556e-06, "loss": -0.027, "step": 2932, "step_time": 2.0269113829926937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.05459881480783224, "epoch": 0.02933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005049118772149086, "kl": 0.5055703409016132, "learning_rate": 9.996116573759264e-06, "loss": 0.0019, "num_tokens": 24698013.0, "reward": 0.7864615321159363, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7864615321159363, "rewards/rollout_reward_func/std": 0.1936255842447281, "sampling/importance_sampling_ratio/max": 1.0008405447006226, "sampling/importance_sampling_ratio/mean": 0.9961156845092773, "sampling/importance_sampling_ratio/min": 0.9924208521842957, "sampling/sampling_logp_difference/max": 0.005719978362321854, "sampling/sampling_logp_difference/mean": 0.0009800889529287815, "step": 2933, "step_time": 4.24545548199967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.054232079070061445, "epoch": 0.02934, "grad_norm": 0.0004950527218170464, "kl": 0.5056262165307999, "learning_rate": 9.996113892761332e-06, "loss": 0.0019, "step": 2934, "step_time": 2.046068800977082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 143.84375, "completions/mean_terminated_length": 145.22579956054688, "completions/min_length": 101.0, "completions/min_terminated_length": 104.0, "entropy": 0.8407297036610544, "epoch": 0.02935, "frac_reward_zero_std": 0.25, "grad_norm": 0.017678581178188324, "kl": 0.818238727748394, "learning_rate": 9.996111210838758e-06, "loss": -0.0246, "num_tokens": 24713384.0, "reward": 0.82401442527771, "reward_std": 0.11633266508579254, "rewards/rollout_reward_func/mean": 0.82401442527771, "rewards/rollout_reward_func/std": 0.31688517332077026, "sampling/importance_sampling_ratio/max": 1.0003920793533325, "sampling/importance_sampling_ratio/mean": 0.9052830934524536, "sampling/importance_sampling_ratio/min": 4.432630119280773e-18, "sampling/sampling_logp_difference/max": 3.4240145683288574, "sampling/sampling_logp_difference/mean": 0.2737642228603363, "step": 2935, "step_time": 4.51931297199917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8385878163389862, "epoch": 0.02936, "grad_norm": 0.013291322626173496, "kl": 0.7670213915407658, "learning_rate": 9.996108527991544e-06, "loss": -0.0247, "step": 2936, "step_time": 2.507051828011754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 211.40625, "completions/mean_terminated_length": 211.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5144809931516647, "epoch": 0.02937, "frac_reward_zero_std": 0.0, "grad_norm": 0.03683407977223396, "kl": 0.5643757730722427, "learning_rate": 9.996105844219693e-06, "loss": -0.0506, "num_tokens": 24730773.0, "reward": 0.7708557844161987, "reward_std": 0.09858700633049011, "rewards/rollout_reward_func/mean": 0.7708557844161987, "rewards/rollout_reward_func/std": 0.24859194457530975, "sampling/importance_sampling_ratio/max": 0.9966242909431458, "sampling/importance_sampling_ratio/mean": 0.8733654618263245, "sampling/importance_sampling_ratio/min": 0.0012542216572910547, "sampling/sampling_logp_difference/max": 2.547499179840088, "sampling/sampling_logp_difference/mean": 0.08814956992864609, "step": 2937, "step_time": 4.521643321990268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.523096895776689, "epoch": 0.02938, "grad_norm": 0.038363952189683914, "kl": 0.5588945038616657, "learning_rate": 9.996103159523205e-06, "loss": -0.0508, "step": 2938, "step_time": 2.084410574003414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 111.8125, "completions/mean_terminated_length": 111.8125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4385632863268256, "epoch": 0.02939, "frac_reward_zero_std": 0.75, "grad_norm": 0.008034723810851574, "kl": 0.6836576201021671, "learning_rate": 9.996100473902077e-06, "loss": -0.0233, "num_tokens": 24745031.0, "reward": 0.5267788171768188, "reward_std": 0.12506616115570068, "rewards/rollout_reward_func/mean": 0.5267788171768188, "rewards/rollout_reward_func/std": 0.3432587683200836, "sampling/importance_sampling_ratio/max": 0.9996663331985474, "sampling/importance_sampling_ratio/mean": 0.9347453117370605, "sampling/importance_sampling_ratio/min": 0.006787145510315895, "sampling/sampling_logp_difference/max": 2.0387701988220215, "sampling/sampling_logp_difference/mean": 0.058205727487802505, "step": 2939, "step_time": 3.8389909510078724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4372130986303091, "epoch": 0.0294, "grad_norm": 0.007379347924143076, "kl": 0.6770377680659294, "learning_rate": 9.996097787356312e-06, "loss": -0.0233, "step": 2940, "step_time": 2.011638213007245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 194.78125, "completions/mean_terminated_length": 194.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4290217459201813, "epoch": 0.02941, "frac_reward_zero_std": 0.5, "grad_norm": 0.0118164187297225, "kl": 0.48029719293117523, "learning_rate": 9.996095099885912e-06, "loss": -0.0646, "num_tokens": 24762000.0, "reward": 0.7567499876022339, "reward_std": 0.011150531470775604, "rewards/rollout_reward_func/mean": 0.7567499876022339, "rewards/rollout_reward_func/std": 0.3970157206058502, "sampling/importance_sampling_ratio/max": 0.9968722462654114, "sampling/importance_sampling_ratio/mean": 0.931273877620697, "sampling/importance_sampling_ratio/min": 2.5287247353844577e-06, "sampling/sampling_logp_difference/max": 3.245826244354248, "sampling/sampling_logp_difference/mean": 0.07659505307674408, "step": 2941, "step_time": 5.091074595984537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4267366351559758, "epoch": 0.02942, "grad_norm": 0.011902439408004284, "kl": 0.4804268032312393, "learning_rate": 9.996092411490876e-06, "loss": -0.0646, "step": 2942, "step_time": 2.507151881007303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.06356831546872854, "epoch": 0.02943, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048113588127307594, "kl": 0.4884833134710789, "learning_rate": 9.996089722171205e-06, "loss": 0.0014, "num_tokens": 24776848.0, "reward": 0.681538462638855, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.681538462638855, "rewards/rollout_reward_func/std": 0.05672488734126091, "sampling/importance_sampling_ratio/max": 0.9996698498725891, "sampling/importance_sampling_ratio/mean": 0.9960311055183411, "sampling/importance_sampling_ratio/min": 0.9906021952629089, "sampling/sampling_logp_difference/max": 0.007217003032565117, "sampling/sampling_logp_difference/mean": 0.001067677396349609, "step": 2943, "step_time": 3.870173920993693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06347520370036364, "epoch": 0.02944, "grad_norm": 0.0004796756838914007, "kl": 0.4884730726480484, "learning_rate": 9.996087031926901e-06, "loss": 0.0014, "step": 2944, "step_time": 1.9841879469968262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3837727978825569, "epoch": 0.02945, "frac_reward_zero_std": 0.5, "grad_norm": 0.02077377960085869, "kl": 0.5057614520192146, "learning_rate": 9.996084340757962e-06, "loss": -0.0356, "num_tokens": 24791600.0, "reward": 0.6010096073150635, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.6010096073150635, "rewards/rollout_reward_func/std": 0.15552513301372528, "sampling/importance_sampling_ratio/max": 0.9980056285858154, "sampling/importance_sampling_ratio/mean": 0.9340304136276245, "sampling/importance_sampling_ratio/min": 0.02304663136601448, "sampling/sampling_logp_difference/max": 2.0012006759643555, "sampling/sampling_logp_difference/mean": 0.05240500718355179, "step": 2945, "step_time": 4.002109360022587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3825103836134076, "epoch": 0.02946, "grad_norm": 0.021000217646360397, "kl": 0.5063264966011047, "learning_rate": 9.996081648664393e-06, "loss": -0.0356, "step": 2946, "step_time": 1.9855056510132272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 136.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3041623644530773, "epoch": 0.02947, "frac_reward_zero_std": 0.25, "grad_norm": 0.014518815092742443, "kl": 0.6461009606719017, "learning_rate": 9.996078955646189e-06, "loss": -0.0571, "num_tokens": 24806766.0, "reward": 0.7669447660446167, "reward_std": 0.09249390661716461, "rewards/rollout_reward_func/mean": 0.7669447660446167, "rewards/rollout_reward_func/std": 0.3086891770362854, "sampling/importance_sampling_ratio/max": 0.9990668296813965, "sampling/importance_sampling_ratio/mean": 0.8709315061569214, "sampling/importance_sampling_ratio/min": 1.0125877144450511e-21, "sampling/sampling_logp_difference/max": 12.803488731384277, "sampling/sampling_logp_difference/mean": 0.588900089263916, "step": 2947, "step_time": 4.904453970986651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3029789859429002, "epoch": 0.02948, "grad_norm": 0.013899119570851326, "kl": 0.6476493254303932, "learning_rate": 9.996076261703353e-06, "loss": -0.0572, "step": 2948, "step_time": 2.43761503600399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 104.0, "completions/max_terminated_length": 104.0, "completions/mean_length": 97.0, "completions/mean_terminated_length": 97.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.15123852528631687, "epoch": 0.02949, "frac_reward_zero_std": 0.75, "grad_norm": 0.1272355169057846, "kl": 0.6319021731615067, "learning_rate": 9.996073566835888e-06, "loss": -0.0146, "num_tokens": 24820670.0, "reward": 0.7375961542129517, "reward_std": 0.017677659168839455, "rewards/rollout_reward_func/mean": 0.7375961542129517, "rewards/rollout_reward_func/std": 0.24928739666938782, "sampling/importance_sampling_ratio/max": 0.9992924928665161, "sampling/importance_sampling_ratio/mean": 0.969176173210144, "sampling/importance_sampling_ratio/min": 0.14766888320446014, "sampling/sampling_logp_difference/max": 2.215726613998413, "sampling/sampling_logp_difference/mean": 0.021167194470763206, "step": 2949, "step_time": 3.7137477790092817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.14384880661964417, "epoch": 0.0295, "grad_norm": 0.0169121865183115, "kl": 0.6632212027907372, "learning_rate": 9.996070871043792e-06, "loss": -0.0148, "step": 2950, "step_time": 1.945134871006303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 104.3125, "completions/mean_terminated_length": 104.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.9710900289937854, "epoch": 0.02951, "frac_reward_zero_std": 0.0, "grad_norm": 0.013321862556040287, "kl": 0.7137513719499111, "learning_rate": 9.996068174327066e-06, "loss": -0.0363, "num_tokens": 24834888.0, "reward": 0.5487500429153442, "reward_std": 0.036987125873565674, "rewards/rollout_reward_func/mean": 0.5487500429153442, "rewards/rollout_reward_func/std": 0.12416297942399979, "sampling/importance_sampling_ratio/max": 0.9996013641357422, "sampling/importance_sampling_ratio/mean": 0.8732389211654663, "sampling/importance_sampling_ratio/min": 0.001258750562556088, "sampling/sampling_logp_difference/max": 2.1308650970458984, "sampling/sampling_logp_difference/mean": 0.17059093713760376, "step": 2951, "step_time": 3.946929204998014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.968605731613934, "epoch": 0.02952, "grad_norm": 0.013279703445732594, "kl": 0.7136729396879673, "learning_rate": 9.996065476685711e-06, "loss": -0.0363, "step": 2952, "step_time": 1.9981707980041392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 93.75, "completions/mean_terminated_length": 93.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06373404012992978, "epoch": 0.02953, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039670331170782447, "kl": 0.664787620306015, "learning_rate": 9.996062778119729e-06, "loss": 0.0016, "num_tokens": 24848656.0, "reward": 0.8392307758331299, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8392307758331299, "rewards/rollout_reward_func/std": 0.13672025501728058, "sampling/importance_sampling_ratio/max": 0.99917072057724, "sampling/importance_sampling_ratio/mean": 0.9954830408096313, "sampling/importance_sampling_ratio/min": 0.9905877113342285, "sampling/sampling_logp_difference/max": 0.005126304924488068, "sampling/sampling_logp_difference/mean": 0.0012211098801344633, "step": 2953, "step_time": 4.100701496987313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06362389726564288, "epoch": 0.02954, "grad_norm": 0.00040103416540659964, "kl": 0.6647951900959015, "learning_rate": 9.996060078629118e-06, "loss": 0.0016, "step": 2954, "step_time": 2.407538230996579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.12813235726207495, "epoch": 0.02955, "frac_reward_zero_std": 0.75, "grad_norm": 0.02128012292087078, "kl": 0.5505583807826042, "learning_rate": 9.996057378213882e-06, "loss": -0.0245, "num_tokens": 24865478.0, "reward": 0.4241538643836975, "reward_std": 0.1321745663881302, "rewards/rollout_reward_func/mean": 0.4241538643836975, "rewards/rollout_reward_func/std": 0.3076786994934082, "sampling/importance_sampling_ratio/max": 1.000418782234192, "sampling/importance_sampling_ratio/mean": 0.967190682888031, "sampling/importance_sampling_ratio/min": 0.11839991062879562, "sampling/sampling_logp_difference/max": 2.25407338142395, "sampling/sampling_logp_difference/mean": 0.013711612671613693, "step": 2955, "step_time": 4.189347431994975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1271073305979371, "epoch": 0.02956, "grad_norm": 0.023082539439201355, "kl": 0.5499202273786068, "learning_rate": 9.996054676874018e-06, "loss": -0.0245, "step": 2956, "step_time": 2.0333053559734253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.12577434163540602, "epoch": 0.02957, "frac_reward_zero_std": 0.75, "grad_norm": 0.020878000184893608, "kl": 0.5637411065399647, "learning_rate": 9.99605197460953e-06, "loss": -0.0236, "num_tokens": 24880122.0, "reward": 0.7855528593063354, "reward_std": 0.010946566238999367, "rewards/rollout_reward_func/mean": 0.7855528593063354, "rewards/rollout_reward_func/std": 0.23128922283649445, "sampling/importance_sampling_ratio/max": 1.0007765293121338, "sampling/importance_sampling_ratio/mean": 0.9692869186401367, "sampling/importance_sampling_ratio/min": 0.12262208759784698, "sampling/sampling_logp_difference/max": 2.1507678031921387, "sampling/sampling_logp_difference/mean": 0.015582293272018433, "step": 2957, "step_time": 3.9484948739846004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12505900859832764, "epoch": 0.02958, "grad_norm": 0.019128205254673958, "kl": 0.5704132989048958, "learning_rate": 9.996049271420417e-06, "loss": -0.0237, "step": 2958, "step_time": 2.501611457009858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 145.25, "completions/mean_terminated_length": 145.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.1293689594604075, "epoch": 0.02959, "frac_reward_zero_std": 0.75, "grad_norm": 0.004714846611022949, "kl": 0.6401332430541515, "learning_rate": 9.996046567306679e-06, "loss": -0.0157, "num_tokens": 24895506.0, "reward": 0.6049038767814636, "reward_std": 0.01767767034471035, "rewards/rollout_reward_func/mean": 0.6049038767814636, "rewards/rollout_reward_func/std": 0.25009849667549133, "sampling/importance_sampling_ratio/max": 0.9990583062171936, "sampling/importance_sampling_ratio/mean": 0.9671982526779175, "sampling/importance_sampling_ratio/min": 0.08969543129205704, "sampling/sampling_logp_difference/max": 2.4105324745178223, "sampling/sampling_logp_difference/mean": 0.01605682633817196, "step": 2959, "step_time": 3.8665175100177294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1282686903141439, "epoch": 0.0296, "grad_norm": 0.003475381061434746, "kl": 0.659479945898056, "learning_rate": 9.996043862268317e-06, "loss": -0.0157, "step": 2960, "step_time": 2.439946411002893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 219.59375, "completions/mean_terminated_length": 219.59375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4505099831148982, "epoch": 0.02961, "frac_reward_zero_std": 0.75, "grad_norm": 0.02848881110548973, "kl": 0.7137865796685219, "learning_rate": 9.996041156305334e-06, "loss": 0.0221, "num_tokens": 24913157.0, "reward": 0.7863365411758423, "reward_std": 0.01835758052766323, "rewards/rollout_reward_func/mean": 0.7863365411758423, "rewards/rollout_reward_func/std": 0.14263474941253662, "sampling/importance_sampling_ratio/max": 0.9978141188621521, "sampling/importance_sampling_ratio/mean": 0.9387911558151245, "sampling/importance_sampling_ratio/min": 6.608551927911321e-08, "sampling/sampling_logp_difference/max": 3.3705525398254395, "sampling/sampling_logp_difference/mean": 0.08882448077201843, "step": 2961, "step_time": 4.360878081010014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44721354451030493, "epoch": 0.02962, "grad_norm": 0.025699308142066002, "kl": 0.6929610632359982, "learning_rate": 9.996038449417726e-06, "loss": 0.0221, "step": 2962, "step_time": 2.045083942000929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 237.0625, "completions/mean_terminated_length": 237.0625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2555199940688908, "epoch": 0.02963, "frac_reward_zero_std": 0.75, "grad_norm": 0.007486600894480944, "kl": 0.40779703855514526, "learning_rate": 9.996035741605499e-06, "loss": -0.0366, "num_tokens": 24931455.0, "reward": 0.9696586728096008, "reward_std": 0.023157751187682152, "rewards/rollout_reward_func/mean": 0.9696586728096008, "rewards/rollout_reward_func/std": 0.23143871128559113, "sampling/importance_sampling_ratio/max": 0.9980493783950806, "sampling/importance_sampling_ratio/mean": 0.9637749195098877, "sampling/importance_sampling_ratio/min": 0.0002902727574110031, "sampling/sampling_logp_difference/max": 2.030890464782715, "sampling/sampling_logp_difference/mean": 0.03876114264130592, "step": 2963, "step_time": 4.4363899220043095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2555164280347526, "epoch": 0.02964, "grad_norm": 0.007333092857152224, "kl": 0.4098645783960819, "learning_rate": 9.99603303286865e-06, "loss": -0.0366, "step": 2964, "step_time": 2.5193610959977377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.12581480061635375, "epoch": 0.02965, "frac_reward_zero_std": 0.75, "grad_norm": 0.036058615893125534, "kl": 0.9765450209379196, "learning_rate": 9.996030323207182e-06, "loss": -0.0145, "num_tokens": 24948343.0, "reward": 0.970596194267273, "reward_std": 0.04487408697605133, "rewards/rollout_reward_func/mean": 0.970596194267273, "rewards/rollout_reward_func/std": 0.36099839210510254, "sampling/importance_sampling_ratio/max": 0.999818742275238, "sampling/importance_sampling_ratio/mean": 0.9684403538703918, "sampling/importance_sampling_ratio/min": 0.09840507805347443, "sampling/sampling_logp_difference/max": 2.4422125816345215, "sampling/sampling_logp_difference/mean": 0.016692545264959335, "step": 2965, "step_time": 4.279595374988276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12669977685436606, "epoch": 0.02966, "grad_norm": 0.026354825124144554, "kl": 0.9139147736132145, "learning_rate": 9.996027612621093e-06, "loss": -0.0145, "step": 2966, "step_time": 2.438360494998051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 122.59375, "completions/mean_terminated_length": 122.59375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4159204922616482, "epoch": 0.02967, "frac_reward_zero_std": 0.75, "grad_norm": 0.02380790375173092, "kl": 0.5803592503070831, "learning_rate": 9.996024901110385e-06, "loss": 0.0202, "num_tokens": 24963002.0, "reward": 0.4961538314819336, "reward_std": 0.01631784811615944, "rewards/rollout_reward_func/mean": 0.4961538314819336, "rewards/rollout_reward_func/std": 0.0382874459028244, "sampling/importance_sampling_ratio/max": 1.000851035118103, "sampling/importance_sampling_ratio/mean": 0.9359934329986572, "sampling/importance_sampling_ratio/min": 0.013918901793658733, "sampling/sampling_logp_difference/max": 1.960028886795044, "sampling/sampling_logp_difference/mean": 0.05790141224861145, "step": 2967, "step_time": 3.906249364015821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41793637443333864, "epoch": 0.02968, "grad_norm": 0.023468945175409317, "kl": 0.5603964924812317, "learning_rate": 9.99602218867506e-06, "loss": 0.0201, "step": 2968, "step_time": 1.9821574890011107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05240213684737682, "epoch": 0.02969, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042384344851598144, "kl": 0.42556145787239075, "learning_rate": 9.996019475315118e-06, "loss": 0.0015, "num_tokens": 24979314.0, "reward": 0.8114615678787231, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8114615678787231, "rewards/rollout_reward_func/std": 0.4699174165725708, "sampling/importance_sampling_ratio/max": 0.9987007975578308, "sampling/importance_sampling_ratio/mean": 0.9957890510559082, "sampling/importance_sampling_ratio/min": 0.9906344413757324, "sampling/sampling_logp_difference/max": 0.006482277065515518, "sampling/sampling_logp_difference/mean": 0.0008353388402611017, "step": 2969, "step_time": 3.999022864001745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05315234512090683, "epoch": 0.0297, "grad_norm": 0.0004325199988670647, "kl": 0.4254060164093971, "learning_rate": 9.996016761030556e-06, "loss": 0.0014, "step": 2970, "step_time": 2.450951664999593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.26498847641050816, "epoch": 0.02971, "frac_reward_zero_std": 0.75, "grad_norm": 0.007673053070902824, "kl": 0.4671930558979511, "learning_rate": 9.996014045821381e-06, "loss": 0.0302, "num_tokens": 24995668.0, "reward": 0.8933509588241577, "reward_std": 0.0031683773268014193, "rewards/rollout_reward_func/mean": 0.8933509588241577, "rewards/rollout_reward_func/std": 0.35157206654548645, "sampling/importance_sampling_ratio/max": 1.0000008344650269, "sampling/importance_sampling_ratio/mean": 0.9642605781555176, "sampling/importance_sampling_ratio/min": 0.00013998667418491095, "sampling/sampling_logp_difference/max": 1.988176941871643, "sampling/sampling_logp_difference/mean": 0.04663704335689545, "step": 2971, "step_time": 4.765221241010295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26902286568656564, "epoch": 0.02972, "grad_norm": 0.007502387743443251, "kl": 0.46643848717212677, "learning_rate": 9.996011329687589e-06, "loss": 0.0302, "step": 2972, "step_time": 2.03001764902001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.297385832760483, "epoch": 0.02973, "frac_reward_zero_std": 1.0, "grad_norm": 0.005725650116801262, "kl": 0.4113124869763851, "learning_rate": 9.996008612629182e-06, "loss": 0.0017, "num_tokens": 25012346.0, "reward": 0.8035769462585449, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8035769462585449, "rewards/rollout_reward_func/std": 0.2623720169067383, "sampling/importance_sampling_ratio/max": 1.0001527070999146, "sampling/importance_sampling_ratio/mean": 0.9647442102432251, "sampling/importance_sampling_ratio/min": 8.385907392756062e-15, "sampling/sampling_logp_difference/max": 10.236885070800781, "sampling/sampling_logp_difference/mean": 0.12988656759262085, "step": 2973, "step_time": 4.532942500991339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29955620924010873, "epoch": 0.02974, "grad_norm": 0.006112479139119387, "kl": 0.4131992273032665, "learning_rate": 9.996005894646163e-06, "loss": 0.0018, "step": 2974, "step_time": 2.065373441997508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.667062649037689, "epoch": 0.02975, "frac_reward_zero_std": 0.5, "grad_norm": 0.007518255617469549, "kl": 0.5210824906826019, "learning_rate": 9.996003175738529e-06, "loss": -0.0652, "num_tokens": 25028746.0, "reward": 0.9500528573989868, "reward_std": 0.13180741667747498, "rewards/rollout_reward_func/mean": 0.9500528573989868, "rewards/rollout_reward_func/std": 0.33161672949790955, "sampling/importance_sampling_ratio/max": 0.9991506338119507, "sampling/importance_sampling_ratio/mean": 0.9319099187850952, "sampling/importance_sampling_ratio/min": 1.0331836081123313e-10, "sampling/sampling_logp_difference/max": 2.9912619590759277, "sampling/sampling_logp_difference/mean": 0.17009828984737396, "step": 2975, "step_time": 4.40513330099202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6684909611940384, "epoch": 0.02976, "grad_norm": 0.007052501663565636, "kl": 0.5174146257340908, "learning_rate": 9.99600045590628e-06, "loss": -0.0652, "step": 2976, "step_time": 2.452422125992598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 1.1091736485250294, "epoch": 0.02977, "frac_reward_zero_std": 0.5, "grad_norm": 0.02646360732614994, "kl": 0.519776564091444, "learning_rate": 9.995997735149423e-06, "loss": -0.0546, "num_tokens": 25044680.0, "reward": 0.4952884316444397, "reward_std": 0.044772569090127945, "rewards/rollout_reward_func/mean": 0.4952884316444397, "rewards/rollout_reward_func/std": 0.08845728635787964, "sampling/importance_sampling_ratio/max": 0.9987106919288635, "sampling/importance_sampling_ratio/mean": 0.8999214172363281, "sampling/importance_sampling_ratio/min": 2.6253647647394436e-18, "sampling/sampling_logp_difference/max": 3.637333869934082, "sampling/sampling_logp_difference/mean": 0.44832032918930054, "step": 2977, "step_time": 5.140051194000989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.109337080270052, "epoch": 0.02978, "grad_norm": 0.026005128398537636, "kl": 0.5170490741729736, "learning_rate": 9.995995013467951e-06, "loss": -0.0546, "step": 2978, "step_time": 2.060646423000435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 239.78125, "completions/mean_terminated_length": 239.78125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.21736663160845637, "epoch": 0.02979, "frac_reward_zero_std": 0.75, "grad_norm": 0.0046492330729961395, "kl": 0.4262812174856663, "learning_rate": 9.99599229086187e-06, "loss": -0.0364, "num_tokens": 25063089.0, "reward": 0.6678606271743774, "reward_std": 0.02791712060570717, "rewards/rollout_reward_func/mean": 0.6678606271743774, "rewards/rollout_reward_func/std": 0.4575512409210205, "sampling/importance_sampling_ratio/max": 0.997451663017273, "sampling/importance_sampling_ratio/mean": 0.9634254574775696, "sampling/importance_sampling_ratio/min": 0.007606970611959696, "sampling/sampling_logp_difference/max": 1.5629276037216187, "sampling/sampling_logp_difference/mean": 0.02300645411014557, "step": 2979, "step_time": 4.365921120006533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21132262563332915, "epoch": 0.0298, "grad_norm": 0.004826842341572046, "kl": 0.42605334892868996, "learning_rate": 9.99598956733118e-06, "loss": -0.0364, "step": 2980, "step_time": 2.046606594005425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 119.71875, "completions/mean_terminated_length": 119.71875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4250026233494282, "epoch": 0.02981, "frac_reward_zero_std": 0.5, "grad_norm": 0.04012202098965645, "kl": 0.5913733169436455, "learning_rate": 9.995986842875878e-06, "loss": 0.0378, "num_tokens": 25077656.0, "reward": 0.7231249809265137, "reward_std": 0.008838832378387451, "rewards/rollout_reward_func/mean": 0.7231249809265137, "rewards/rollout_reward_func/std": 0.10715818405151367, "sampling/importance_sampling_ratio/max": 1.001572847366333, "sampling/importance_sampling_ratio/mean": 0.9356449246406555, "sampling/importance_sampling_ratio/min": 0.03852234035730362, "sampling/sampling_logp_difference/max": 1.353601098060608, "sampling/sampling_logp_difference/mean": 0.04971805214881897, "step": 2981, "step_time": 4.300688101007836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.42039508279412985, "epoch": 0.02982, "grad_norm": 0.0409676618874073, "kl": 0.5899987667798996, "learning_rate": 9.995984117495967e-06, "loss": 0.0377, "step": 2982, "step_time": 1.9943527099967469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 175.21875, "completions/mean_terminated_length": 175.21875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5229987371712923, "epoch": 0.02983, "frac_reward_zero_std": 0.5, "grad_norm": 0.021504083648324013, "kl": 0.4604923017323017, "learning_rate": 9.99598139119145e-06, "loss": -0.0141, "num_tokens": 25094087.0, "reward": 0.6252341270446777, "reward_std": 0.01435018889605999, "rewards/rollout_reward_func/mean": 0.6252341270446777, "rewards/rollout_reward_func/std": 0.22989816963672638, "sampling/importance_sampling_ratio/max": 0.9985054135322571, "sampling/importance_sampling_ratio/mean": 0.9342060685157776, "sampling/importance_sampling_ratio/min": 9.51624352242496e-19, "sampling/sampling_logp_difference/max": 3.8004977703094482, "sampling/sampling_logp_difference/mean": 0.27221453189849854, "step": 2983, "step_time": 5.379412002992467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5221624067053199, "epoch": 0.02984, "grad_norm": 0.019992616027593613, "kl": 0.46358155086636543, "learning_rate": 9.995978663962324e-06, "loss": -0.0141, "step": 2984, "step_time": 2.1589475959917763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3418083442375064, "epoch": 0.02985, "frac_reward_zero_std": 0.5, "grad_norm": 0.022855833172798157, "kl": 0.5868017002940178, "learning_rate": 9.995975935808592e-06, "loss": 0.0035, "num_tokens": 25110439.0, "reward": 0.8695144057273865, "reward_std": 0.040182702243328094, "rewards/rollout_reward_func/mean": 0.8695144057273865, "rewards/rollout_reward_func/std": 0.32507821917533875, "sampling/importance_sampling_ratio/max": 1.000222086906433, "sampling/importance_sampling_ratio/mean": 0.9346468448638916, "sampling/importance_sampling_ratio/min": 0.0004495684406720102, "sampling/sampling_logp_difference/max": 2.66413950920105, "sampling/sampling_logp_difference/mean": 0.061107851564884186, "step": 2985, "step_time": 4.329551413007721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3443268444389105, "epoch": 0.02986, "grad_norm": 0.014720600098371506, "kl": 0.6114354282617569, "learning_rate": 9.995973206730254e-06, "loss": 0.0035, "step": 2986, "step_time": 2.1182079349891865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 296.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 188.258056640625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.7279515811242163, "epoch": 0.02987, "frac_reward_zero_std": 0.25, "grad_norm": 0.027287058532238007, "kl": 0.7575042955577374, "learning_rate": 9.995970476727312e-06, "loss": -0.0869, "num_tokens": 25127307.0, "reward": 0.7735288739204407, "reward_std": 0.06258179247379303, "rewards/rollout_reward_func/mean": 0.7735288739204407, "rewards/rollout_reward_func/std": 0.11456301063299179, "sampling/importance_sampling_ratio/max": 0.9987250566482544, "sampling/importance_sampling_ratio/mean": 0.8753015995025635, "sampling/importance_sampling_ratio/min": 1.3575565128745142e-16, "sampling/sampling_logp_difference/max": 3.0665435791015625, "sampling/sampling_logp_difference/mean": 0.20064184069633484, "step": 2987, "step_time": 4.997869302002073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7263426738791168, "epoch": 0.02988, "grad_norm": 0.027806570753455162, "kl": 0.7369997650384903, "learning_rate": 9.995967745799762e-06, "loss": -0.0869, "step": 2988, "step_time": 2.0620239240088267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 98.375, "completions/mean_terminated_length": 98.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2180509101599455, "epoch": 0.02989, "frac_reward_zero_std": 0.75, "grad_norm": 0.025848058983683586, "kl": 0.5146409571170807, "learning_rate": 9.99596501394761e-06, "loss": 0.0187, "num_tokens": 25141223.0, "reward": 0.7252885103225708, "reward_std": 0.004079463891685009, "rewards/rollout_reward_func/mean": 0.7252885103225708, "rewards/rollout_reward_func/std": 0.28677618503570557, "sampling/importance_sampling_ratio/max": 1.0002055168151855, "sampling/importance_sampling_ratio/mean": 0.96578049659729, "sampling/importance_sampling_ratio/min": 0.05836332589387894, "sampling/sampling_logp_difference/max": 1.3220326900482178, "sampling/sampling_logp_difference/mean": 0.02291806973516941, "step": 2989, "step_time": 4.4633168849977665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21606227941811085, "epoch": 0.0299, "grad_norm": 0.02556038647890091, "kl": 0.5161171369254589, "learning_rate": 9.995962281170854e-06, "loss": 0.0187, "step": 2990, "step_time": 2.027595333987847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 176.84375, "completions/mean_terminated_length": 176.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5960985952988267, "epoch": 0.02991, "frac_reward_zero_std": 0.5, "grad_norm": 0.01210480835288763, "kl": 0.4483649767935276, "learning_rate": 9.995959547469496e-06, "loss": -0.008, "num_tokens": 25157618.0, "reward": 0.7185769081115723, "reward_std": 0.04106659069657326, "rewards/rollout_reward_func/mean": 0.7185769081115723, "rewards/rollout_reward_func/std": 0.2589940130710602, "sampling/importance_sampling_ratio/max": 0.9980749487876892, "sampling/importance_sampling_ratio/mean": 0.930768609046936, "sampling/importance_sampling_ratio/min": 1.524156232335372e-06, "sampling/sampling_logp_difference/max": 1.859921932220459, "sampling/sampling_logp_difference/mean": 0.11115247756242752, "step": 2991, "step_time": 4.6219110260062735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5966026382520795, "epoch": 0.02992, "grad_norm": 0.01199529878795147, "kl": 0.44772813841700554, "learning_rate": 9.995956812843535e-06, "loss": -0.008, "step": 2992, "step_time": 2.0882628099934664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.17513136006891727, "epoch": 0.02993, "frac_reward_zero_std": 0.5, "grad_norm": 0.032381583005189896, "kl": 0.6280642114579678, "learning_rate": 9.995954077292974e-06, "loss": -0.0578, "num_tokens": 25174610.0, "reward": 0.5850192308425903, "reward_std": 0.2651650309562683, "rewards/rollout_reward_func/mean": 0.5850192308425903, "rewards/rollout_reward_func/std": 0.5745700001716614, "sampling/importance_sampling_ratio/max": 1.0008445978164673, "sampling/importance_sampling_ratio/mean": 0.9415699243545532, "sampling/importance_sampling_ratio/min": 0.07937812805175781, "sampling/sampling_logp_difference/max": 2.593925714492798, "sampling/sampling_logp_difference/mean": 0.025244172662496567, "step": 2993, "step_time": 4.762847926001996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.17530005797743797, "epoch": 0.02994, "grad_norm": 0.03387080505490303, "kl": 0.6253192611038685, "learning_rate": 9.995951340817811e-06, "loss": -0.0578, "step": 2994, "step_time": 2.542954708005709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.07938738819211721, "epoch": 0.02995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007052633445709944, "kl": 0.4673219658434391, "learning_rate": 9.995948603418049e-06, "loss": 0.0014, "num_tokens": 25189978.0, "reward": 0.7247307300567627, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7247307300567627, "rewards/rollout_reward_func/std": 0.3464825749397278, "sampling/importance_sampling_ratio/max": 1.002349615097046, "sampling/importance_sampling_ratio/mean": 0.9965851902961731, "sampling/importance_sampling_ratio/min": 0.9900107383728027, "sampling/sampling_logp_difference/max": 0.0060378313064575195, "sampling/sampling_logp_difference/mean": 0.0008886740542948246, "step": 2995, "step_time": 3.912027958998806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07875638455152512, "epoch": 0.02996, "grad_norm": 0.0006899376166984439, "kl": 0.4674328900873661, "learning_rate": 9.995945865093687e-06, "loss": 0.0014, "step": 2996, "step_time": 2.0172802270026295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.9375, "completions/mean_terminated_length": 216.9375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.17726468155160546, "epoch": 0.02997, "frac_reward_zero_std": 0.75, "grad_norm": 0.006929398979991674, "kl": 0.42341379076242447, "learning_rate": 9.995943125844726e-06, "loss": -0.0365, "num_tokens": 25207744.0, "reward": 0.9335817098617554, "reward_std": 0.023157751187682152, "rewards/rollout_reward_func/mean": 0.9335817098617554, "rewards/rollout_reward_func/std": 0.31670188903808594, "sampling/importance_sampling_ratio/max": 0.9981062412261963, "sampling/importance_sampling_ratio/mean": 0.9632854461669922, "sampling/importance_sampling_ratio/min": 0.004160585347563028, "sampling/sampling_logp_difference/max": 2.132887601852417, "sampling/sampling_logp_difference/mean": 0.025942588225007057, "step": 2997, "step_time": 4.42587186498713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1765306848101318, "epoch": 0.02998, "grad_norm": 0.006826794706285, "kl": 0.4224540889263153, "learning_rate": 9.995940385671167e-06, "loss": -0.0365, "step": 2998, "step_time": 2.0225092659820803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.05698623089119792, "epoch": 0.02999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005290059489198029, "kl": 0.3947179727256298, "learning_rate": 9.995937644573011e-06, "loss": 0.0015, "num_tokens": 25224560.0, "reward": 0.7434999942779541, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7434999942779541, "rewards/rollout_reward_func/std": 0.4455370306968689, "sampling/importance_sampling_ratio/max": 0.9997155666351318, "sampling/importance_sampling_ratio/mean": 0.9955642223358154, "sampling/importance_sampling_ratio/min": 0.9914910197257996, "sampling/sampling_logp_difference/max": 0.005432751029729843, "sampling/sampling_logp_difference/mean": 0.0008562029106542468, "step": 2999, "step_time": 5.021049647999462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05587652325630188, "epoch": 0.03, "grad_norm": 0.0005189777002669871, "kl": 0.3949177861213684, "learning_rate": 9.99593490255026e-06, "loss": 0.0015, "step": 3000, "step_time": 2.525653793985839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3548110295087099, "epoch": 0.03001, "frac_reward_zero_std": 0.5, "grad_norm": 0.13895849883556366, "kl": 0.5533613413572311, "learning_rate": 9.995932159602911e-06, "loss": 0.0041, "num_tokens": 25240286.0, "reward": 0.8594903945922852, "reward_std": 0.07139058411121368, "rewards/rollout_reward_func/mean": 0.8594903945922852, "rewards/rollout_reward_func/std": 0.38778361678123474, "sampling/importance_sampling_ratio/max": 0.9987826347351074, "sampling/importance_sampling_ratio/mean": 0.9377094507217407, "sampling/importance_sampling_ratio/min": 0.006501227617263794, "sampling/sampling_logp_difference/max": 2.5354483127593994, "sampling/sampling_logp_difference/mean": 0.05083619803190231, "step": 3001, "step_time": 3.7801036440068856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34275904670357704, "epoch": 0.03002, "grad_norm": 0.16832540929317474, "kl": 0.5484061650931835, "learning_rate": 9.995929415730967e-06, "loss": 0.0036, "step": 3002, "step_time": 2.032155155982764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 97.5, "completions/mean_terminated_length": 97.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9742588410153985, "epoch": 0.03003, "frac_reward_zero_std": 0.25, "grad_norm": 0.029731204733252525, "kl": 0.6716460697352886, "learning_rate": 9.995926670934428e-06, "loss": -0.0625, "num_tokens": 25254174.0, "reward": 0.7565865516662598, "reward_std": 0.0965944230556488, "rewards/rollout_reward_func/mean": 0.7565865516662598, "rewards/rollout_reward_func/std": 0.2368556261062622, "sampling/importance_sampling_ratio/max": 0.9976874589920044, "sampling/importance_sampling_ratio/mean": 0.8715401887893677, "sampling/importance_sampling_ratio/min": 0.002678263932466507, "sampling/sampling_logp_difference/max": 1.5247383117675781, "sampling/sampling_logp_difference/mean": 0.1335826963186264, "step": 3003, "step_time": 3.837153998996655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9583954205736518, "epoch": 0.03004, "grad_norm": 0.03019564226269722, "kl": 0.6715594381093979, "learning_rate": 9.995923925213294e-06, "loss": -0.0625, "step": 3004, "step_time": 2.514114513011009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.30811667907983065, "epoch": 0.03005, "frac_reward_zero_std": 0.75, "grad_norm": 0.010316365398466587, "kl": 0.6026118583977222, "learning_rate": 9.995921178567569e-06, "loss": 0.0196, "num_tokens": 25268914.0, "reward": 0.8166057467460632, "reward_std": 0.00040794836240820587, "rewards/rollout_reward_func/mean": 0.8166057467460632, "rewards/rollout_reward_func/std": 0.377151221036911, "sampling/importance_sampling_ratio/max": 0.9993977546691895, "sampling/importance_sampling_ratio/mean": 0.9645342230796814, "sampling/importance_sampling_ratio/min": 0.0014712376287207007, "sampling/sampling_logp_difference/max": 2.7153332233428955, "sampling/sampling_logp_difference/mean": 0.05284735932946205, "step": 3005, "step_time": 4.0373424970093765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30606468115001917, "epoch": 0.03006, "grad_norm": 0.010445020161569118, "kl": 0.6024174652993679, "learning_rate": 9.995918430997251e-06, "loss": 0.0196, "step": 3006, "step_time": 2.5531494500028202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 220.29031372070312, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.39545657951384783, "epoch": 0.03007, "frac_reward_zero_std": 0.75, "grad_norm": 0.004478528164327145, "kl": 0.42425011470913887, "learning_rate": 9.99591568250234e-06, "loss": -0.027, "num_tokens": 25286776.0, "reward": 0.8351826667785645, "reward_std": 0.01699776202440262, "rewards/rollout_reward_func/mean": 0.8351826667785645, "rewards/rollout_reward_func/std": 0.33205175399780273, "sampling/importance_sampling_ratio/max": 0.9989212155342102, "sampling/importance_sampling_ratio/mean": 0.9634483456611633, "sampling/importance_sampling_ratio/min": 3.9127413802013563e-16, "sampling/sampling_logp_difference/max": 3.0237975120544434, "sampling/sampling_logp_difference/mean": 0.1620674431324005, "step": 3007, "step_time": 4.451948623005592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.39589399844408035, "epoch": 0.03008, "grad_norm": 0.004067053087055683, "kl": 0.4224816933274269, "learning_rate": 9.995912933082838e-06, "loss": -0.0271, "step": 3008, "step_time": 2.0459110099909594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.099370245821774, "epoch": 0.03009, "frac_reward_zero_std": 0.75, "grad_norm": 1.1407796144485474, "kl": 0.428056962788105, "learning_rate": 9.995910182738747e-06, "loss": -0.0367, "num_tokens": 25302058.0, "reward": 0.5558654069900513, "reward_std": 0.005971723236143589, "rewards/rollout_reward_func/mean": 0.5558654069900513, "rewards/rollout_reward_func/std": 0.08038315922021866, "sampling/importance_sampling_ratio/max": 1.2695621252059937, "sampling/importance_sampling_ratio/mean": 0.9447164535522461, "sampling/importance_sampling_ratio/min": 0.4648728668689728, "sampling/sampling_logp_difference/max": 0.773231029510498, "sampling/sampling_logp_difference/mean": 0.02389669604599476, "step": 3009, "step_time": 3.988709644996561 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.06250000093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06770833441987634, "entropy": 0.3049332033842802, "epoch": 0.0301, "grad_norm": 0.013913297094404697, "kl": 0.5043050572276115, "learning_rate": 9.995907431470065e-06, "loss": -0.0389, "step": 3010, "step_time": 2.4871975319838384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.22374064847826958, "epoch": 0.03011, "frac_reward_zero_std": 0.75, "grad_norm": 0.024861810728907585, "kl": 0.5707359574735165, "learning_rate": 9.995904679276794e-06, "loss": 0.0291, "num_tokens": 25317122.0, "reward": 0.5693268775939941, "reward_std": 0.006799104157835245, "rewards/rollout_reward_func/mean": 0.5693268775939941, "rewards/rollout_reward_func/std": 0.21122242510318756, "sampling/importance_sampling_ratio/max": 0.9994765520095825, "sampling/importance_sampling_ratio/mean": 0.9599560499191284, "sampling/importance_sampling_ratio/min": 0.05071617662906647, "sampling/sampling_logp_difference/max": 1.1298894882202148, "sampling/sampling_logp_difference/mean": 0.02081933803856373, "step": 3011, "step_time": 4.32413526000164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2198917744681239, "epoch": 0.03012, "grad_norm": 0.025109097361564636, "kl": 0.5703265592455864, "learning_rate": 9.995901926158935e-06, "loss": 0.029, "step": 3012, "step_time": 1.9790109290115652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 243.0625, "completions/mean_terminated_length": 243.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.29639696329832077, "epoch": 0.03013, "frac_reward_zero_std": 0.75, "grad_norm": 0.01919793337583542, "kl": 0.4352186843752861, "learning_rate": 9.995899172116488e-06, "loss": -0.0253, "num_tokens": 25335636.0, "reward": 1.1259520053863525, "reward_std": 0.01590990275144577, "rewards/rollout_reward_func/mean": 1.1259520053863525, "rewards/rollout_reward_func/std": 0.29991698265075684, "sampling/importance_sampling_ratio/max": 0.9974681735038757, "sampling/importance_sampling_ratio/mean": 0.9359995126724243, "sampling/importance_sampling_ratio/min": 0.04787149280309677, "sampling/sampling_logp_difference/max": 1.07014799118042, "sampling/sampling_logp_difference/mean": 0.025412920862436295, "step": 3013, "step_time": 4.541141635017993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29362716199830174, "epoch": 0.03014, "grad_norm": 0.01938115991652012, "kl": 0.4495830535888672, "learning_rate": 9.995896417149453e-06, "loss": -0.0253, "step": 3014, "step_time": 2.0741945690097054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 139.78125, "completions/mean_terminated_length": 139.78125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5891679981723428, "epoch": 0.03015, "frac_reward_zero_std": 0.5, "grad_norm": 0.6672667860984802, "kl": 0.5306020379066467, "learning_rate": 9.995893661257832e-06, "loss": -0.0382, "num_tokens": 25350877.0, "reward": 0.7584375143051147, "reward_std": 0.1148798018693924, "rewards/rollout_reward_func/mean": 0.7584375143051147, "rewards/rollout_reward_func/std": 0.19774296879768372, "sampling/importance_sampling_ratio/max": 1.0240670442581177, "sampling/importance_sampling_ratio/mean": 0.9292001724243164, "sampling/importance_sampling_ratio/min": 0.0009343473939225078, "sampling/sampling_logp_difference/max": 2.114220142364502, "sampling/sampling_logp_difference/mean": 0.08535543829202652, "step": 3015, "step_time": 4.0388282670101034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 0.6307276245206594, "epoch": 0.03016, "grad_norm": 0.15997904539108276, "kl": 0.5482916980981827, "learning_rate": 9.995890904441625e-06, "loss": -0.0423, "step": 3016, "step_time": 2.4790808119942085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 129.40625, "completions/mean_terminated_length": 129.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7469677589833736, "epoch": 0.03017, "frac_reward_zero_std": 0.5, "grad_norm": 0.08687583357095718, "kl": 0.5846377946436405, "learning_rate": 9.995888146700834e-06, "loss": -0.0188, "num_tokens": 25365842.0, "reward": 0.819658637046814, "reward_std": 0.052260272204875946, "rewards/rollout_reward_func/mean": 0.819658637046814, "rewards/rollout_reward_func/std": 0.08669565618038177, "sampling/importance_sampling_ratio/max": 0.9979380965232849, "sampling/importance_sampling_ratio/mean": 0.9083278775215149, "sampling/importance_sampling_ratio/min": 0.00019887162488885224, "sampling/sampling_logp_difference/max": 3.3912410736083984, "sampling/sampling_logp_difference/mean": 0.12921015918254852, "step": 3017, "step_time": 4.764867129000777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.795129313133657, "epoch": 0.03018, "grad_norm": 0.07805339992046356, "kl": 0.606142558157444, "learning_rate": 9.995885388035457e-06, "loss": -0.0179, "step": 3018, "step_time": 2.0718607339877053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 149.53125, "completions/mean_terminated_length": 149.53125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.13019747799262404, "epoch": 0.03019, "frac_reward_zero_std": 0.75, "grad_norm": 0.07917750626802444, "kl": 0.40638617426157, "learning_rate": 9.995882628445497e-06, "loss": -0.0126, "num_tokens": 25381451.0, "reward": 1.0450000762939453, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 1.0450000762939453, "rewards/rollout_reward_func/std": 0.15587939321994781, "sampling/importance_sampling_ratio/max": 0.9987622499465942, "sampling/importance_sampling_ratio/mean": 0.9730075597763062, "sampling/importance_sampling_ratio/min": 0.27007120847702026, "sampling/sampling_logp_difference/max": 0.6833903193473816, "sampling/sampling_logp_difference/mean": 0.009028206579387188, "step": 3019, "step_time": 3.9518922670031316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13931834883987904, "epoch": 0.0302, "grad_norm": 0.06727343052625656, "kl": 0.41339709237217903, "learning_rate": 9.995879867930954e-06, "loss": -0.0133, "step": 3020, "step_time": 2.028627563988266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.1930821742862463, "epoch": 0.03021, "frac_reward_zero_std": 1.0, "grad_norm": 0.005759666208177805, "kl": 0.47865522652864456, "learning_rate": 9.995877106491827e-06, "loss": 0.0016, "num_tokens": 25397651.0, "reward": 0.566653847694397, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.566653847694397, "rewards/rollout_reward_func/std": 0.14020095765590668, "sampling/importance_sampling_ratio/max": 1.0030475854873657, "sampling/importance_sampling_ratio/mean": 0.9653110504150391, "sampling/importance_sampling_ratio/min": 0.03050575964152813, "sampling/sampling_logp_difference/max": 1.2526650428771973, "sampling/sampling_logp_difference/mean": 0.017590954899787903, "step": 3021, "step_time": 4.258644805988297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.19315412594005466, "epoch": 0.03022, "grad_norm": 0.0058071003295481205, "kl": 0.4798170253634453, "learning_rate": 9.99587434412812e-06, "loss": 0.0016, "step": 3022, "step_time": 2.3956683769938536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 189.84375, "completions/mean_terminated_length": 189.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3257860029116273, "epoch": 0.03023, "frac_reward_zero_std": 0.75, "grad_norm": 0.005480833351612091, "kl": 0.5650601647794247, "learning_rate": 9.995871580839832e-06, "loss": -0.0364, "num_tokens": 25414494.0, "reward": 0.8088028430938721, "reward_std": 0.21040505170822144, "rewards/rollout_reward_func/mean": 0.8088028430938721, "rewards/rollout_reward_func/std": 0.4587382674217224, "sampling/importance_sampling_ratio/max": 0.9996266961097717, "sampling/importance_sampling_ratio/mean": 0.9642398357391357, "sampling/importance_sampling_ratio/min": 0.0006591504788957536, "sampling/sampling_logp_difference/max": 1.9463800191879272, "sampling/sampling_logp_difference/mean": 0.04108564928174019, "step": 3023, "step_time": 4.716851761993894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3235463388264179, "epoch": 0.03024, "grad_norm": 0.0053560263477265835, "kl": 0.5643330179154873, "learning_rate": 9.995868816626964e-06, "loss": -0.0365, "step": 3024, "step_time": 2.0565805419973913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.051717455964535475, "epoch": 0.03025, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044937123311683536, "kl": 0.4849614389240742, "learning_rate": 9.995866051489514e-06, "loss": 0.0018, "num_tokens": 25431510.0, "reward": 0.6554999947547913, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6554999947547913, "rewards/rollout_reward_func/std": 0.1527237594127655, "sampling/importance_sampling_ratio/max": 0.9998233914375305, "sampling/importance_sampling_ratio/mean": 0.9951950311660767, "sampling/importance_sampling_ratio/min": 0.9903878569602966, "sampling/sampling_logp_difference/max": 0.006110787391662598, "sampling/sampling_logp_difference/mean": 0.0008963979780673981, "step": 3025, "step_time": 4.293557000994042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05025319615378976, "epoch": 0.03026, "grad_norm": 0.00043462251778692007, "kl": 0.4852118082344532, "learning_rate": 9.995863285427487e-06, "loss": 0.0018, "step": 3026, "step_time": 2.0419287869954132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 208.65625, "completions/mean_terminated_length": 208.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3215877693146467, "epoch": 0.03027, "frac_reward_zero_std": 0.5, "grad_norm": 0.009036735631525517, "kl": 0.4729586951434612, "learning_rate": 9.995860518440882e-06, "loss": -0.055, "num_tokens": 25448867.0, "reward": 0.7539134621620178, "reward_std": 0.18615946173667908, "rewards/rollout_reward_func/mean": 0.7539134621620178, "rewards/rollout_reward_func/std": 0.4352615475654602, "sampling/importance_sampling_ratio/max": 1.000567078590393, "sampling/importance_sampling_ratio/mean": 0.9353194832801819, "sampling/importance_sampling_ratio/min": 0.005174309015274048, "sampling/sampling_logp_difference/max": 2.0468673706054688, "sampling/sampling_logp_difference/mean": 0.04490319639444351, "step": 3027, "step_time": 4.365432999009499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3208410325460136, "epoch": 0.03028, "grad_norm": 0.008800092153251171, "kl": 0.47573839128017426, "learning_rate": 9.995857750529699e-06, "loss": -0.055, "step": 3028, "step_time": 2.5014635910192737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3093058201484382, "epoch": 0.03029, "frac_reward_zero_std": 0.75, "grad_norm": 0.005445954389870167, "kl": 0.4984905980527401, "learning_rate": 9.99585498169394e-06, "loss": -0.027, "num_tokens": 25465421.0, "reward": 0.5313547849655151, "reward_std": 0.040770333260297775, "rewards/rollout_reward_func/mean": 0.5313547849655151, "rewards/rollout_reward_func/std": 0.12101948261260986, "sampling/importance_sampling_ratio/max": 0.9990329742431641, "sampling/importance_sampling_ratio/mean": 0.9648118019104004, "sampling/importance_sampling_ratio/min": 9.893165952234995e-06, "sampling/sampling_logp_difference/max": 1.1564021110534668, "sampling/sampling_logp_difference/mean": 0.05529928579926491, "step": 3029, "step_time": 5.315108842987684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.310790971852839, "epoch": 0.0303, "grad_norm": 0.005199955310672522, "kl": 0.49559737741947174, "learning_rate": 9.995852211933602e-06, "loss": -0.027, "step": 3030, "step_time": 2.2021531369973673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 217.21875, "completions/mean_terminated_length": 217.21875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2967425277456641, "epoch": 0.03031, "frac_reward_zero_std": 0.75, "grad_norm": 0.004000048153102398, "kl": 0.5300623401999474, "learning_rate": 9.99584944124869e-06, "loss": -0.0363, "num_tokens": 25483052.0, "reward": 0.814350962638855, "reward_std": 0.009559537284076214, "rewards/rollout_reward_func/mean": 0.814350962638855, "rewards/rollout_reward_func/std": 0.36814484000205994, "sampling/importance_sampling_ratio/max": 0.9974802136421204, "sampling/importance_sampling_ratio/mean": 0.9643127918243408, "sampling/importance_sampling_ratio/min": 0.00036097460542805493, "sampling/sampling_logp_difference/max": 2.0249297618865967, "sampling/sampling_logp_difference/mean": 0.042000263929367065, "step": 3031, "step_time": 4.4277988340109005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2992898253723979, "epoch": 0.03032, "grad_norm": 0.0038946829736232758, "kl": 0.5303429067134857, "learning_rate": 9.995846669639204e-06, "loss": -0.0363, "step": 3032, "step_time": 2.0460691829939606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 237.5, "completions/mean_terminated_length": 237.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.040026386734098196, "epoch": 0.03033, "frac_reward_zero_std": 1.0, "grad_norm": 0.00040363662992604077, "kl": 0.4411285072565079, "learning_rate": 9.995843897105142e-06, "loss": 0.0019, "num_tokens": 25501276.0, "reward": 0.5498461723327637, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5498461723327637, "rewards/rollout_reward_func/std": 0.18464109301567078, "sampling/importance_sampling_ratio/max": 0.998820960521698, "sampling/importance_sampling_ratio/mean": 0.9958406686782837, "sampling/importance_sampling_ratio/min": 0.9933168888092041, "sampling/sampling_logp_difference/max": 0.002454785630106926, "sampling/sampling_logp_difference/mean": 0.0006290702149271965, "step": 3033, "step_time": 5.107878384987998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.039335144218057394, "epoch": 0.03034, "grad_norm": 0.00040281706606037915, "kl": 0.44122904539108276, "learning_rate": 9.995841123646507e-06, "loss": 0.0019, "step": 3034, "step_time": 2.001737555998261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 164.21875, "completions/mean_terminated_length": 164.21875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3291316572576761, "epoch": 0.03035, "frac_reward_zero_std": 0.5, "grad_norm": 0.01934545859694481, "kl": 0.749883558601141, "learning_rate": 9.995838349263301e-06, "loss": -0.0538, "num_tokens": 25517299.0, "reward": 0.6953413486480713, "reward_std": 0.15601223707199097, "rewards/rollout_reward_func/mean": 0.6953413486480713, "rewards/rollout_reward_func/std": 0.43465203046798706, "sampling/importance_sampling_ratio/max": 1.0002460479736328, "sampling/importance_sampling_ratio/mean": 0.9370059967041016, "sampling/importance_sampling_ratio/min": 0.0008060003165155649, "sampling/sampling_logp_difference/max": 2.5641164779663086, "sampling/sampling_logp_difference/mean": 0.058788079768419266, "step": 3035, "step_time": 4.635024838011304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3334226254373789, "epoch": 0.03036, "grad_norm": 0.018613548949360847, "kl": 0.7273551113903522, "learning_rate": 9.995835573955521e-06, "loss": -0.0538, "step": 3036, "step_time": 2.00717189401621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 144.53125, "completions/mean_terminated_length": 144.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.37453212309628725, "epoch": 0.03037, "frac_reward_zero_std": 0.75, "grad_norm": 0.009838360361754894, "kl": 0.5621496140956879, "learning_rate": 9.99583279772317e-06, "loss": 0.0306, "num_tokens": 25532692.0, "reward": 0.6637019515037537, "reward_std": 0.01808561570942402, "rewards/rollout_reward_func/mean": 0.6637019515037537, "rewards/rollout_reward_func/std": 0.31185418367385864, "sampling/importance_sampling_ratio/max": 0.9986042380332947, "sampling/importance_sampling_ratio/mean": 0.9639843702316284, "sampling/importance_sampling_ratio/min": 9.084562435646149e-09, "sampling/sampling_logp_difference/max": 3.594909191131592, "sampling/sampling_logp_difference/mean": 0.09575295448303223, "step": 3037, "step_time": 4.097979189005855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3758276770822704, "epoch": 0.03038, "grad_norm": 0.010904333554208279, "kl": 0.5661860592663288, "learning_rate": 9.995830020566248e-06, "loss": 0.0306, "step": 3038, "step_time": 2.0136975700079347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.10710044670850039, "epoch": 0.03039, "frac_reward_zero_std": 0.75, "grad_norm": 0.10818815231323242, "kl": 0.5994399636983871, "learning_rate": 9.995827242484757e-06, "loss": -0.0127, "num_tokens": 25549292.0, "reward": 0.8995577096939087, "reward_std": 0.04487408697605133, "rewards/rollout_reward_func/mean": 0.8995577096939087, "rewards/rollout_reward_func/std": 0.4340728223323822, "sampling/importance_sampling_ratio/max": 0.9995691180229187, "sampling/importance_sampling_ratio/mean": 0.9727908372879028, "sampling/importance_sampling_ratio/min": 0.24407656490802765, "sampling/sampling_logp_difference/max": 1.6563090085983276, "sampling/sampling_logp_difference/mean": 0.012642034329473972, "step": 3039, "step_time": 4.7706293989904225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10823816456831992, "epoch": 0.0304, "grad_norm": 0.07337956875562668, "kl": 0.6380797289311886, "learning_rate": 9.995824463478696e-06, "loss": -0.0129, "step": 3040, "step_time": 2.0374761760103866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.04173919511958957, "epoch": 0.03041, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003614248416852206, "kl": 0.4298769198358059, "learning_rate": 9.995821683548066e-06, "loss": 0.0016, "num_tokens": 25566372.0, "reward": 0.561923086643219, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.561923086643219, "rewards/rollout_reward_func/std": 0.1886357218027115, "sampling/importance_sampling_ratio/max": 0.9995893836021423, "sampling/importance_sampling_ratio/mean": 0.9959360361099243, "sampling/importance_sampling_ratio/min": 0.9922981858253479, "sampling/sampling_logp_difference/max": 0.0048448555171489716, "sampling/sampling_logp_difference/mean": 0.0007584482664242387, "step": 3041, "step_time": 4.757969982994837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.042081426829099655, "epoch": 0.03042, "grad_norm": 0.00036828755401074886, "kl": 0.4298366568982601, "learning_rate": 9.995818902692869e-06, "loss": 0.0016, "step": 3042, "step_time": 1.931122747999325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6661376231350005, "epoch": 0.03043, "frac_reward_zero_std": 0.5, "grad_norm": 0.02558411844074726, "kl": 0.49935332685709, "learning_rate": 9.995816120913102e-06, "loss": -0.007, "num_tokens": 25583684.0, "reward": 0.5838221311569214, "reward_std": 0.007519809529185295, "rewards/rollout_reward_func/mean": 0.5838221311569214, "rewards/rollout_reward_func/std": 0.24918632209300995, "sampling/importance_sampling_ratio/max": 0.9986716508865356, "sampling/importance_sampling_ratio/mean": 0.9332133531570435, "sampling/importance_sampling_ratio/min": 1.0538285027332072e-14, "sampling/sampling_logp_difference/max": 3.962881565093994, "sampling/sampling_logp_difference/mean": 0.19197824597358704, "step": 3043, "step_time": 4.88032544998714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6691113491542637, "epoch": 0.03044, "grad_norm": 0.025414733216166496, "kl": 0.49657850340008736, "learning_rate": 9.99581333820877e-06, "loss": -0.007, "step": 3044, "step_time": 2.154311569989659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.04530538013204932, "epoch": 0.03045, "frac_reward_zero_std": 1.0, "grad_norm": 0.00034954515285789967, "kl": 0.5188087597489357, "learning_rate": 9.995810554579873e-06, "loss": 0.0016, "num_tokens": 25598764.0, "reward": 0.6430768966674805, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.6430768966674805, "rewards/rollout_reward_func/std": 0.14914503693580627, "sampling/importance_sampling_ratio/max": 0.998733639717102, "sampling/importance_sampling_ratio/mean": 0.9955515265464783, "sampling/importance_sampling_ratio/min": 0.9922847151756287, "sampling/sampling_logp_difference/max": 0.006142761558294296, "sampling/sampling_logp_difference/mean": 0.0009410380735062063, "step": 3045, "step_time": 4.727902522994555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04652067180722952, "epoch": 0.03046, "grad_norm": 0.000361580983735621, "kl": 0.5186159461736679, "learning_rate": 9.995807770026411e-06, "loss": 0.0016, "step": 3046, "step_time": 2.0259330310072983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 124.53125, "completions/mean_terminated_length": 124.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4046818702481687, "epoch": 0.03047, "frac_reward_zero_std": 0.75, "grad_norm": 0.003117004409432411, "kl": 0.6692830175161362, "learning_rate": 9.995804984548383e-06, "loss": -0.027, "num_tokens": 25613461.0, "reward": 0.5663942098617554, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.5663942098617554, "rewards/rollout_reward_func/std": 0.22316300868988037, "sampling/importance_sampling_ratio/max": 0.997534453868866, "sampling/importance_sampling_ratio/mean": 0.9641492962837219, "sampling/importance_sampling_ratio/min": 5.143544331076555e-05, "sampling/sampling_logp_difference/max": 1.9374375343322754, "sampling/sampling_logp_difference/mean": 0.07545716315507889, "step": 3047, "step_time": 4.791122546004772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4045561421662569, "epoch": 0.03048, "grad_norm": 0.0032004080712795258, "kl": 0.6641379967331886, "learning_rate": 9.99580219814579e-06, "loss": -0.027, "step": 3048, "step_time": 2.103989294002531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 1.0634294883348048, "epoch": 0.03049, "frac_reward_zero_std": 0.25, "grad_norm": 0.012709872797131538, "kl": 0.504638247191906, "learning_rate": 9.995799410818636e-06, "loss": 0.0012, "num_tokens": 25629563.0, "reward": 0.4303365647792816, "reward_std": 0.0316864550113678, "rewards/rollout_reward_func/mean": 0.4303365647792816, "rewards/rollout_reward_func/std": 0.12697742879390717, "sampling/importance_sampling_ratio/max": 0.9990039467811584, "sampling/importance_sampling_ratio/mean": 0.870367705821991, "sampling/importance_sampling_ratio/min": 1.7676780207086747e-17, "sampling/sampling_logp_difference/max": 3.387673854827881, "sampling/sampling_logp_difference/mean": 0.28928789496421814, "step": 3049, "step_time": 4.382493458993849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0682228412479162, "epoch": 0.0305, "grad_norm": 0.0120106041431427, "kl": 0.4988491013646126, "learning_rate": 9.995796622566918e-06, "loss": 0.0012, "step": 3050, "step_time": 2.0837910430054762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 124.3125, "completions/mean_terminated_length": 124.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12542463932186365, "epoch": 0.03051, "frac_reward_zero_std": 0.75, "grad_norm": 0.009896949864923954, "kl": 0.6555928811430931, "learning_rate": 9.995793833390639e-06, "loss": -0.0257, "num_tokens": 25644453.0, "reward": 0.7166346311569214, "reward_std": 0.004079465754330158, "rewards/rollout_reward_func/mean": 0.7166346311569214, "rewards/rollout_reward_func/std": 0.24650265276432037, "sampling/importance_sampling_ratio/max": 0.9984204173088074, "sampling/importance_sampling_ratio/mean": 0.9654985666275024, "sampling/importance_sampling_ratio/min": 0.06729994714260101, "sampling/sampling_logp_difference/max": 2.6955409049987793, "sampling/sampling_logp_difference/mean": 0.015688883140683174, "step": 3051, "step_time": 4.493490484994254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1256171721033752, "epoch": 0.03052, "grad_norm": 0.010649423114955425, "kl": 0.6630744263529778, "learning_rate": 9.9957910432898e-06, "loss": -0.0257, "step": 3052, "step_time": 1.9992594509894843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 146.1875, "completions/mean_terminated_length": 146.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.211422068066895, "epoch": 0.03053, "frac_reward_zero_std": 0.75, "grad_norm": 0.04778030514717102, "kl": 0.8296538144350052, "learning_rate": 9.995788252264398e-06, "loss": 0.0312, "num_tokens": 25659955.0, "reward": 0.5647596120834351, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.5647596120834351, "rewards/rollout_reward_func/std": 0.19179955124855042, "sampling/importance_sampling_ratio/max": 0.9990108013153076, "sampling/importance_sampling_ratio/mean": 0.964836597442627, "sampling/importance_sampling_ratio/min": 0.011489487253129482, "sampling/sampling_logp_difference/max": 1.7104711532592773, "sampling/sampling_logp_difference/mean": 0.029538745060563087, "step": 3053, "step_time": 4.63344075599889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21240056166425347, "epoch": 0.03054, "grad_norm": 0.04330802336335182, "kl": 0.7945844680070877, "learning_rate": 9.995785460314438e-06, "loss": 0.0311, "step": 3054, "step_time": 2.0877968589993543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 128.29031372070312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8330033025704324, "epoch": 0.03055, "frac_reward_zero_std": 0.25, "grad_norm": 0.01728994958102703, "kl": 0.760612778365612, "learning_rate": 9.995782667439917e-06, "loss": -0.0273, "num_tokens": 25674965.0, "reward": 0.5115384459495544, "reward_std": 0.06499943137168884, "rewards/rollout_reward_func/mean": 0.5115384459495544, "rewards/rollout_reward_func/std": 0.21160994470119476, "sampling/importance_sampling_ratio/max": 1.0100189447402954, "sampling/importance_sampling_ratio/mean": 0.9047157764434814, "sampling/importance_sampling_ratio/min": 3.172127283068704e-10, "sampling/sampling_logp_difference/max": 2.8038039207458496, "sampling/sampling_logp_difference/mean": 0.22290587425231934, "step": 3055, "step_time": 4.509321387995442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8337245672009885, "epoch": 0.03056, "grad_norm": 0.01623552478849888, "kl": 0.7453760281205177, "learning_rate": 9.99577987364084e-06, "loss": -0.0274, "step": 3056, "step_time": 2.076616036989435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 166.03125, "completions/mean_terminated_length": 166.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.21764279017224908, "epoch": 0.03057, "frac_reward_zero_std": 0.75, "grad_norm": 0.005846640095114708, "kl": 0.6207927167415619, "learning_rate": 9.995777078917205e-06, "loss": -0.0262, "num_tokens": 25690958.0, "reward": 0.6756731271743774, "reward_std": 0.016589809209108353, "rewards/rollout_reward_func/mean": 0.6756731271743774, "rewards/rollout_reward_func/std": 0.11167633533477783, "sampling/importance_sampling_ratio/max": 1.0009711980819702, "sampling/importance_sampling_ratio/mean": 0.9648759365081787, "sampling/importance_sampling_ratio/min": 0.023223094642162323, "sampling/sampling_logp_difference/max": 1.380266547203064, "sampling/sampling_logp_difference/mean": 0.023735996335744858, "step": 3057, "step_time": 4.711855796980672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22012515366077423, "epoch": 0.03058, "grad_norm": 0.0068615456111729145, "kl": 0.61436478048563, "learning_rate": 9.995774283269013e-06, "loss": -0.0262, "step": 3058, "step_time": 2.503967239987105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06800201162695885, "epoch": 0.03059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005968050681985915, "kl": 0.48232267424464226, "learning_rate": 9.995771486696262e-06, "loss": 0.0014, "num_tokens": 25706334.0, "reward": 0.878653883934021, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.878653883934021, "rewards/rollout_reward_func/std": 0.2795991003513336, "sampling/importance_sampling_ratio/max": 1.0005100965499878, "sampling/importance_sampling_ratio/mean": 0.9936630725860596, "sampling/importance_sampling_ratio/min": 0.9873546957969666, "sampling/sampling_logp_difference/max": 0.010081473737955093, "sampling/sampling_logp_difference/mean": 0.001496907090768218, "step": 3059, "step_time": 4.104780756002583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06935515720397234, "epoch": 0.0306, "grad_norm": 0.0006088243098929524, "kl": 0.48206835985183716, "learning_rate": 9.995768689198958e-06, "loss": 0.0014, "step": 3060, "step_time": 2.027297042994178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.05399515479803085, "epoch": 0.03061, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004901042557321489, "kl": 0.4256693497300148, "learning_rate": 9.995765890777097e-06, "loss": 0.0015, "num_tokens": 25722806.0, "reward": 1.0692307949066162, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 1.0692307949066162, "rewards/rollout_reward_func/std": 0.12128597497940063, "sampling/importance_sampling_ratio/max": 0.999127209186554, "sampling/importance_sampling_ratio/mean": 0.9945225715637207, "sampling/importance_sampling_ratio/min": 0.9883521795272827, "sampling/sampling_logp_difference/max": 0.01000310480594635, "sampling/sampling_logp_difference/mean": 0.001197055564261973, "step": 3061, "step_time": 4.2072863040011725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05569980153813958, "epoch": 0.03062, "grad_norm": 0.0005028591258451343, "kl": 0.4253757931292057, "learning_rate": 9.995763091430683e-06, "loss": 0.0015, "step": 3062, "step_time": 2.0286483890013187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27165821287781, "epoch": 0.03063, "frac_reward_zero_std": 0.75, "grad_norm": 0.009687607176601887, "kl": 0.430630125105381, "learning_rate": 9.995760291159716e-06, "loss": -0.0175, "num_tokens": 25739154.0, "reward": 0.9943943023681641, "reward_std": 0.00475937407463789, "rewards/rollout_reward_func/mean": 0.9943943023681641, "rewards/rollout_reward_func/std": 0.376132071018219, "sampling/importance_sampling_ratio/max": 0.9989484548568726, "sampling/importance_sampling_ratio/mean": 0.9616221785545349, "sampling/importance_sampling_ratio/min": 0.0004776104469783604, "sampling/sampling_logp_difference/max": 2.6639232635498047, "sampling/sampling_logp_difference/mean": 0.046867989003658295, "step": 3063, "step_time": 4.697450644009223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27140525449067354, "epoch": 0.03064, "grad_norm": 0.009463833644986153, "kl": 0.4314270503818989, "learning_rate": 9.995757489964197e-06, "loss": -0.0175, "step": 3064, "step_time": 2.5381597240047995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4070335258729756, "epoch": 0.03065, "frac_reward_zero_std": 0.5, "grad_norm": 0.007343706209212542, "kl": 0.8745122291147709, "learning_rate": 9.995754687844124e-06, "loss": -0.0441, "num_tokens": 25756290.0, "reward": 0.8539182543754578, "reward_std": 0.16689079999923706, "rewards/rollout_reward_func/mean": 0.8539182543754578, "rewards/rollout_reward_func/std": 0.546825647354126, "sampling/importance_sampling_ratio/max": 0.9972965121269226, "sampling/importance_sampling_ratio/mean": 0.9318445324897766, "sampling/importance_sampling_ratio/min": 4.49873368779663e-05, "sampling/sampling_logp_difference/max": 2.75761079788208, "sampling/sampling_logp_difference/mean": 0.08188385516405106, "step": 3065, "step_time": 4.46755888601183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4073830395936966, "epoch": 0.03066, "grad_norm": 0.007253551390022039, "kl": 0.8453876599669456, "learning_rate": 9.9957518847995e-06, "loss": -0.0441, "step": 3066, "step_time": 2.06445591001102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.0657556802034378, "epoch": 0.03067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005943448049947619, "kl": 0.4354745037853718, "learning_rate": 9.995749080830324e-06, "loss": 0.0015, "num_tokens": 25772530.0, "reward": 0.4976922869682312, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.4976922869682312, "rewards/rollout_reward_func/std": 0.08485732227563858, "sampling/importance_sampling_ratio/max": 0.9986909031867981, "sampling/importance_sampling_ratio/mean": 0.9948285818099976, "sampling/importance_sampling_ratio/min": 0.9890960454940796, "sampling/sampling_logp_difference/max": 0.009084129706025124, "sampling/sampling_logp_difference/mean": 0.0010655985679477453, "step": 3067, "step_time": 4.167224340017128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06606569699943066, "epoch": 0.03068, "grad_norm": 0.000592901895288378, "kl": 0.435399878770113, "learning_rate": 9.9957462759366e-06, "loss": 0.0015, "step": 3068, "step_time": 2.487376624987519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.0685382867231965, "epoch": 0.03069, "frac_reward_zero_std": 1.0, "grad_norm": 0.000757543952204287, "kl": 0.4408867359161377, "learning_rate": 9.995743470118326e-06, "loss": 0.0017, "num_tokens": 25789210.0, "reward": 0.4754999876022339, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.4754999876022339, "rewards/rollout_reward_func/std": 0.13560520112514496, "sampling/importance_sampling_ratio/max": 1.0005074739456177, "sampling/importance_sampling_ratio/mean": 0.9930851459503174, "sampling/importance_sampling_ratio/min": 0.9837740063667297, "sampling/sampling_logp_difference/max": 0.011156845837831497, "sampling/sampling_logp_difference/mean": 0.0013644504360854626, "step": 3069, "step_time": 4.123727760008478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0678169159218669, "epoch": 0.0307, "grad_norm": 0.0007414862047880888, "kl": 0.44104891270399094, "learning_rate": 9.995740663375502e-06, "loss": 0.0017, "step": 3070, "step_time": 2.4665296470047906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.1125204530544579, "epoch": 0.03071, "frac_reward_zero_std": 0.75, "grad_norm": 0.008602586574852467, "kl": 0.6050967685878277, "learning_rate": 9.995737855708131e-06, "loss": -0.025, "num_tokens": 25804954.0, "reward": 0.9696346521377563, "reward_std": 0.23796862363815308, "rewards/rollout_reward_func/mean": 0.9696346521377563, "rewards/rollout_reward_func/std": 0.5089191794395447, "sampling/importance_sampling_ratio/max": 0.998816967010498, "sampling/importance_sampling_ratio/mean": 0.9648679494857788, "sampling/importance_sampling_ratio/min": 0.06311631947755814, "sampling/sampling_logp_difference/max": 2.6935875415802, "sampling/sampling_logp_difference/mean": 0.018501795828342438, "step": 3071, "step_time": 4.014940686014597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11286062141880393, "epoch": 0.03072, "grad_norm": 0.00769229419529438, "kl": 0.6271810680627823, "learning_rate": 9.995735047116213e-06, "loss": -0.025, "step": 3072, "step_time": 2.0256281340043643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 94.875, "completions/mean_terminated_length": 94.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14921589754521847, "epoch": 0.03073, "frac_reward_zero_std": 0.75, "grad_norm": 0.016126668080687523, "kl": 0.649230495095253, "learning_rate": 9.995732237599747e-06, "loss": -0.0246, "num_tokens": 25818790.0, "reward": 0.6332211494445801, "reward_std": 0.0006799129769206047, "rewards/rollout_reward_func/mean": 0.6332211494445801, "rewards/rollout_reward_func/std": 0.13196668028831482, "sampling/importance_sampling_ratio/max": 0.9989717602729797, "sampling/importance_sampling_ratio/mean": 0.9651283025741577, "sampling/importance_sampling_ratio/min": 0.08341411501169205, "sampling/sampling_logp_difference/max": 2.478339910507202, "sampling/sampling_logp_difference/mean": 0.01760159619152546, "step": 3073, "step_time": 3.8975335669892957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14784847106784582, "epoch": 0.03074, "grad_norm": 0.014473003335297108, "kl": 0.6521258689463139, "learning_rate": 9.995729427158738e-06, "loss": -0.0247, "step": 3074, "step_time": 2.482541028992273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 143.21875, "completions/mean_terminated_length": 143.21875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4049042370170355, "epoch": 0.03075, "frac_reward_zero_std": 0.5, "grad_norm": 0.014046354219317436, "kl": 0.6268773749470711, "learning_rate": 9.99572661579318e-06, "loss": -0.0447, "num_tokens": 25834053.0, "reward": 0.8066345453262329, "reward_std": 0.17432901263237, "rewards/rollout_reward_func/mean": 0.8066345453262329, "rewards/rollout_reward_func/std": 0.3906557559967041, "sampling/importance_sampling_ratio/max": 1.0026812553405762, "sampling/importance_sampling_ratio/mean": 0.9353519678115845, "sampling/importance_sampling_ratio/min": 0.00039825262501835823, "sampling/sampling_logp_difference/max": 2.5994133949279785, "sampling/sampling_logp_difference/mean": 0.07617868483066559, "step": 3075, "step_time": 4.061512143998698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4013172839768231, "epoch": 0.03076, "grad_norm": 0.013229308649897575, "kl": 0.6266562454402447, "learning_rate": 9.995723803503078e-06, "loss": -0.0448, "step": 3076, "step_time": 2.4546672369979206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 165.34375, "completions/mean_terminated_length": 165.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.38501093769446015, "epoch": 0.03077, "frac_reward_zero_std": 0.75, "grad_norm": 0.02082110568881035, "kl": 0.5716426931321621, "learning_rate": 9.995720990288436e-06, "loss": -0.0363, "num_tokens": 25850056.0, "reward": 0.751868724822998, "reward_std": 0.010270723141729832, "rewards/rollout_reward_func/mean": 0.751868724822998, "rewards/rollout_reward_func/std": 0.2294778972864151, "sampling/importance_sampling_ratio/max": 0.9981670379638672, "sampling/importance_sampling_ratio/mean": 0.9315418004989624, "sampling/importance_sampling_ratio/min": 9.795969069280926e-15, "sampling/sampling_logp_difference/max": 18.953855514526367, "sampling/sampling_logp_difference/mean": 0.20569369196891785, "step": 3077, "step_time": 4.600059297983535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.38372579496353865, "epoch": 0.03078, "grad_norm": 0.021386418491601944, "kl": 0.5759652182459831, "learning_rate": 9.995718176149247e-06, "loss": -0.0362, "step": 3078, "step_time": 2.0800950300108525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6862024730071425, "epoch": 0.03079, "frac_reward_zero_std": 0.25, "grad_norm": 0.014720666222274303, "kl": 0.7404615953564644, "learning_rate": 9.995715361085516e-06, "loss": -0.0831, "num_tokens": 25865504.0, "reward": 0.8448606133460999, "reward_std": 0.07034352421760559, "rewards/rollout_reward_func/mean": 0.8448606133460999, "rewards/rollout_reward_func/std": 0.3517184555530548, "sampling/importance_sampling_ratio/max": 0.9991537928581238, "sampling/importance_sampling_ratio/mean": 0.9026157855987549, "sampling/importance_sampling_ratio/min": 0.0002998756826855242, "sampling/sampling_logp_difference/max": 2.826906442642212, "sampling/sampling_logp_difference/mean": 0.10636317729949951, "step": 3079, "step_time": 4.055861146996904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6815048470161855, "epoch": 0.0308, "grad_norm": 0.013262597844004631, "kl": 0.7506885454058647, "learning_rate": 9.995712545097244e-06, "loss": -0.0831, "step": 3080, "step_time": 2.4925653699974646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.04861478786915541, "epoch": 0.03081, "frac_reward_zero_std": 1.0, "grad_norm": 0.000421595002990216, "kl": 0.41101130843162537, "learning_rate": 9.995709728184433e-06, "loss": 0.0014, "num_tokens": 25881880.0, "reward": 0.892153799533844, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.892153799533844, "rewards/rollout_reward_func/std": 0.3351004421710968, "sampling/importance_sampling_ratio/max": 1.0013843774795532, "sampling/importance_sampling_ratio/mean": 0.9952229857444763, "sampling/importance_sampling_ratio/min": 0.9894667267799377, "sampling/sampling_logp_difference/max": 0.007059095427393913, "sampling/sampling_logp_difference/mean": 0.000937602948397398, "step": 3081, "step_time": 4.635179425000388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04748562537133694, "epoch": 0.03082, "grad_norm": 0.0004084294196218252, "kl": 0.4112011678516865, "learning_rate": 9.995706910347079e-06, "loss": 0.0014, "step": 3082, "step_time": 2.056347790006839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0686181839555502, "epoch": 0.03083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004421100893523544, "kl": 0.5520796477794647, "learning_rate": 9.995704091585186e-06, "loss": 0.0014, "num_tokens": 25896672.0, "reward": 0.8048076629638672, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.8048076629638672, "rewards/rollout_reward_func/std": 0.18659023940563202, "sampling/importance_sampling_ratio/max": 1.000312328338623, "sampling/importance_sampling_ratio/mean": 0.9953494071960449, "sampling/importance_sampling_ratio/min": 0.9881209135055542, "sampling/sampling_logp_difference/max": 0.008859334513545036, "sampling/sampling_logp_difference/mean": 0.0012837592512369156, "step": 3083, "step_time": 3.9236201500025345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06665123486891389, "epoch": 0.03084, "grad_norm": 0.0004194884968455881, "kl": 0.5524645149707794, "learning_rate": 9.995701271898753e-06, "loss": 0.0014, "step": 3084, "step_time": 2.01613372599968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 188.03125, "completions/mean_terminated_length": 188.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.21057278849184513, "epoch": 0.03085, "frac_reward_zero_std": 0.75, "grad_norm": 0.004129092674702406, "kl": 0.47473926842212677, "learning_rate": 9.995698451287781e-06, "loss": -0.0267, "num_tokens": 25913425.0, "reward": 0.7611057758331299, "reward_std": 0.029508108273148537, "rewards/rollout_reward_func/mean": 0.7611057758331299, "rewards/rollout_reward_func/std": 0.35944801568984985, "sampling/importance_sampling_ratio/max": 0.9990211725234985, "sampling/importance_sampling_ratio/mean": 0.9658418893814087, "sampling/importance_sampling_ratio/min": 0.02003198117017746, "sampling/sampling_logp_difference/max": 1.4922330379486084, "sampling/sampling_logp_difference/mean": 0.021528325974941254, "step": 3085, "step_time": 4.013342169993848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.21016416838392615, "epoch": 0.03086, "grad_norm": 0.004040599334985018, "kl": 0.4747994728386402, "learning_rate": 9.995695629752274e-06, "loss": -0.0267, "step": 3086, "step_time": 2.430652244001976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.10364679666236043, "epoch": 0.03087, "frac_reward_zero_std": 0.75, "grad_norm": 0.01459534652531147, "kl": 0.5266567692160606, "learning_rate": 9.995692807292228e-06, "loss": -0.0253, "num_tokens": 25929599.0, "reward": 0.8555769324302673, "reward_std": 0.15937097370624542, "rewards/rollout_reward_func/mean": 0.8555769324302673, "rewards/rollout_reward_func/std": 0.3953954577445984, "sampling/importance_sampling_ratio/max": 0.9994307160377502, "sampling/importance_sampling_ratio/mean": 0.9682379961013794, "sampling/importance_sampling_ratio/min": 0.09226997941732407, "sampling/sampling_logp_difference/max": 2.3802542686462402, "sampling/sampling_logp_difference/mean": 0.013342737220227718, "step": 3087, "step_time": 4.445403973986686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10210495488718152, "epoch": 0.03088, "grad_norm": 0.013011722825467587, "kl": 0.5313945971429348, "learning_rate": 9.995689983907647e-06, "loss": -0.0253, "step": 3088, "step_time": 1.9956499169929884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6496163276024163, "epoch": 0.03089, "frac_reward_zero_std": 0.75, "grad_norm": 0.014173217117786407, "kl": 0.6288516819477081, "learning_rate": 9.99568715959853e-06, "loss": -0.0321, "num_tokens": 25945751.0, "reward": 0.776394248008728, "reward_std": 0.02545771934092045, "rewards/rollout_reward_func/mean": 0.776394248008728, "rewards/rollout_reward_func/std": 0.2589094638824463, "sampling/importance_sampling_ratio/max": 1.0004993677139282, "sampling/importance_sampling_ratio/mean": 0.9068635106086731, "sampling/importance_sampling_ratio/min": 8.220491128085872e-12, "sampling/sampling_logp_difference/max": 16.205324172973633, "sampling/sampling_logp_difference/mean": 0.19231699407100677, "step": 3089, "step_time": 4.224571941005706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.647565514780581, "epoch": 0.0309, "grad_norm": 0.013483989983797073, "kl": 0.6149446666240692, "learning_rate": 9.99568433436488e-06, "loss": -0.0322, "step": 3090, "step_time": 2.0443021789906197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 213.15625, "completions/mean_terminated_length": 213.15625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3366304114460945, "epoch": 0.03091, "frac_reward_zero_std": 0.5, "grad_norm": 0.01620728150010109, "kl": 0.6417427211999893, "learning_rate": 9.995681508206693e-06, "loss": -0.0072, "num_tokens": 25963308.0, "reward": 0.48545193672180176, "reward_std": 0.08199719339609146, "rewards/rollout_reward_func/mean": 0.48545193672180176, "rewards/rollout_reward_func/std": 0.15543553233146667, "sampling/importance_sampling_ratio/max": 1.0000704526901245, "sampling/importance_sampling_ratio/mean": 0.9344087839126587, "sampling/importance_sampling_ratio/min": 0.00469363946467638, "sampling/sampling_logp_difference/max": 2.230377674102783, "sampling/sampling_logp_difference/mean": 0.04966241866350174, "step": 3091, "step_time": 4.833222556990222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.33685338171198964, "epoch": 0.03092, "grad_norm": 0.015127092599868774, "kl": 0.6239571906626225, "learning_rate": 9.995678681123973e-06, "loss": -0.0072, "step": 3092, "step_time": 2.0483482390045538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 141.53125, "completions/mean_terminated_length": 142.8386993408203, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.44152139313519, "epoch": 0.03093, "frac_reward_zero_std": 0.75, "grad_norm": 0.010793360881507397, "kl": 0.5470367297530174, "learning_rate": 9.995675853116722e-06, "loss": -0.027, "num_tokens": 25978549.0, "reward": 0.7100480794906616, "reward_std": 0.009110798127949238, "rewards/rollout_reward_func/mean": 0.7100480794906616, "rewards/rollout_reward_func/std": 0.31399038434028625, "sampling/importance_sampling_ratio/max": 0.9987383484840393, "sampling/importance_sampling_ratio/mean": 0.9646015763282776, "sampling/importance_sampling_ratio/min": 1.0306409043337158e-17, "sampling/sampling_logp_difference/max": 3.2747700214385986, "sampling/sampling_logp_difference/mean": 0.18203774094581604, "step": 3093, "step_time": 4.502671556016139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.44202956231310964, "epoch": 0.03094, "grad_norm": 0.010933667421340942, "kl": 0.5469958521425724, "learning_rate": 9.995673024184938e-06, "loss": -0.0269, "step": 3094, "step_time": 2.0067241720025777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.05466884886845946, "epoch": 0.03095, "frac_reward_zero_std": 1.0, "grad_norm": 0.00041056450572796166, "kl": 0.4775036722421646, "learning_rate": 9.99567019432862e-06, "loss": 0.0013, "num_tokens": 25993341.0, "reward": 0.7503845691680908, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7503845691680908, "rewards/rollout_reward_func/std": 0.23142899572849274, "sampling/importance_sampling_ratio/max": 0.9995899200439453, "sampling/importance_sampling_ratio/mean": 0.9969994425773621, "sampling/importance_sampling_ratio/min": 0.9943239092826843, "sampling/sampling_logp_difference/max": 0.0030490662902593613, "sampling/sampling_logp_difference/mean": 0.0008906646398827434, "step": 3095, "step_time": 3.9261739060020773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05469175800681114, "epoch": 0.03096, "grad_norm": 0.0004144535050727427, "kl": 0.4774910472333431, "learning_rate": 9.995667363547776e-06, "loss": 0.0013, "step": 3096, "step_time": 1.9993538679991616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 96.5, "completions/mean_terminated_length": 96.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06429558619856834, "epoch": 0.03097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004883338115178049, "kl": 0.4946225881576538, "learning_rate": 9.995664531842399e-06, "loss": 0.0011, "num_tokens": 26007253.0, "reward": 0.5334615707397461, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5334615707397461, "rewards/rollout_reward_func/std": 0.14131981134414673, "sampling/importance_sampling_ratio/max": 0.9990140795707703, "sampling/importance_sampling_ratio/mean": 0.9950207471847534, "sampling/importance_sampling_ratio/min": 0.990150511264801, "sampling/sampling_logp_difference/max": 0.007483345456421375, "sampling/sampling_logp_difference/mean": 0.0015483667375519872, "step": 3097, "step_time": 4.315945800997724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06491441838443279, "epoch": 0.03098, "grad_norm": 0.0005089168553240597, "kl": 0.4945017583668232, "learning_rate": 9.995661699212496e-06, "loss": 0.0011, "step": 3098, "step_time": 2.428701589022239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 171.65625, "completions/mean_terminated_length": 171.65625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.8174462402239442, "epoch": 0.03099, "frac_reward_zero_std": 0.5, "grad_norm": 0.008330253884196281, "kl": 0.5879816710948944, "learning_rate": 9.995658865658062e-06, "loss": -0.0592, "num_tokens": 26023458.0, "reward": 0.6518557667732239, "reward_std": 0.02822015807032585, "rewards/rollout_reward_func/mean": 0.6518557667732239, "rewards/rollout_reward_func/std": 0.29472169280052185, "sampling/importance_sampling_ratio/max": 0.9976839423179626, "sampling/importance_sampling_ratio/mean": 0.9021816253662109, "sampling/importance_sampling_ratio/min": 3.867311055520635e-20, "sampling/sampling_logp_difference/max": 4.3210625648498535, "sampling/sampling_logp_difference/mean": 0.24887752532958984, "step": 3099, "step_time": 4.364076019002823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8151022200472653, "epoch": 0.031, "grad_norm": 0.007783148903399706, "kl": 0.5832655727863312, "learning_rate": 9.9956560311791e-06, "loss": -0.0592, "step": 3100, "step_time": 2.0327261080092285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23693002806976438, "epoch": 0.03101, "frac_reward_zero_std": 0.75, "grad_norm": 0.0036593482363969088, "kl": 0.4141830764710903, "learning_rate": 9.995653195775612e-06, "loss": -0.0272, "num_tokens": 26040466.0, "reward": 0.7705962061882019, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.7705962061882019, "rewards/rollout_reward_func/std": 0.4143095314502716, "sampling/importance_sampling_ratio/max": 0.9998796582221985, "sampling/importance_sampling_ratio/mean": 0.9660383462905884, "sampling/importance_sampling_ratio/min": 0.003858179086819291, "sampling/sampling_logp_difference/max": 1.695500135421753, "sampling/sampling_logp_difference/mean": 0.030803514644503593, "step": 3101, "step_time": 4.133614701015176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24403378879651427, "epoch": 0.03102, "grad_norm": 0.0026010614819824696, "kl": 0.4090849310159683, "learning_rate": 9.995650359447596e-06, "loss": -0.0272, "step": 3102, "step_time": 2.013865890992747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.35656143305823207, "epoch": 0.03103, "frac_reward_zero_std": 0.75, "grad_norm": 0.00555227929726243, "kl": 0.6210373193025589, "learning_rate": 9.995647522195056e-06, "loss": -0.0175, "num_tokens": 26056132.0, "reward": 0.8502788543701172, "reward_std": 0.04446613788604736, "rewards/rollout_reward_func/mean": 0.8502788543701172, "rewards/rollout_reward_func/std": 0.3589107096195221, "sampling/importance_sampling_ratio/max": 0.9976868629455566, "sampling/importance_sampling_ratio/mean": 0.9634609222412109, "sampling/importance_sampling_ratio/min": 0.0002186042838729918, "sampling/sampling_logp_difference/max": 2.0741002559661865, "sampling/sampling_logp_difference/mean": 0.04605339094996452, "step": 3103, "step_time": 4.673609111989208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3554002153687179, "epoch": 0.03104, "grad_norm": 0.005190617870539427, "kl": 0.6143748611211777, "learning_rate": 9.995644684017989e-06, "loss": -0.0175, "step": 3104, "step_time": 2.486575041002652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.27943824604153633, "epoch": 0.03105, "frac_reward_zero_std": 0.75, "grad_norm": 0.00641182204708457, "kl": 0.6536837518215179, "learning_rate": 9.9956418449164e-06, "loss": -0.0266, "num_tokens": 26072176.0, "reward": 0.5757692456245422, "reward_std": 0.010878566652536392, "rewards/rollout_reward_func/mean": 0.5757692456245422, "rewards/rollout_reward_func/std": 0.13062870502471924, "sampling/importance_sampling_ratio/max": 0.9981015920639038, "sampling/importance_sampling_ratio/mean": 0.963404655456543, "sampling/importance_sampling_ratio/min": 0.007099025417119265, "sampling/sampling_logp_difference/max": 2.144662618637085, "sampling/sampling_logp_difference/mean": 0.03279856592416763, "step": 3105, "step_time": 4.332573655003216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2764962278306484, "epoch": 0.03106, "grad_norm": 0.006032382603734732, "kl": 0.648974135518074, "learning_rate": 9.995639004890287e-06, "loss": -0.0266, "step": 3106, "step_time": 2.007638263006811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 218.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.20241063134744763, "epoch": 0.03107, "frac_reward_zero_std": 0.75, "grad_norm": 0.011086365208029747, "kl": 0.5670751668512821, "learning_rate": 9.995636163939649e-06, "loss": -0.026, "num_tokens": 26089903.0, "reward": 1.0501827001571655, "reward_std": 0.015909897163510323, "rewards/rollout_reward_func/mean": 1.0501827001571655, "rewards/rollout_reward_func/std": 0.3467893898487091, "sampling/importance_sampling_ratio/max": 0.9982143640518188, "sampling/importance_sampling_ratio/mean": 0.9657984375953674, "sampling/importance_sampling_ratio/min": 0.02683020941913128, "sampling/sampling_logp_difference/max": 1.3769129514694214, "sampling/sampling_logp_difference/mean": 0.0232778862118721, "step": 3107, "step_time": 4.664880883996375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.20015323627740145, "epoch": 0.03108, "grad_norm": 0.0111664654687047, "kl": 0.5674660578370094, "learning_rate": 9.99563332206449e-06, "loss": -0.026, "step": 3108, "step_time": 2.0292776659989613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 166.03125, "completions/mean_terminated_length": 166.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.31447287648916245, "epoch": 0.03109, "frac_reward_zero_std": 0.75, "grad_norm": 0.03141593188047409, "kl": 0.5785054042935371, "learning_rate": 9.995630479264808e-06, "loss": -0.0267, "num_tokens": 26105984.0, "reward": 0.6554567217826843, "reward_std": 0.12881581485271454, "rewards/rollout_reward_func/mean": 0.6554567217826843, "rewards/rollout_reward_func/std": 0.43070635199546814, "sampling/importance_sampling_ratio/max": 1.0006539821624756, "sampling/importance_sampling_ratio/mean": 0.9177733063697815, "sampling/importance_sampling_ratio/min": 0.011629844084382057, "sampling/sampling_logp_difference/max": 2.6923186779022217, "sampling/sampling_logp_difference/mean": 0.04855884611606598, "step": 3109, "step_time": 4.51271686200198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3137693554162979, "epoch": 0.0311, "grad_norm": 0.029073525220155716, "kl": 0.5742341838777065, "learning_rate": 9.995627635540608e-06, "loss": -0.0268, "step": 3110, "step_time": 2.0039517490004073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.22192887961864471, "epoch": 0.03111, "frac_reward_zero_std": 0.75, "grad_norm": 0.018324080854654312, "kl": 0.5857999697327614, "learning_rate": 9.995624790891887e-06, "loss": 0.03, "num_tokens": 26121350.0, "reward": 0.7378365397453308, "reward_std": 0.01563793420791626, "rewards/rollout_reward_func/mean": 0.7378365397453308, "rewards/rollout_reward_func/std": 0.2775927484035492, "sampling/importance_sampling_ratio/max": 0.9996300339698792, "sampling/importance_sampling_ratio/mean": 0.965365469455719, "sampling/importance_sampling_ratio/min": 0.02485831081867218, "sampling/sampling_logp_difference/max": 1.5889122486114502, "sampling/sampling_logp_difference/mean": 0.030894285067915916, "step": 3111, "step_time": 4.618979247003153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.22356805112212896, "epoch": 0.03112, "grad_norm": 0.01833558827638626, "kl": 0.58195661008358, "learning_rate": 9.995621945318646e-06, "loss": 0.0299, "step": 3112, "step_time": 2.0179422340079327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4974488392472267, "epoch": 0.03113, "frac_reward_zero_std": 0.5, "grad_norm": 0.007371143903583288, "kl": 0.566815260797739, "learning_rate": 9.995619098820886e-06, "loss": -0.0556, "num_tokens": 26137350.0, "reward": 0.5818797945976257, "reward_std": 0.03199658542871475, "rewards/rollout_reward_func/mean": 0.5818797945976257, "rewards/rollout_reward_func/std": 0.19586151838302612, "sampling/importance_sampling_ratio/max": 0.996810257434845, "sampling/importance_sampling_ratio/mean": 0.9321324825286865, "sampling/importance_sampling_ratio/min": 0.00044639117550104856, "sampling/sampling_logp_difference/max": 1.9617053270339966, "sampling/sampling_logp_difference/mean": 0.08536220341920853, "step": 3113, "step_time": 4.445843365996552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5010868436656892, "epoch": 0.03114, "grad_norm": 0.00785991083830595, "kl": 0.5627855807542801, "learning_rate": 9.995616251398608e-06, "loss": -0.0556, "step": 3114, "step_time": 2.067713405995164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 191.96875, "completions/mean_terminated_length": 191.96875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.27647718228399754, "epoch": 0.03115, "frac_reward_zero_std": 0.75, "grad_norm": 0.003922146745026112, "kl": 0.4894736669957638, "learning_rate": 9.995613403051814e-06, "loss": -0.027, "num_tokens": 26154229.0, "reward": 0.4912499785423279, "reward_std": 0.16725794970989227, "rewards/rollout_reward_func/mean": 0.4912499785423279, "rewards/rollout_reward_func/std": 0.3368493914604187, "sampling/importance_sampling_ratio/max": 1.0004414319992065, "sampling/importance_sampling_ratio/mean": 0.9656122922897339, "sampling/importance_sampling_ratio/min": 0.0014059502864256501, "sampling/sampling_logp_difference/max": 2.0143723487854004, "sampling/sampling_logp_difference/mean": 0.0362367257475853, "step": 3115, "step_time": 4.713199431003886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.27748626889660954, "epoch": 0.03116, "grad_norm": 0.003858134848996997, "kl": 0.4890819899737835, "learning_rate": 9.995610553780502e-06, "loss": -0.027, "step": 3116, "step_time": 2.487855202984065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4167386759072542, "epoch": 0.03117, "frac_reward_zero_std": 0.75, "grad_norm": 0.007047408726066351, "kl": 0.50080406665802, "learning_rate": 9.995607703584674e-06, "loss": -0.027, "num_tokens": 26170601.0, "reward": 0.6060096025466919, "reward_std": 0.015637939795851707, "rewards/rollout_reward_func/mean": 0.6060096025466919, "rewards/rollout_reward_func/std": 0.12162189185619354, "sampling/importance_sampling_ratio/max": 0.9992517828941345, "sampling/importance_sampling_ratio/mean": 0.9636046886444092, "sampling/importance_sampling_ratio/min": 2.9303659498935986e-10, "sampling/sampling_logp_difference/max": 3.16348934173584, "sampling/sampling_logp_difference/mean": 0.1065356582403183, "step": 3117, "step_time": 4.278262799001823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.41763345059007406, "epoch": 0.03118, "grad_norm": 0.007340048439800739, "kl": 0.5003146678209305, "learning_rate": 9.995604852464331e-06, "loss": -0.027, "step": 3118, "step_time": 2.048270499013597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15678240964189172, "epoch": 0.03119, "frac_reward_zero_std": 0.75, "grad_norm": 0.04258999228477478, "kl": 0.6937395222485065, "learning_rate": 9.995602000419475e-06, "loss": 0.0182, "num_tokens": 26185385.0, "reward": 0.9078077077865601, "reward_std": 0.01359820831567049, "rewards/rollout_reward_func/mean": 0.9078077077865601, "rewards/rollout_reward_func/std": 0.26942551136016846, "sampling/importance_sampling_ratio/max": 1.0020339488983154, "sampling/importance_sampling_ratio/mean": 0.9671967625617981, "sampling/importance_sampling_ratio/min": 0.10355579853057861, "sampling/sampling_logp_difference/max": 2.023038387298584, "sampling/sampling_logp_difference/mean": 0.01582028716802597, "step": 3119, "step_time": 3.9423352109952248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.15720787830650806, "epoch": 0.0312, "grad_norm": 0.04188660532236099, "kl": 0.6822008155286312, "learning_rate": 9.995599147450103e-06, "loss": 0.0181, "step": 3120, "step_time": 2.4793560009857174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 214.59375, "completions/mean_terminated_length": 214.59375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.8128514257259667, "epoch": 0.03121, "frac_reward_zero_std": 0.5, "grad_norm": 0.014826681464910507, "kl": 0.5139042064547539, "learning_rate": 9.995596293556219e-06, "loss": -0.0667, "num_tokens": 26203052.0, "reward": 1.1491379737854004, "reward_std": 0.08292501419782639, "rewards/rollout_reward_func/mean": 1.1491379737854004, "rewards/rollout_reward_func/std": 0.20257119834423065, "sampling/importance_sampling_ratio/max": 1.0081870555877686, "sampling/importance_sampling_ratio/mean": 0.9090352058410645, "sampling/importance_sampling_ratio/min": 7.535723989987956e-27, "sampling/sampling_logp_difference/max": 16.605409622192383, "sampling/sampling_logp_difference/mean": 0.3418879806995392, "step": 3121, "step_time": 4.522365226002876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8132727486081421, "epoch": 0.03122, "grad_norm": 0.01553113479167223, "kl": 0.5088163241744041, "learning_rate": 9.995593438737822e-06, "loss": -0.0667, "step": 3122, "step_time": 2.552552093002305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.13221643399447203, "epoch": 0.03123, "frac_reward_zero_std": 0.75, "grad_norm": 0.025514807552099228, "kl": 0.47033847868442535, "learning_rate": 9.995590582994914e-06, "loss": -0.0333, "num_tokens": 26219892.0, "reward": 0.8289072513580322, "reward_std": 0.010270723141729832, "rewards/rollout_reward_func/mean": 0.8289072513580322, "rewards/rollout_reward_func/std": 0.34159091114997864, "sampling/importance_sampling_ratio/max": 0.9973297715187073, "sampling/importance_sampling_ratio/mean": 0.9627046585083008, "sampling/importance_sampling_ratio/min": 0.06737063825130463, "sampling/sampling_logp_difference/max": 1.8436089754104614, "sampling/sampling_logp_difference/mean": 0.015755796805024147, "step": 3123, "step_time": 4.594554789007816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.13192121544852853, "epoch": 0.03124, "grad_norm": 0.024900687858462334, "kl": 0.47624149546027184, "learning_rate": 9.995587726327495e-06, "loss": -0.0333, "step": 3124, "step_time": 2.0869612030073768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.5, "completions/mean_terminated_length": 169.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.18754333909600973, "epoch": 0.03125, "frac_reward_zero_std": 0.75, "grad_norm": 0.02712210826575756, "kl": 0.513349823653698, "learning_rate": 9.995584868735565e-06, "loss": -0.0239, "num_tokens": 26235996.0, "reward": 0.6328557729721069, "reward_std": 0.07818970084190369, "rewards/rollout_reward_func/mean": 0.6328557729721069, "rewards/rollout_reward_func/std": 0.2544982135295868, "sampling/importance_sampling_ratio/max": 0.9995536804199219, "sampling/importance_sampling_ratio/mean": 0.9409853219985962, "sampling/importance_sampling_ratio/min": 0.11266136914491653, "sampling/sampling_logp_difference/max": 2.051191806793213, "sampling/sampling_logp_difference/mean": 0.024951878935098648, "step": 3125, "step_time": 4.457373353005096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18777371663600206, "epoch": 0.03126, "grad_norm": 0.023205479606986046, "kl": 0.5186765491962433, "learning_rate": 9.995582010219128e-06, "loss": -0.0239, "step": 3126, "step_time": 2.4775023160109413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 98.4375, "completions/mean_terminated_length": 98.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24678576923906803, "epoch": 0.03127, "frac_reward_zero_std": 0.75, "grad_norm": 0.011924042366445065, "kl": 0.667362853884697, "learning_rate": 9.995579150778179e-06, "loss": -0.0266, "num_tokens": 26250034.0, "reward": 0.591201901435852, "reward_std": 0.14346109330654144, "rewards/rollout_reward_func/mean": 0.591201901435852, "rewards/rollout_reward_func/std": 0.32493409514427185, "sampling/importance_sampling_ratio/max": 0.999126672744751, "sampling/importance_sampling_ratio/mean": 0.961767315864563, "sampling/importance_sampling_ratio/min": 0.024278776720166206, "sampling/sampling_logp_difference/max": 1.6112407445907593, "sampling/sampling_logp_difference/mean": 0.025930428877472878, "step": 3127, "step_time": 4.0213745800137985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.24690983537584543, "epoch": 0.03128, "grad_norm": 0.011153893545269966, "kl": 0.6717972978949547, "learning_rate": 9.995576290412723e-06, "loss": -0.0266, "step": 3128, "step_time": 2.4884952810025425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.11432388611137867, "epoch": 0.03129, "frac_reward_zero_std": 1.0, "grad_norm": 0.00926064234226942, "kl": 0.5749696902930737, "learning_rate": 9.99557342912276e-06, "loss": 0.0019, "num_tokens": 26265490.0, "reward": 0.7092307806015015, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7092307806015015, "rewards/rollout_reward_func/std": 0.09920932352542877, "sampling/importance_sampling_ratio/max": 0.9980207681655884, "sampling/importance_sampling_ratio/mean": 0.9710254669189453, "sampling/importance_sampling_ratio/min": 0.228143572807312, "sampling/sampling_logp_difference/max": 1.535091519355774, "sampling/sampling_logp_difference/mean": 0.00947191659361124, "step": 3129, "step_time": 4.244802943998366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11339590838178992, "epoch": 0.0313, "grad_norm": 0.009899773634970188, "kl": 0.579115480184555, "learning_rate": 9.99557056690829e-06, "loss": 0.0019, "step": 3130, "step_time": 1.9990831559916842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 166.0625, "completions/mean_terminated_length": 166.0625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.39723911322653294, "epoch": 0.03131, "frac_reward_zero_std": 0.5, "grad_norm": 0.016618188470602036, "kl": 0.4576787166297436, "learning_rate": 9.995567703769312e-06, "loss": 0.0011, "num_tokens": 26281540.0, "reward": 0.6212019324302673, "reward_std": 0.022437041625380516, "rewards/rollout_reward_func/mean": 0.6212019324302673, "rewards/rollout_reward_func/std": 0.272173136472702, "sampling/importance_sampling_ratio/max": 1.0009148120880127, "sampling/importance_sampling_ratio/mean": 0.9332183599472046, "sampling/importance_sampling_ratio/min": 0.008891469798982143, "sampling/sampling_logp_difference/max": 1.5301356315612793, "sampling/sampling_logp_difference/mean": 0.04936469346284866, "step": 3131, "step_time": 4.121611321010278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3900145534425974, "epoch": 0.03132, "grad_norm": 0.016503866761922836, "kl": 0.457039549946785, "learning_rate": 9.99556483970583e-06, "loss": 0.0011, "step": 3132, "step_time": 2.5268377989850705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 176.4375, "completions/mean_terminated_length": 176.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3586171502247453, "epoch": 0.03133, "frac_reward_zero_std": 0.75, "grad_norm": 0.0045458427630364895, "kl": 0.5064605586230755, "learning_rate": 9.995561974717846e-06, "loss": -0.0368, "num_tokens": 26298098.0, "reward": 0.9510360956192017, "reward_std": 0.047981277108192444, "rewards/rollout_reward_func/mean": 0.9510360956192017, "rewards/rollout_reward_func/std": 0.2624526619911194, "sampling/importance_sampling_ratio/max": 0.9992536902427673, "sampling/importance_sampling_ratio/mean": 0.9390709400177002, "sampling/importance_sampling_ratio/min": 0.00016837277507875115, "sampling/sampling_logp_difference/max": 1.688565731048584, "sampling/sampling_logp_difference/mean": 0.05422717332839966, "step": 3133, "step_time": 4.994310360998497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 0.3553315601311624, "epoch": 0.03134, "grad_norm": 0.004918649327009916, "kl": 0.512989416718483, "learning_rate": 9.995559108805355e-06, "loss": -0.0367, "step": 3134, "step_time": 2.0673268690006807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7028953433036804, "epoch": 0.03135, "frac_reward_zero_std": 0.25, "grad_norm": 0.012137713842093945, "kl": 0.6603409014642239, "learning_rate": 9.995556241968362e-06, "loss": -0.0554, "num_tokens": 26312436.0, "reward": 0.828125, "reward_std": 0.0716625526547432, "rewards/rollout_reward_func/mean": 0.828125, "rewards/rollout_reward_func/std": 0.2660449743270874, "sampling/importance_sampling_ratio/max": 0.9968106746673584, "sampling/importance_sampling_ratio/mean": 0.9003708362579346, "sampling/importance_sampling_ratio/min": 0.0004172443877905607, "sampling/sampling_logp_difference/max": 1.6343412399291992, "sampling/sampling_logp_difference/mean": 0.13101854920387268, "step": 3135, "step_time": 3.8698351579951122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7004395583644509, "epoch": 0.03136, "grad_norm": 0.01236695982515812, "kl": 0.6528775840997696, "learning_rate": 9.995553374206865e-06, "loss": -0.0554, "step": 3136, "step_time": 2.0321331340092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 167.3125, "completions/mean_terminated_length": 167.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.34354681987315416, "epoch": 0.03137, "frac_reward_zero_std": 0.75, "grad_norm": 0.007880831137299538, "kl": 0.46213749423623085, "learning_rate": 9.995550505520868e-06, "loss": -0.0369, "num_tokens": 26328502.0, "reward": 0.777745246887207, "reward_std": 0.02791711315512657, "rewards/rollout_reward_func/mean": 0.777745246887207, "rewards/rollout_reward_func/std": 0.32915937900543213, "sampling/importance_sampling_ratio/max": 0.9990643858909607, "sampling/importance_sampling_ratio/mean": 0.9625276327133179, "sampling/importance_sampling_ratio/min": 3.150577185806469e-06, "sampling/sampling_logp_difference/max": 3.382584571838379, "sampling/sampling_logp_difference/mean": 0.0576527863740921, "step": 3137, "step_time": 4.334490443987306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3427348406985402, "epoch": 0.03138, "grad_norm": 0.008037155494093895, "kl": 0.4615135192871094, "learning_rate": 9.99554763591037e-06, "loss": -0.0369, "step": 3138, "step_time": 2.5201223289914196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 100.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07441975641995668, "epoch": 0.03139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005378237692639232, "kl": 0.5427576787769794, "learning_rate": 9.99554476537537e-06, "loss": 0.0013, "num_tokens": 26342534.0, "reward": 0.5449999570846558, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.5449999570846558, "rewards/rollout_reward_func/std": 0.19376303255558014, "sampling/importance_sampling_ratio/max": 0.9977311491966248, "sampling/importance_sampling_ratio/mean": 0.9924499988555908, "sampling/importance_sampling_ratio/min": 0.9854774475097656, "sampling/sampling_logp_difference/max": 0.012515336275100708, "sampling/sampling_logp_difference/mean": 0.0018957791617140174, "step": 3139, "step_time": 4.423048825992737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07360772229731083, "epoch": 0.0314, "grad_norm": 0.0005301095079630613, "kl": 0.5429390110075474, "learning_rate": 9.995541893915872e-06, "loss": 0.0013, "step": 3140, "step_time": 2.0181824850005796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 152.78125, "completions/mean_terminated_length": 152.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7487105643376708, "epoch": 0.03141, "frac_reward_zero_std": 0.25, "grad_norm": 0.015038052573800087, "kl": 0.585552915930748, "learning_rate": 9.995539021531873e-06, "loss": -0.0647, "num_tokens": 26358191.0, "reward": 0.847008228302002, "reward_std": 0.1107206791639328, "rewards/rollout_reward_func/mean": 0.847008228302002, "rewards/rollout_reward_func/std": 0.2986818850040436, "sampling/importance_sampling_ratio/max": 0.998387336730957, "sampling/importance_sampling_ratio/mean": 0.9009993672370911, "sampling/importance_sampling_ratio/min": 9.746197974891402e-06, "sampling/sampling_logp_difference/max": 3.0180468559265137, "sampling/sampling_logp_difference/mean": 0.12101209163665771, "step": 3141, "step_time": 4.315536295995116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7484887465834618, "epoch": 0.03142, "grad_norm": 0.014469767920672894, "kl": 0.5857901126146317, "learning_rate": 9.995536148223378e-06, "loss": -0.0647, "step": 3142, "step_time": 2.0608666700063623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3115274729207158, "epoch": 0.03143, "frac_reward_zero_std": 0.75, "grad_norm": 0.012485464103519917, "kl": 0.5180972442030907, "learning_rate": 9.995533273990382e-06, "loss": 0.0303, "num_tokens": 26374613.0, "reward": 0.8438653945922852, "reward_std": 0.00679910508915782, "rewards/rollout_reward_func/mean": 0.8438653945922852, "rewards/rollout_reward_func/std": 0.35658344626426697, "sampling/importance_sampling_ratio/max": 0.9981755614280701, "sampling/importance_sampling_ratio/mean": 0.9628955125808716, "sampling/importance_sampling_ratio/min": 0.001980389701202512, "sampling/sampling_logp_difference/max": 1.686186671257019, "sampling/sampling_logp_difference/mean": 0.040080584585666656, "step": 3143, "step_time": 4.238351194988354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3138027945533395, "epoch": 0.03144, "grad_norm": 0.013030311092734337, "kl": 0.5185276009142399, "learning_rate": 9.995530398832893e-06, "loss": 0.0303, "step": 3144, "step_time": 2.5393890860068495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 130.40625, "completions/mean_terminated_length": 130.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.48264751955866814, "epoch": 0.03145, "frac_reward_zero_std": 0.75, "grad_norm": 0.010363689623773098, "kl": 0.5642175525426865, "learning_rate": 9.995527522750907e-06, "loss": 0.0112, "num_tokens": 26389554.0, "reward": 0.9284327030181885, "reward_std": 0.014278117567300797, "rewards/rollout_reward_func/mean": 0.9284327030181885, "rewards/rollout_reward_func/std": 0.30364790558815, "sampling/importance_sampling_ratio/max": 0.9988014101982117, "sampling/importance_sampling_ratio/mean": 0.9631534814834595, "sampling/importance_sampling_ratio/min": 1.5830149093438756e-23, "sampling/sampling_logp_difference/max": 3.715125560760498, "sampling/sampling_logp_difference/mean": 0.2558291554450989, "step": 3145, "step_time": 4.727674553992983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4836840140633285, "epoch": 0.03146, "grad_norm": 0.01006847620010376, "kl": 0.5626357384026051, "learning_rate": 9.995524645744426e-06, "loss": 0.0112, "step": 3146, "step_time": 2.055504750998807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0687813670374453, "epoch": 0.03147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005688097444362938, "kl": 0.5289360992610455, "learning_rate": 9.995521767813447e-06, "loss": 0.0016, "num_tokens": 26405786.0, "reward": 0.7383077144622803, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 0.7383077144622803, "rewards/rollout_reward_func/std": 0.13933232426643372, "sampling/importance_sampling_ratio/max": 0.9990785717964172, "sampling/importance_sampling_ratio/mean": 0.9938032627105713, "sampling/importance_sampling_ratio/min": 0.9875329732894897, "sampling/sampling_logp_difference/max": 0.0076954662799835205, "sampling/sampling_logp_difference/mean": 0.0012165093794465065, "step": 3147, "step_time": 4.240850015994511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06920464616268873, "epoch": 0.03148, "grad_norm": 0.000582656473852694, "kl": 0.5288179777562618, "learning_rate": 9.995518888957978e-06, "loss": 0.0016, "step": 3148, "step_time": 2.064331301000493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3190673370845616, "epoch": 0.03149, "frac_reward_zero_std": 0.75, "grad_norm": 0.00878868717700243, "kl": 0.538806889206171, "learning_rate": 9.995516009178012e-06, "loss": -0.0268, "num_tokens": 26422786.0, "reward": 0.8213653564453125, "reward_std": 0.009518749080598354, "rewards/rollout_reward_func/mean": 0.8213653564453125, "rewards/rollout_reward_func/std": 0.3216063380241394, "sampling/importance_sampling_ratio/max": 0.9983477592468262, "sampling/importance_sampling_ratio/mean": 0.9621635675430298, "sampling/importance_sampling_ratio/min": 0.0001717820850899443, "sampling/sampling_logp_difference/max": 1.9235291481018066, "sampling/sampling_logp_difference/mean": 0.054819680750370026, "step": 3149, "step_time": 4.448663832998136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3180125621147454, "epoch": 0.0315, "grad_norm": 0.008669854141771793, "kl": 0.538688950240612, "learning_rate": 9.995513128473556e-06, "loss": -0.0268, "step": 3150, "step_time": 2.546873149985913 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 26422786, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }