| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.006, |
| "eval_steps": 500, |
| "global_step": 300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1124.0, |
| "completions/max_terminated_length": 1124.0, |
| "completions/mean_length": 220.03125, |
| "completions/mean_terminated_length": 220.03125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.10999894605993177, |
| "epoch": 2e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0684715509414673, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0088, |
| "num_tokens": 35769.0, |
| "reward": -0.7897067070007324, |
| "reward_std": 0.7282192707061768, |
| "rewards/rollout_reward_func/mean": -0.7897067070007324, |
| "rewards/rollout_reward_func/std": 0.6921246647834778, |
| "sampling/importance_sampling_ratio/max": 1.680492877960205, |
| "sampling/importance_sampling_ratio/mean": 0.8940014839172363, |
| "sampling/importance_sampling_ratio/min": 0.04773883521556854, |
| "sampling/sampling_logp_difference/max": 2.8054585456848145, |
| "sampling/sampling_logp_difference/mean": 0.07180452346801758, |
| "step": 1, |
| "step_time": 13.410615189000055 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1152.0, |
| "completions/max_terminated_length": 1152.0, |
| "completions/mean_length": 302.0, |
| "completions/mean_terminated_length": 302.0, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.1047596417774912, |
| "epoch": 4e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6198148727416992, |
| "kl": 0.0, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": -0.0135, |
| "num_tokens": 77064.0, |
| "reward": -0.7216953039169312, |
| "reward_std": 0.6422248482704163, |
| "rewards/rollout_reward_func/mean": -0.7216953039169312, |
| "rewards/rollout_reward_func/std": 0.7596297860145569, |
| "sampling/importance_sampling_ratio/max": 2.3478667736053467, |
| "sampling/importance_sampling_ratio/mean": 0.9927622675895691, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 2.459986925125122, |
| "sampling/sampling_logp_difference/mean": 0.0796830952167511, |
| "step": 2, |
| "step_time": 13.996805407999773 |
| }, |
| { |
| "clip_ratio/high_max": 0.02083333395421505, |
| "clip_ratio/high_mean": 0.02083333395421505, |
| "clip_ratio/low_mean": 0.016406250186264515, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.037239584140479565, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 854.0, |
| "completions/max_terminated_length": 854.0, |
| "completions/mean_length": 280.40625, |
| "completions/mean_terminated_length": 280.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.10994739399757236, |
| "epoch": 6e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.830590546131134, |
| "kl": 0.010026682673075604, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": 0.0022, |
| "num_tokens": 114789.0, |
| "reward": -0.6636782884597778, |
| "reward_std": 0.843880832195282, |
| "rewards/rollout_reward_func/mean": -0.6636782884597778, |
| "rewards/rollout_reward_func/std": 0.8200815320014954, |
| "sampling/importance_sampling_ratio/max": 1.9734946489334106, |
| "sampling/importance_sampling_ratio/mean": 0.9548332095146179, |
| "sampling/importance_sampling_ratio/min": 0.19213292002677917, |
| "sampling/sampling_logp_difference/max": 1.5306037664413452, |
| "sampling/sampling_logp_difference/mean": 0.04678232967853546, |
| "step": 3, |
| "step_time": 11.012894956999844 |
| }, |
| { |
| "clip_ratio/high_max": 0.02142857201397419, |
| "clip_ratio/high_mean": 0.010714286006987095, |
| "clip_ratio/low_mean": 0.05390625121071935, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.06462053721770644, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1307.0, |
| "completions/max_terminated_length": 1307.0, |
| "completions/mean_length": 368.0, |
| "completions/mean_terminated_length": 368.0, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.11609030631370842, |
| "epoch": 8e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2315211296081543, |
| "kl": 0.04723566836028681, |
| "learning_rate": 8.571428571428573e-06, |
| "loss": -0.0016, |
| "num_tokens": 156927.0, |
| "reward": -0.5773141384124756, |
| "reward_std": 0.8677605390548706, |
| "rewards/rollout_reward_func/mean": -0.5773141384124756, |
| "rewards/rollout_reward_func/std": 0.8631168603897095, |
| "sampling/importance_sampling_ratio/max": 2.472893238067627, |
| "sampling/importance_sampling_ratio/mean": 0.9910582900047302, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2368862628936768, |
| "sampling/sampling_logp_difference/mean": 0.0653323084115982, |
| "step": 4, |
| "step_time": 14.133309550000035 |
| }, |
| { |
| "clip_ratio/high_max": 0.023685516323894262, |
| "clip_ratio/high_mean": 0.011842758161947131, |
| "clip_ratio/low_mean": 0.03437500027939677, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.046217758441343904, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1653.0, |
| "completions/max_terminated_length": 1653.0, |
| "completions/mean_length": 358.71875, |
| "completions/mean_terminated_length": 358.71875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.11123159667477012, |
| "epoch": 0.0001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.8210487365722656, |
| "kl": 0.06365584026783466, |
| "learning_rate": 1.1428571428571429e-05, |
| "loss": -0.0616, |
| "num_tokens": 198606.0, |
| "reward": -0.51972895860672, |
| "reward_std": 0.8936069011688232, |
| "rewards/rollout_reward_func/mean": -0.51972895860672, |
| "rewards/rollout_reward_func/std": 0.9223779439926147, |
| "sampling/importance_sampling_ratio/max": 2.422663688659668, |
| "sampling/importance_sampling_ratio/mean": 1.0740761756896973, |
| "sampling/importance_sampling_ratio/min": 0.3291013836860657, |
| "sampling/sampling_logp_difference/max": 1.1113799810409546, |
| "sampling/sampling_logp_difference/mean": 0.0602106899023056, |
| "step": 5, |
| "step_time": 14.35855693699989 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.016666667070239782, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.020572917070239782, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1054.0, |
| "completions/max_terminated_length": 1054.0, |
| "completions/mean_length": 259.46875, |
| "completions/mean_terminated_length": 259.46875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.09252851491328329, |
| "epoch": 0.00012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.394017219543457, |
| "kl": 0.05397044947139307, |
| "learning_rate": 1.4285714285714285e-05, |
| "loss": 0.0026, |
| "num_tokens": 237513.0, |
| "reward": -0.9176378846168518, |
| "reward_std": 0.393246054649353, |
| "rewards/rollout_reward_func/mean": -0.9176378846168518, |
| "rewards/rollout_reward_func/std": 0.5164620280265808, |
| "sampling/importance_sampling_ratio/max": 1.197955846786499, |
| "sampling/importance_sampling_ratio/mean": 0.9096383452415466, |
| "sampling/importance_sampling_ratio/min": 0.07589028030633926, |
| "sampling/sampling_logp_difference/max": 2.950902223587036, |
| "sampling/sampling_logp_difference/mean": 0.06425637006759644, |
| "step": 6, |
| "step_time": 13.00664691299994 |
| }, |
| { |
| "clip_ratio/high_max": 0.033333334140479565, |
| "clip_ratio/high_mean": 0.016666667070239782, |
| "clip_ratio/low_mean": 0.012500000186264515, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029166667256504297, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1530.0, |
| "completions/max_terminated_length": 1530.0, |
| "completions/mean_length": 242.15625, |
| "completions/mean_terminated_length": 242.15625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.08201168113737367, |
| "epoch": 0.00014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0027066469192505, |
| "kl": 0.08505436949724299, |
| "learning_rate": 1.7142857142857145e-05, |
| "loss": -0.0065, |
| "num_tokens": 275059.0, |
| "reward": -0.7800816893577576, |
| "reward_std": 0.610554039478302, |
| "rewards/rollout_reward_func/mean": -0.7800816893577576, |
| "rewards/rollout_reward_func/std": 0.6927689909934998, |
| "sampling/importance_sampling_ratio/max": 1.4916099309921265, |
| "sampling/importance_sampling_ratio/mean": 0.9357227087020874, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.6500071287155151, |
| "sampling/sampling_logp_difference/mean": 0.05492936447262764, |
| "step": 7, |
| "step_time": 13.66465330799997 |
| }, |
| { |
| "clip_ratio/high_max": 0.031250000931322575, |
| "clip_ratio/high_mean": 0.015625000465661287, |
| "clip_ratio/low_mean": 0.010156250093132257, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.025781250558793545, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1273.0, |
| "completions/max_terminated_length": 1273.0, |
| "completions/mean_length": 331.40625, |
| "completions/mean_terminated_length": 331.40625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.10761701926821843, |
| "epoch": 0.00016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1414554119110107, |
| "kl": 0.31695732939988375, |
| "learning_rate": 2e-05, |
| "loss": -0.0063, |
| "num_tokens": 316632.0, |
| "reward": -0.1924755573272705, |
| "reward_std": 1.0470077991485596, |
| "rewards/rollout_reward_func/mean": -0.1924755573272705, |
| "rewards/rollout_reward_func/std": 1.0184544324874878, |
| "sampling/importance_sampling_ratio/max": 2.0652823448181152, |
| "sampling/importance_sampling_ratio/mean": 0.9144597053527832, |
| "sampling/importance_sampling_ratio/min": 0.11971249431371689, |
| "sampling/sampling_logp_difference/max": 2.123110055923462, |
| "sampling/sampling_logp_difference/mean": 0.06788742542266846, |
| "step": 8, |
| "step_time": 13.650407897999571 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.02473958395421505, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02473958395421505, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1164.0, |
| "completions/max_terminated_length": 1164.0, |
| "completions/mean_length": 223.5625, |
| "completions/mean_terminated_length": 223.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.11182238413312007, |
| "epoch": 0.00018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2173668146133423, |
| "kl": 0.5084238715935498, |
| "learning_rate": 2.2857142857142858e-05, |
| "loss": -0.0034, |
| "num_tokens": 351791.0, |
| "reward": -0.020846828818321228, |
| "reward_std": 1.0506192445755005, |
| "rewards/rollout_reward_func/mean": -0.020846828818321228, |
| "rewards/rollout_reward_func/std": 1.0528043508529663, |
| "sampling/importance_sampling_ratio/max": 2.8375391960144043, |
| "sampling/importance_sampling_ratio/mean": 0.9840835332870483, |
| "sampling/importance_sampling_ratio/min": 0.08557987958192825, |
| "sampling/sampling_logp_difference/max": 2.453369379043579, |
| "sampling/sampling_logp_difference/mean": 0.05880027264356613, |
| "step": 9, |
| "step_time": 12.694531075999976 |
| }, |
| { |
| "clip_ratio/high_max": 0.020312500186264515, |
| "clip_ratio/high_mean": 0.010156250093132257, |
| "clip_ratio/low_mean": 0.028125000651925802, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03828125121071935, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1143.0, |
| "completions/max_terminated_length": 1143.0, |
| "completions/mean_length": 246.71875, |
| "completions/mean_terminated_length": 246.71875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.06009864609222859, |
| "epoch": 0.0002, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3861747086048126, |
| "kl": 0.5755460254719464, |
| "learning_rate": 2.5714285714285714e-05, |
| "loss": 0.0033, |
| "num_tokens": 389005.0, |
| "reward": -0.13610242307186127, |
| "reward_std": 1.0442836284637451, |
| "rewards/rollout_reward_func/mean": -0.13610242307186127, |
| "rewards/rollout_reward_func/std": 1.038696050643921, |
| "sampling/importance_sampling_ratio/max": 2.78231143951416, |
| "sampling/importance_sampling_ratio/mean": 0.9999732375144958, |
| "sampling/importance_sampling_ratio/min": 0.27365317940711975, |
| "sampling/sampling_logp_difference/max": 1.3099734783172607, |
| "sampling/sampling_logp_difference/mean": 0.04202880337834358, |
| "step": 10, |
| "step_time": 12.608456127999943 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.016666667070239782, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016666667070239782, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1094.0, |
| "completions/max_terminated_length": 1094.0, |
| "completions/mean_length": 334.1875, |
| "completions/mean_terminated_length": 334.1875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.06632963023184857, |
| "epoch": 0.00022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2863199710845947, |
| "kl": 0.7946649645148227, |
| "learning_rate": 2.857142857142857e-05, |
| "loss": -0.0114, |
| "num_tokens": 430612.0, |
| "reward": 0.002641141414642334, |
| "reward_std": 1.030980110168457, |
| "rewards/rollout_reward_func/mean": 0.002641141414642334, |
| "rewards/rollout_reward_func/std": 1.0341851711273193, |
| "sampling/importance_sampling_ratio/max": 1.5416392087936401, |
| "sampling/importance_sampling_ratio/mean": 0.9006525278091431, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.674771785736084, |
| "sampling/sampling_logp_difference/mean": 0.052167896181344986, |
| "step": 11, |
| "step_time": 12.571019431999957 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.027678572107106447, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03392857266589999, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1281.0, |
| "completions/max_terminated_length": 1281.0, |
| "completions/mean_length": 327.84375, |
| "completions/mean_terminated_length": 327.84375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.06426836037917383, |
| "epoch": 0.00024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9448660612106323, |
| "kl": 1.1216627955436707, |
| "learning_rate": 3.142857142857143e-05, |
| "loss": -0.0031, |
| "num_tokens": 469849.0, |
| "reward": 0.44050925970077515, |
| "reward_std": 0.9405170679092407, |
| "rewards/rollout_reward_func/mean": 0.44050925970077515, |
| "rewards/rollout_reward_func/std": 0.9331760406494141, |
| "sampling/importance_sampling_ratio/max": 1.6956889629364014, |
| "sampling/importance_sampling_ratio/mean": 0.966093122959137, |
| "sampling/importance_sampling_ratio/min": 0.10954803228378296, |
| "sampling/sampling_logp_difference/max": 2.2184529304504395, |
| "sampling/sampling_logp_difference/mean": 0.06628098338842392, |
| "step": 12, |
| "step_time": 13.392358952999984 |
| }, |
| { |
| "clip_ratio/high_max": 0.02291666716337204, |
| "clip_ratio/high_mean": 0.01145833358168602, |
| "clip_ratio/low_mean": 0.02291666716337204, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.03437500074505806, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1111.0, |
| "completions/max_terminated_length": 1111.0, |
| "completions/mean_length": 263.53125, |
| "completions/mean_terminated_length": 263.53125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.05535798534674541, |
| "epoch": 0.00026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.7448770999908447, |
| "kl": 4.385535407811403, |
| "learning_rate": 3.428571428571429e-05, |
| "loss": -0.0296, |
| "num_tokens": 507656.0, |
| "reward": 0.6070110201835632, |
| "reward_std": 0.8325015902519226, |
| "rewards/rollout_reward_func/mean": 0.6070110201835632, |
| "rewards/rollout_reward_func/std": 0.8162023425102234, |
| "sampling/importance_sampling_ratio/max": 2.217287302017212, |
| "sampling/importance_sampling_ratio/mean": 0.8639674782752991, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 5.976809501647949, |
| "sampling/sampling_logp_difference/mean": 0.1777144819498062, |
| "step": 13, |
| "step_time": 12.483832168999925 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.016666667070239782, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.02291666716337204, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1204.0, |
| "completions/max_terminated_length": 1204.0, |
| "completions/mean_length": 175.375, |
| "completions/mean_terminated_length": 175.375, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.037813675041661554, |
| "epoch": 0.00028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9732633829116821, |
| "kl": 3.082265551201999, |
| "learning_rate": 3.7142857142857143e-05, |
| "loss": -0.0294, |
| "num_tokens": 541743.0, |
| "reward": 0.44121330976486206, |
| "reward_std": 0.9663182497024536, |
| "rewards/rollout_reward_func/mean": 0.44121330976486206, |
| "rewards/rollout_reward_func/std": 0.9269620180130005, |
| "sampling/importance_sampling_ratio/max": 2.2618067264556885, |
| "sampling/importance_sampling_ratio/mean": 0.9065130352973938, |
| "sampling/importance_sampling_ratio/min": 0.00045742199290543795, |
| "sampling/sampling_logp_difference/max": 7.674319744110107, |
| "sampling/sampling_logp_difference/mean": 0.14372462034225464, |
| "step": 14, |
| "step_time": 12.786647417999802 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.012500000186264515, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012500000186264515, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 608.0, |
| "completions/max_terminated_length": 608.0, |
| "completions/mean_length": 137.71875, |
| "completions/mean_terminated_length": 137.71875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.04335687311379388, |
| "epoch": 0.0003, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.684906482696533, |
| "kl": 9.895050249993801, |
| "learning_rate": 4e-05, |
| "loss": -0.0221, |
| "num_tokens": 574790.0, |
| "reward": 0.4522267282009125, |
| "reward_std": 0.7068161964416504, |
| "rewards/rollout_reward_func/mean": 0.4522267282009125, |
| "rewards/rollout_reward_func/std": 0.8777400851249695, |
| "sampling/importance_sampling_ratio/max": 2.254448890686035, |
| "sampling/importance_sampling_ratio/mean": 0.9891846179962158, |
| "sampling/importance_sampling_ratio/min": 0.0026889105793088675, |
| "sampling/sampling_logp_difference/max": 5.908565044403076, |
| "sampling/sampling_logp_difference/mean": 0.12002458423376083, |
| "step": 15, |
| "step_time": 10.161046653999847 |
| }, |
| { |
| "clip_ratio/high_max": 0.020312500186264515, |
| "clip_ratio/high_mean": 0.010156250093132257, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010156250093132257, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1153.0, |
| "completions/max_terminated_length": 1153.0, |
| "completions/mean_length": 235.75, |
| "completions/mean_terminated_length": 235.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.07614646388719848, |
| "epoch": 0.00032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6830127239227295, |
| "kl": 2.908422634471208, |
| "learning_rate": 4.2857142857142856e-05, |
| "loss": -0.0083, |
| "num_tokens": 612382.0, |
| "reward": 0.7016310095787048, |
| "reward_std": 0.6054160594940186, |
| "rewards/rollout_reward_func/mean": 0.7016310095787048, |
| "rewards/rollout_reward_func/std": 0.7000784277915955, |
| "sampling/importance_sampling_ratio/max": 1.1297085285186768, |
| "sampling/importance_sampling_ratio/mean": 0.8337504863739014, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 6.672256946563721, |
| "sampling/sampling_logp_difference/mean": 0.1192205548286438, |
| "step": 16, |
| "step_time": 12.473349069999585 |
| }, |
| { |
| "clip_ratio/high_max": 0.028125000186264515, |
| "clip_ratio/high_mean": 0.014062500093132257, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.020312499720603228, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1184.0, |
| "completions/max_terminated_length": 1184.0, |
| "completions/mean_length": 272.1875, |
| "completions/mean_terminated_length": 272.1875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.07187356776557863, |
| "epoch": 0.00034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0832687616348267, |
| "kl": 2.5055846490431577, |
| "learning_rate": 4.5714285714285716e-05, |
| "loss": -0.0186, |
| "num_tokens": 652165.0, |
| "reward": 0.42282533645629883, |
| "reward_std": 0.9263465404510498, |
| "rewards/rollout_reward_func/mean": 0.42282533645629883, |
| "rewards/rollout_reward_func/std": 0.9135696291923523, |
| "sampling/importance_sampling_ratio/max": 1.9455113410949707, |
| "sampling/importance_sampling_ratio/mean": 0.8885751962661743, |
| "sampling/importance_sampling_ratio/min": 0.020396223291754723, |
| "sampling/sampling_logp_difference/max": 3.8924026489257812, |
| "sampling/sampling_logp_difference/mean": 0.10638131201267242, |
| "step": 17, |
| "step_time": 12.625183045000085 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.04062500037252903, |
| "clip_ratio/low_min": 0.012500000186264515, |
| "clip_ratio/region_mean": 0.04062500037252903, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 638.0, |
| "completions/max_terminated_length": 638.0, |
| "completions/mean_length": 209.0, |
| "completions/mean_terminated_length": 209.0, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.08415665684276519, |
| "epoch": 0.00036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 177.4296875, |
| "kl": 69.13030137866735, |
| "learning_rate": 4.8571428571428576e-05, |
| "loss": 0.0815, |
| "num_tokens": 688568.0, |
| "reward": 0.1535860300064087, |
| "reward_std": 1.023172378540039, |
| "rewards/rollout_reward_func/mean": 0.1535860300064087, |
| "rewards/rollout_reward_func/std": 1.0038524866104126, |
| "sampling/importance_sampling_ratio/max": 1.140038251876831, |
| "sampling/importance_sampling_ratio/mean": 0.7489842176437378, |
| "sampling/importance_sampling_ratio/min": 0.0006028485368005931, |
| "sampling/sampling_logp_difference/max": 6.032342433929443, |
| "sampling/sampling_logp_difference/mean": 0.2542610764503479, |
| "step": 18, |
| "step_time": 10.28650643099968 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.014322916977107525, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.014322916977107525, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 914.0, |
| "completions/max_terminated_length": 914.0, |
| "completions/mean_length": 284.375, |
| "completions/mean_terminated_length": 284.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.08210387951271514, |
| "epoch": 0.00038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.229077935218811, |
| "kl": 3.3837516829371452, |
| "learning_rate": 5.142857142857143e-05, |
| "loss": -0.0122, |
| "num_tokens": 727121.0, |
| "reward": 0.41616448760032654, |
| "reward_std": 0.7233636975288391, |
| "rewards/rollout_reward_func/mean": 0.41616448760032654, |
| "rewards/rollout_reward_func/std": 0.9268736243247986, |
| "sampling/importance_sampling_ratio/max": 2.5834200382232666, |
| "sampling/importance_sampling_ratio/mean": 0.9522545337677002, |
| "sampling/importance_sampling_ratio/min": 0.038529153913259506, |
| "sampling/sampling_logp_difference/max": 2.787929058074951, |
| "sampling/sampling_logp_difference/mean": 0.10299322754144669, |
| "step": 19, |
| "step_time": 11.579511369000329 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.02291666716337204, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.029166667256504297, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1124.0, |
| "completions/max_terminated_length": 1124.0, |
| "completions/mean_length": 230.0625, |
| "completions/mean_terminated_length": 230.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.11127446611544656, |
| "epoch": 0.0004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9922760725021362, |
| "kl": 3.5128636844456196, |
| "learning_rate": 5.428571428571428e-05, |
| "loss": -0.0049, |
| "num_tokens": 764269.0, |
| "reward": 0.21909433603286743, |
| "reward_std": 0.9744149446487427, |
| "rewards/rollout_reward_func/mean": 0.21909433603286743, |
| "rewards/rollout_reward_func/std": 1.0001939535140991, |
| "sampling/importance_sampling_ratio/max": 1.926369547843933, |
| "sampling/importance_sampling_ratio/mean": 0.9654070734977722, |
| "sampling/importance_sampling_ratio/min": 0.12186437845230103, |
| "sampling/sampling_logp_difference/max": 3.6394667625427246, |
| "sampling/sampling_logp_difference/mean": 0.0875457376241684, |
| "step": 20, |
| "step_time": 12.297075339999765 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.010416666977107525, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010416666977107525, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1024.0, |
| "completions/max_terminated_length": 1024.0, |
| "completions/mean_length": 198.5625, |
| "completions/mean_terminated_length": 198.5625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.045957784245274524, |
| "epoch": 0.00042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33694905042648315, |
| "kl": 2.610730357468128, |
| "learning_rate": 5.714285714285714e-05, |
| "loss": -0.0197, |
| "num_tokens": 801033.0, |
| "reward": 0.48597443103790283, |
| "reward_std": 0.9212241172790527, |
| "rewards/rollout_reward_func/mean": 0.48597443103790283, |
| "rewards/rollout_reward_func/std": 0.8926963210105896, |
| "sampling/importance_sampling_ratio/max": 1.204205870628357, |
| "sampling/importance_sampling_ratio/mean": 0.9019367694854736, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.5509400367736816, |
| "sampling/sampling_logp_difference/mean": 0.03802880644798279, |
| "step": 21, |
| "step_time": 12.204463460999932 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 578.0, |
| "completions/max_terminated_length": 578.0, |
| "completions/mean_length": 168.46875, |
| "completions/mean_terminated_length": 168.46875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.07223565122012587, |
| "epoch": 0.00044, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6332337856292725, |
| "kl": 3.1288448721170425, |
| "learning_rate": 6e-05, |
| "loss": -0.0054, |
| "num_tokens": 836429.0, |
| "reward": 0.31113743782043457, |
| "reward_std": 0.9805623292922974, |
| "rewards/rollout_reward_func/mean": 0.31113743782043457, |
| "rewards/rollout_reward_func/std": 0.9535738229751587, |
| "sampling/importance_sampling_ratio/max": 1.7136268615722656, |
| "sampling/importance_sampling_ratio/mean": 0.9599097967147827, |
| "sampling/importance_sampling_ratio/min": 0.3510478734970093, |
| "sampling/sampling_logp_difference/max": 1.039665699005127, |
| "sampling/sampling_logp_difference/mean": 0.0384407714009285, |
| "step": 22, |
| "step_time": 10.721943561000444 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.016666667070239782, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016666667070239782, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1273.0, |
| "completions/max_terminated_length": 1273.0, |
| "completions/mean_length": 274.90625, |
| "completions/mean_terminated_length": 274.90625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.07087159495586093, |
| "epoch": 0.00046, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6439398527145386, |
| "kl": 3.9721709862351418, |
| "learning_rate": 6.285714285714286e-05, |
| "loss": -0.0091, |
| "num_tokens": 875727.0, |
| "reward": 0.23191100358963013, |
| "reward_std": 0.9855490922927856, |
| "rewards/rollout_reward_func/mean": 0.23191100358963013, |
| "rewards/rollout_reward_func/std": 0.9528393745422363, |
| "sampling/importance_sampling_ratio/max": 1.7258304357528687, |
| "sampling/importance_sampling_ratio/mean": 0.8456529378890991, |
| "sampling/importance_sampling_ratio/min": 0.08970843255519867, |
| "sampling/sampling_logp_difference/max": 1.6682171821594238, |
| "sampling/sampling_logp_difference/mean": 0.08408902585506439, |
| "step": 23, |
| "step_time": 12.50699520400076 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1014.0, |
| "completions/max_terminated_length": 1014.0, |
| "completions/mean_length": 233.0, |
| "completions/mean_terminated_length": 233.0, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.057429388416494476, |
| "epoch": 0.00048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2117566019296646, |
| "kl": 2.7027904614806175, |
| "learning_rate": 6.571428571428571e-05, |
| "loss": -0.0072, |
| "num_tokens": 912356.0, |
| "reward": 0.7661426067352295, |
| "reward_std": 0.4510638415813446, |
| "rewards/rollout_reward_func/mean": 0.7661426067352295, |
| "rewards/rollout_reward_func/std": 0.6789449453353882, |
| "sampling/importance_sampling_ratio/max": 1.2537685632705688, |
| "sampling/importance_sampling_ratio/mean": 0.9261639714241028, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 3.0963659286499023, |
| "sampling/sampling_logp_difference/mean": 0.0501352995634079, |
| "step": 24, |
| "step_time": 12.994097786999873 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012500000186264515, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 864.0, |
| "completions/max_terminated_length": 864.0, |
| "completions/mean_length": 284.15625, |
| "completions/mean_terminated_length": 284.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.07915750662141363, |
| "epoch": 0.0005, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2489689290523529, |
| "kl": 3.51483316719532, |
| "learning_rate": 6.857142857142858e-05, |
| "loss": -0.0141, |
| "num_tokens": 951303.0, |
| "reward": 0.6401481628417969, |
| "reward_std": 0.7306121587753296, |
| "rewards/rollout_reward_func/mean": 0.6401481628417969, |
| "rewards/rollout_reward_func/std": 0.7182337045669556, |
| "sampling/importance_sampling_ratio/max": 1.7077192068099976, |
| "sampling/importance_sampling_ratio/mean": 0.9699938893318176, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2909077405929565, |
| "sampling/sampling_logp_difference/mean": 0.06180023029446602, |
| "step": 25, |
| "step_time": 11.09379838199925 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.010416666977107525, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.016666667070239782, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1052.0, |
| "completions/max_terminated_length": 1052.0, |
| "completions/mean_length": 248.09375, |
| "completions/mean_terminated_length": 248.09375, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.06904788634801662, |
| "epoch": 0.00052, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.426376074552536, |
| "kl": 3.752748064696789, |
| "learning_rate": 7.142857142857143e-05, |
| "loss": -0.0171, |
| "num_tokens": 989455.0, |
| "reward": 0.5763981938362122, |
| "reward_std": 0.8214547634124756, |
| "rewards/rollout_reward_func/mean": 0.5763981938362122, |
| "rewards/rollout_reward_func/std": 0.8107219934463501, |
| "sampling/importance_sampling_ratio/max": 1.2095879316329956, |
| "sampling/importance_sampling_ratio/mean": 0.9170844554901123, |
| "sampling/importance_sampling_ratio/min": 0.17924457788467407, |
| "sampling/sampling_logp_difference/max": 1.90674889087677, |
| "sampling/sampling_logp_difference/mean": 0.06156245991587639, |
| "step": 26, |
| "step_time": 12.128414304999978 |
| }, |
| { |
| "clip_ratio/high_max": 0.02500000037252903, |
| "clip_ratio/high_mean": 0.012500000186264515, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012500000186264515, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 913.0, |
| "completions/max_terminated_length": 913.0, |
| "completions/mean_length": 238.5, |
| "completions/mean_terminated_length": 238.5, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.047321546943749127, |
| "epoch": 0.00054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13649658858776093, |
| "kl": 3.7867856696248055, |
| "learning_rate": 7.428571428571429e-05, |
| "loss": 0.0024, |
| "num_tokens": 1026966.0, |
| "reward": 0.5796800851821899, |
| "reward_std": 0.5545504093170166, |
| "rewards/rollout_reward_func/mean": 0.5796800851821899, |
| "rewards/rollout_reward_func/std": 0.7230707406997681, |
| "sampling/importance_sampling_ratio/max": 1.8340015411376953, |
| "sampling/importance_sampling_ratio/mean": 0.9975396990776062, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.570065975189209, |
| "sampling/sampling_logp_difference/mean": 0.04671240225434303, |
| "step": 27, |
| "step_time": 11.592925249000245 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 695.0, |
| "completions/max_terminated_length": 695.0, |
| "completions/mean_length": 213.03125, |
| "completions/mean_terminated_length": 213.03125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.02343887006645673, |
| "epoch": 0.00056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.22743558883667, |
| "kl": 15.115263469517231, |
| "learning_rate": 7.714285714285715e-05, |
| "loss": 0.019, |
| "num_tokens": 1062974.0, |
| "reward": 0.7336158752441406, |
| "reward_std": 0.5854325294494629, |
| "rewards/rollout_reward_func/mean": 0.7336158752441406, |
| "rewards/rollout_reward_func/std": 0.5881174206733704, |
| "sampling/importance_sampling_ratio/max": 1.5286439657211304, |
| "sampling/importance_sampling_ratio/mean": 1.003950834274292, |
| "sampling/importance_sampling_ratio/min": 0.12551912665367126, |
| "sampling/sampling_logp_difference/max": 1.9951891899108887, |
| "sampling/sampling_logp_difference/mean": 0.025340761989355087, |
| "step": 28, |
| "step_time": 10.574544273999663 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 677.0, |
| "completions/max_terminated_length": 677.0, |
| "completions/mean_length": 202.3125, |
| "completions/mean_terminated_length": 202.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.021139492744623567, |
| "epoch": 0.00058, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.06938721239566803, |
| "kl": 3.8235014770179987, |
| "learning_rate": 8e-05, |
| "loss": -0.0084, |
| "num_tokens": 1099842.0, |
| "reward": 0.7044858932495117, |
| "reward_std": 0.3917056620121002, |
| "rewards/rollout_reward_func/mean": 0.7044858932495117, |
| "rewards/rollout_reward_func/std": 0.4747315049171448, |
| "sampling/importance_sampling_ratio/max": 1.07483971118927, |
| "sampling/importance_sampling_ratio/mean": 1.0014500617980957, |
| "sampling/importance_sampling_ratio/min": 0.9500851035118103, |
| "sampling/sampling_logp_difference/max": 0.12000381201505661, |
| "sampling/sampling_logp_difference/mean": 0.004348148591816425, |
| "step": 29, |
| "step_time": 10.580135002999441 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 914.0, |
| "completions/max_terminated_length": 914.0, |
| "completions/mean_length": 231.90625, |
| "completions/mean_terminated_length": 231.90625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.05673399774605059, |
| "epoch": 0.0006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.9880616664886475, |
| "kl": 22.03186371177435, |
| "learning_rate": 8.285714285714287e-05, |
| "loss": 0.0294, |
| "num_tokens": 1136891.0, |
| "reward": 0.7692453861236572, |
| "reward_std": 0.40567266941070557, |
| "rewards/rollout_reward_func/mean": 0.7692453861236572, |
| "rewards/rollout_reward_func/std": 0.5713350176811218, |
| "sampling/importance_sampling_ratio/max": 1.5409276485443115, |
| "sampling/importance_sampling_ratio/mean": 0.9707353115081787, |
| "sampling/importance_sampling_ratio/min": 0.3624197244644165, |
| "sampling/sampling_logp_difference/max": 1.0131800174713135, |
| "sampling/sampling_logp_difference/mean": 0.02174563892185688, |
| "step": 30, |
| "step_time": 11.996624365999878 |
| }, |
| { |
| "clip_ratio/high_max": 0.0078125, |
| "clip_ratio/high_mean": 0.00390625, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010156250093132257, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 930.0, |
| "completions/max_terminated_length": 930.0, |
| "completions/mean_length": 248.90625, |
| "completions/mean_terminated_length": 248.90625, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.16393944306037156, |
| "epoch": 0.00062, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5357746481895447, |
| "kl": 3.937878467142582, |
| "learning_rate": 8.571428571428571e-05, |
| "loss": 0.014, |
| "num_tokens": 1175169.0, |
| "reward": 0.357496440410614, |
| "reward_std": 0.7932678461074829, |
| "rewards/rollout_reward_func/mean": 0.357496440410614, |
| "rewards/rollout_reward_func/std": 0.7959622144699097, |
| "sampling/importance_sampling_ratio/max": 2.3141262531280518, |
| "sampling/importance_sampling_ratio/mean": 0.9407311677932739, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.5570695400238037, |
| "sampling/sampling_logp_difference/mean": 0.04863358661532402, |
| "step": 31, |
| "step_time": 11.595104513999786 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.012500000186264515, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1023.0, |
| "completions/max_terminated_length": 1023.0, |
| "completions/mean_length": 243.15625, |
| "completions/mean_terminated_length": 243.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.18836299496615538, |
| "epoch": 0.00064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19593629240989685, |
| "kl": 3.839852336794138, |
| "learning_rate": 8.857142857142857e-05, |
| "loss": -0.0085, |
| "num_tokens": 1213031.0, |
| "reward": 0.7341962456703186, |
| "reward_std": 0.5838453769683838, |
| "rewards/rollout_reward_func/mean": 0.7341962456703186, |
| "rewards/rollout_reward_func/std": 0.5884881019592285, |
| "sampling/importance_sampling_ratio/max": 1.9368575811386108, |
| "sampling/importance_sampling_ratio/mean": 0.998816728591919, |
| "sampling/importance_sampling_ratio/min": 0.486605167388916, |
| "sampling/sampling_logp_difference/max": 0.6482534408569336, |
| "sampling/sampling_logp_difference/mean": 0.05510255694389343, |
| "step": 32, |
| "step_time": 13.026593125999852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1079.0, |
| "completions/max_terminated_length": 1079.0, |
| "completions/mean_length": 180.9375, |
| "completions/mean_terminated_length": 180.9375, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.19241300441353815, |
| "epoch": 0.00066, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21624083817005157, |
| "kl": 3.650024376809597, |
| "learning_rate": 9.142857142857143e-05, |
| "loss": -0.0102, |
| "num_tokens": 1246957.0, |
| "reward": 0.6980147361755371, |
| "reward_std": 0.6458718776702881, |
| "rewards/rollout_reward_func/mean": 0.6980147361755371, |
| "rewards/rollout_reward_func/std": 0.6438339352607727, |
| "sampling/importance_sampling_ratio/max": 1.5693196058273315, |
| "sampling/importance_sampling_ratio/mean": 1.0197675228118896, |
| "sampling/importance_sampling_ratio/min": 0.33508750796318054, |
| "sampling/sampling_logp_difference/max": 0.8358498811721802, |
| "sampling/sampling_logp_difference/mean": 0.046664539724588394, |
| "step": 33, |
| "step_time": 11.550154542999962 |
| }, |
| { |
| "clip_ratio/high_max": 0.012500000186264515, |
| "clip_ratio/high_mean": 0.0062500000931322575, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 668.0, |
| "completions/max_terminated_length": 668.0, |
| "completions/mean_length": 252.46875, |
| "completions/mean_terminated_length": 252.46875, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.3369806137343403, |
| "epoch": 0.00068, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24248099327087402, |
| "kl": 4.222835153341293, |
| "learning_rate": 9.428571428571429e-05, |
| "loss": -0.0043, |
| "num_tokens": 1286249.0, |
| "reward": 0.5828893184661865, |
| "reward_std": 0.6884247064590454, |
| "rewards/rollout_reward_func/mean": 0.5828893184661865, |
| "rewards/rollout_reward_func/std": 0.7157434821128845, |
| "sampling/importance_sampling_ratio/max": 2.1673381328582764, |
| "sampling/importance_sampling_ratio/mean": 0.952195405960083, |
| "sampling/importance_sampling_ratio/min": 0.5653239488601685, |
| "sampling/sampling_logp_difference/max": 0.8169012069702148, |
| "sampling/sampling_logp_difference/mean": 0.047646619379520416, |
| "step": 34, |
| "step_time": 11.025269679999838 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 548.0, |
| "completions/max_terminated_length": 548.0, |
| "completions/mean_length": 174.75, |
| "completions/mean_terminated_length": 174.75, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.26513355743372813, |
| "epoch": 0.0007, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.12303785234689713, |
| "kl": 4.241589665412903, |
| "learning_rate": 9.714285714285715e-05, |
| "loss": -0.0109, |
| "num_tokens": 1322454.0, |
| "reward": 0.5774887204170227, |
| "reward_std": 0.5838077068328857, |
| "rewards/rollout_reward_func/mean": 0.5774887204170227, |
| "rewards/rollout_reward_func/std": 0.7217473387718201, |
| "sampling/importance_sampling_ratio/max": 2.0283899307250977, |
| "sampling/importance_sampling_ratio/mean": 1.0524414777755737, |
| "sampling/importance_sampling_ratio/min": 0.8173112273216248, |
| "sampling/sampling_logp_difference/max": 0.7269496917724609, |
| "sampling/sampling_logp_difference/mean": 0.027478456497192383, |
| "step": 35, |
| "step_time": 9.609598415999699 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0062500000931322575, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0062500000931322575, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1031.0, |
| "completions/max_terminated_length": 1031.0, |
| "completions/mean_length": 243.75, |
| "completions/mean_terminated_length": 243.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.2274461947963573, |
| "epoch": 0.00072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4030201733112335, |
| "kl": 4.836214572191238, |
| "learning_rate": 0.0001, |
| "loss": -0.0031, |
| "num_tokens": 1360418.0, |
| "reward": 0.45134004950523376, |
| "reward_std": 0.6674649119377136, |
| "rewards/rollout_reward_func/mean": 0.45134004950523376, |
| "rewards/rollout_reward_func/std": 0.6721746325492859, |
| "sampling/importance_sampling_ratio/max": 1.166141390800476, |
| "sampling/importance_sampling_ratio/mean": 0.9575443863868713, |
| "sampling/importance_sampling_ratio/min": 0.4807126522064209, |
| "sampling/sampling_logp_difference/max": 0.5672388076782227, |
| "sampling/sampling_logp_difference/mean": 0.02744535729289055, |
| "step": 36, |
| "step_time": 12.980629156000077 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 558.0, |
| "completions/max_terminated_length": 558.0, |
| "completions/mean_length": 195.125, |
| "completions/mean_terminated_length": 195.125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.17616549076046795, |
| "epoch": 0.00074, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1173086166381836, |
| "kl": 3.182346645742655, |
| "learning_rate": 9.999736485702831e-05, |
| "loss": -0.0185, |
| "num_tokens": 1396929.0, |
| "reward": 0.8315432071685791, |
| "reward_std": 0.3220025300979614, |
| "rewards/rollout_reward_func/mean": 0.8315432071685791, |
| "rewards/rollout_reward_func/std": 0.40297287702560425, |
| "sampling/importance_sampling_ratio/max": 1.438734769821167, |
| "sampling/importance_sampling_ratio/mean": 1.0194497108459473, |
| "sampling/importance_sampling_ratio/min": 0.805536150932312, |
| "sampling/sampling_logp_difference/max": 0.3535919189453125, |
| "sampling/sampling_logp_difference/mean": 0.011818947270512581, |
| "step": 37, |
| "step_time": 9.724833724999371 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1022.0, |
| "completions/max_terminated_length": 1022.0, |
| "completions/mean_length": 309.0, |
| "completions/mean_terminated_length": 309.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.22344195400364697, |
| "epoch": 0.00076, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27476319670677185, |
| "kl": 4.308917306363583, |
| "learning_rate": 9.998945979845876e-05, |
| "loss": 0.0127, |
| "num_tokens": 1437079.0, |
| "reward": 0.6767849922180176, |
| "reward_std": 0.5794224143028259, |
| "rewards/rollout_reward_func/mean": 0.6767849922180176, |
| "rewards/rollout_reward_func/std": 0.6077510714530945, |
| "sampling/importance_sampling_ratio/max": 1.5739073753356934, |
| "sampling/importance_sampling_ratio/mean": 1.0125946998596191, |
| "sampling/importance_sampling_ratio/min": 0.7139722108840942, |
| "sampling/sampling_logp_difference/max": 0.46108484268188477, |
| "sampling/sampling_logp_difference/mean": 0.02042299136519432, |
| "step": 38, |
| "step_time": 12.233769962999759 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 608.0, |
| "completions/max_terminated_length": 608.0, |
| "completions/mean_length": 232.28125, |
| "completions/mean_terminated_length": 232.28125, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.21799559774808586, |
| "epoch": 0.00078, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.292609840631485, |
| "kl": 3.502250585705042, |
| "learning_rate": 9.997628593527586e-05, |
| "loss": -0.0016, |
| "num_tokens": 1475190.0, |
| "reward": 0.6436625123023987, |
| "reward_std": 0.7471475005149841, |
| "rewards/rollout_reward_func/mean": 0.6436625123023987, |
| "rewards/rollout_reward_func/std": 0.7162776589393616, |
| "sampling/importance_sampling_ratio/max": 1.1646549701690674, |
| "sampling/importance_sampling_ratio/mean": 0.9416862726211548, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 1.2828545570373535, |
| "sampling/sampling_logp_difference/mean": 0.02604352869093418, |
| "step": 39, |
| "step_time": 10.37775418499973 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 658.0, |
| "completions/max_terminated_length": 658.0, |
| "completions/mean_length": 186.8125, |
| "completions/mean_terminated_length": 186.8125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.1057002441957593, |
| "epoch": 0.0008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6740612387657166, |
| "kl": 4.765574503690004, |
| "learning_rate": 9.995784511894694e-05, |
| "loss": -0.0092, |
| "num_tokens": 1510583.0, |
| "reward": 0.5121347904205322, |
| "reward_std": 0.5755149126052856, |
| "rewards/rollout_reward_func/mean": 0.5121347904205322, |
| "rewards/rollout_reward_func/std": 0.5685732364654541, |
| "sampling/importance_sampling_ratio/max": 2.6313016414642334, |
| "sampling/importance_sampling_ratio/mean": 1.0894813537597656, |
| "sampling/importance_sampling_ratio/min": 0.952610433101654, |
| "sampling/sampling_logp_difference/max": 0.9827308654785156, |
| "sampling/sampling_logp_difference/mean": 0.023295916616916656, |
| "step": 40, |
| "step_time": 10.425912250000465 |
| }, |
| { |
| "clip_ratio/high_max": 0.0069444444961845875, |
| "clip_ratio/high_mean": 0.0034722222480922937, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0034722222480922937, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1387.0, |
| "completions/max_terminated_length": 1387.0, |
| "completions/mean_length": 186.34375, |
| "completions/mean_terminated_length": 186.34375, |
| "completions/min_length": 2.0, |
| "completions/min_terminated_length": 2.0, |
| "entropy": 0.15000496682478115, |
| "epoch": 0.00082, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2697708010673523, |
| "kl": 4.247643496841192, |
| "learning_rate": 9.993413994116206e-05, |
| "loss": -0.0199, |
| "num_tokens": 1545590.0, |
| "reward": 0.6390933990478516, |
| "reward_std": 0.6065553426742554, |
| "rewards/rollout_reward_func/mean": 0.6390933990478516, |
| "rewards/rollout_reward_func/std": 0.6180797815322876, |
| "sampling/importance_sampling_ratio/max": 2.1225719451904297, |
| "sampling/importance_sampling_ratio/mean": 0.9938382506370544, |
| "sampling/importance_sampling_ratio/min": 0.4029119610786438, |
| "sampling/sampling_logp_difference/max": 0.9079653024673462, |
| "sampling/sampling_logp_difference/mean": 0.04717833548784256, |
| "step": 41, |
| "step_time": 13.721549190000815 |
| }, |
| { |
| "clip_ratio/high_max": 0.0017857142956927419, |
| "clip_ratio/high_mean": 0.0008928571478463709, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0008928571478463709, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 178.8125, |
| "completions/mean_terminated_length": 178.8125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.09959755872841924, |
| "epoch": 0.00084, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18356552720069885, |
| "kl": 4.275203719735146, |
| "learning_rate": 9.990517373346957e-05, |
| "loss": 0.0207, |
| "num_tokens": 1580072.0, |
| "reward": 0.670920729637146, |
| "reward_std": 0.49966514110565186, |
| "rewards/rollout_reward_func/mean": 0.670920729637146, |
| "rewards/rollout_reward_func/std": 0.47837620973587036, |
| "sampling/importance_sampling_ratio/max": 1.096825122833252, |
| "sampling/importance_sampling_ratio/mean": 0.9490351676940918, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 0.6992363929748535, |
| "sampling/sampling_logp_difference/mean": 0.018723629415035248, |
| "step": 42, |
| "step_time": 13.593521437000163 |
| }, |
| { |
| "clip_ratio/high_max": 0.012202381272800267, |
| "clip_ratio/high_mean": 0.006101190636400133, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006101190636400133, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 579.0, |
| "completions/max_terminated_length": 579.0, |
| "completions/mean_length": 125.875, |
| "completions/mean_terminated_length": 125.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.13518582028336823, |
| "epoch": 0.00086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.5788092613220215, |
| "kl": 4.227362431585789, |
| "learning_rate": 9.98709505668081e-05, |
| "loss": -0.016, |
| "num_tokens": 1612885.0, |
| "reward": 0.7634040117263794, |
| "reward_std": 0.4348216652870178, |
| "rewards/rollout_reward_func/mean": 0.7634040117263794, |
| "rewards/rollout_reward_func/std": 0.4481413960456848, |
| "sampling/importance_sampling_ratio/max": 1.1939719915390015, |
| "sampling/importance_sampling_ratio/mean": 0.9179114699363708, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 0.5640859603881836, |
| "sampling/sampling_logp_difference/mean": 0.029528968036174774, |
| "step": 43, |
| "step_time": 12.544377819000147 |
| }, |
| { |
| "clip_ratio/high_max": 0.010416666977107525, |
| "clip_ratio/high_mean": 0.0052083334885537624, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0052083334885537624, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 236.71875, |
| "completions/mean_terminated_length": 236.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.21137235511559993, |
| "epoch": 0.00088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37624236941337585, |
| "kl": 5.903023138642311, |
| "learning_rate": 9.983147525093428e-05, |
| "loss": -0.0268, |
| "num_tokens": 1651371.0, |
| "reward": 0.6768415570259094, |
| "reward_std": 0.49680012464523315, |
| "rewards/rollout_reward_func/mean": 0.6768415570259094, |
| "rewards/rollout_reward_func/std": 0.484732449054718, |
| "sampling/importance_sampling_ratio/max": 1.658105731010437, |
| "sampling/importance_sampling_ratio/mean": 1.0146586894989014, |
| "sampling/importance_sampling_ratio/min": 0.6395002603530884, |
| "sampling/sampling_logp_difference/max": 0.44228410720825195, |
| "sampling/sampling_logp_difference/mean": 0.032690562307834625, |
| "step": 44, |
| "step_time": 13.895338261000461 |
| }, |
| { |
| "clip_ratio/high_max": 0.010714286123402417, |
| "clip_ratio/high_mean": 0.005357143061701208, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005357143061701208, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 629.0, |
| "completions/max_terminated_length": 629.0, |
| "completions/mean_length": 265.65625, |
| "completions/mean_terminated_length": 265.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.34391735191456974, |
| "epoch": 0.0009, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2791600823402405, |
| "kl": 4.59170226752758, |
| "learning_rate": 9.978675333374685e-05, |
| "loss": -0.0415, |
| "num_tokens": 1690386.0, |
| "reward": 0.8033539056777954, |
| "reward_std": 0.4299496114253998, |
| "rewards/rollout_reward_func/mean": 0.8033539056777954, |
| "rewards/rollout_reward_func/std": 0.42693132162094116, |
| "sampling/importance_sampling_ratio/max": 1.4508812427520752, |
| "sampling/importance_sampling_ratio/mean": 0.9076820015907288, |
| "sampling/importance_sampling_ratio/min": 0.0, |
| "sampling/sampling_logp_difference/max": 0.5162277221679688, |
| "sampling/sampling_logp_difference/mean": 0.043557487428188324, |
| "step": 45, |
| "step_time": 12.850337384999875 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 231.5625, |
| "completions/mean_terminated_length": 238.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.20788781438022852, |
| "epoch": 0.00092, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45684507489204407, |
| "kl": 4.819624692201614, |
| "learning_rate": 9.973679110050689e-05, |
| "loss": 0.0593, |
| "num_tokens": 1727749.0, |
| "reward": 0.5148816704750061, |
| "reward_std": 0.5777592658996582, |
| "rewards/rollout_reward_func/mean": 0.5148816704750061, |
| "rewards/rollout_reward_func/std": 0.5782047510147095, |
| "sampling/importance_sampling_ratio/max": 1.0983736515045166, |
| "sampling/importance_sampling_ratio/mean": 0.945776104927063, |
| "sampling/importance_sampling_ratio/min": 0.5659478306770325, |
| "sampling/sampling_logp_difference/max": 0.5768914222717285, |
| "sampling/sampling_logp_difference/mean": 0.0224138256162405, |
| "step": 46, |
| "step_time": 14.142768939000007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 192.5625, |
| "completions/mean_terminated_length": 197.74192810058594, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.20947814546525478, |
| "epoch": 0.00094, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9501240849494934, |
| "kl": 4.3865058943629265, |
| "learning_rate": 9.968159557295458e-05, |
| "loss": 0.1084, |
| "num_tokens": 1762294.0, |
| "reward": 0.7052456140518188, |
| "reward_std": 0.5941587686538696, |
| "rewards/rollout_reward_func/mean": 0.7052456140518188, |
| "rewards/rollout_reward_func/std": 0.5893100500106812, |
| "sampling/importance_sampling_ratio/max": 1.3676362037658691, |
| "sampling/importance_sampling_ratio/mean": 0.9286255836486816, |
| "sampling/importance_sampling_ratio/min": 0.6919541954994202, |
| "sampling/sampling_logp_difference/max": 0.6358003616333008, |
| "sampling/sampling_logp_difference/mean": 0.03289835900068283, |
| "step": 47, |
| "step_time": 12.296653302000095 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.010416666977107525, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010416666977107525, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 997.0, |
| "completions/max_terminated_length": 997.0, |
| "completions/mean_length": 231.25, |
| "completions/mean_terminated_length": 231.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.10634760372340679, |
| "epoch": 0.00096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5390028953552246, |
| "kl": 4.756238490343094, |
| "learning_rate": 9.962117450832225e-05, |
| "loss": -0.001, |
| "num_tokens": 1798402.0, |
| "reward": 0.7353510856628418, |
| "reward_std": 0.542205810546875, |
| "rewards/rollout_reward_func/mean": 0.7353510856628418, |
| "rewards/rollout_reward_func/std": 0.6517592072486877, |
| "sampling/importance_sampling_ratio/max": 2.089435577392578, |
| "sampling/importance_sampling_ratio/mean": 0.9923274517059326, |
| "sampling/importance_sampling_ratio/min": 0.5558032989501953, |
| "sampling/sampling_logp_difference/max": 0.7373199462890625, |
| "sampling/sampling_logp_difference/mean": 0.030953995883464813, |
| "step": 48, |
| "step_time": 14.2758760239999 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 237.375, |
| "completions/mean_terminated_length": 237.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0647038493771106, |
| "epoch": 0.00098, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.09670515358448029, |
| "kl": 4.974646285176277, |
| "learning_rate": 9.955553639824423e-05, |
| "loss": -0.0057, |
| "num_tokens": 1836399.0, |
| "reward": 0.6488252282142639, |
| "reward_std": 0.6594193577766418, |
| "rewards/rollout_reward_func/mean": 0.6488252282142639, |
| "rewards/rollout_reward_func/std": 0.6682618856430054, |
| "sampling/importance_sampling_ratio/max": 1.2080090045928955, |
| "sampling/importance_sampling_ratio/mean": 1.002594232559204, |
| "sampling/importance_sampling_ratio/min": 0.8824653625488281, |
| "sampling/sampling_logp_difference/max": 0.18907243013381958, |
| "sampling/sampling_logp_difference/mean": 0.007402781397104263, |
| "step": 49, |
| "step_time": 12.638315505999799 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 285.59375, |
| "completions/mean_terminated_length": 285.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.050345817173365504, |
| "epoch": 0.001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.07216052711009979, |
| "kl": 5.428681015968323, |
| "learning_rate": 9.948469046756344e-05, |
| "loss": 0.0006, |
| "num_tokens": 1875978.0, |
| "reward": 0.6179745197296143, |
| "reward_std": 0.5279039740562439, |
| "rewards/rollout_reward_func/mean": 0.6179745197296143, |
| "rewards/rollout_reward_func/std": 0.5638126730918884, |
| "sampling/importance_sampling_ratio/max": 1.2487702369689941, |
| "sampling/importance_sampling_ratio/mean": 0.974506139755249, |
| "sampling/importance_sampling_ratio/min": 0.6416374444961548, |
| "sampling/sampling_logp_difference/max": 0.4437246322631836, |
| "sampling/sampling_logp_difference/mean": 0.01099600363522768, |
| "step": 50, |
| "step_time": 13.79704612400019 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 189.34375, |
| "completions/mean_terminated_length": 189.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.044075825016989256, |
| "epoch": 0.00102, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16175970435142517, |
| "kl": 4.5743416622281075, |
| "learning_rate": 9.940864667303489e-05, |
| "loss": 0.001, |
| "num_tokens": 1910574.0, |
| "reward": 0.67287677526474, |
| "reward_std": 0.5520733594894409, |
| "rewards/rollout_reward_func/mean": 0.67287677526474, |
| "rewards/rollout_reward_func/std": 0.5455199480056763, |
| "sampling/importance_sampling_ratio/max": 1.0763746500015259, |
| "sampling/importance_sampling_ratio/mean": 0.9990264773368835, |
| "sampling/importance_sampling_ratio/min": 0.9405080676078796, |
| "sampling/sampling_logp_difference/max": 0.07361352443695068, |
| "sampling/sampling_logp_difference/mean": 0.0017119484255090356, |
| "step": 51, |
| "step_time": 12.270658274999732 |
| }, |
| { |
| "clip_ratio/high_max": 0.010416666977107525, |
| "clip_ratio/high_mean": 0.0052083334885537624, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0052083334885537624, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 298.96875, |
| "completions/mean_terminated_length": 298.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.05881836503976956, |
| "epoch": 0.00104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.454397052526474, |
| "kl": 5.88704876601696, |
| "learning_rate": 9.932741570192633e-05, |
| "loss": -0.003, |
| "num_tokens": 1951224.0, |
| "reward": 0.5557973384857178, |
| "reward_std": 0.7397861480712891, |
| "rewards/rollout_reward_func/mean": 0.5557973384857178, |
| "rewards/rollout_reward_func/std": 0.7254597544670105, |
| "sampling/importance_sampling_ratio/max": 1.504248023033142, |
| "sampling/importance_sampling_ratio/mean": 1.0347909927368164, |
| "sampling/importance_sampling_ratio/min": 0.7870653867721558, |
| "sampling/sampling_logp_difference/max": 0.4082956314086914, |
| "sampling/sampling_logp_difference/mean": 0.00977290328592062, |
| "step": 52, |
| "step_time": 13.339979744000175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 245.40625, |
| "completions/mean_terminated_length": 245.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.04109222920851607, |
| "epoch": 0.00106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.10122750699520111, |
| "kl": 5.1483363807201385, |
| "learning_rate": 9.924100897051629e-05, |
| "loss": 0.0049, |
| "num_tokens": 1988931.0, |
| "reward": 0.6121540665626526, |
| "reward_std": 0.5229529142379761, |
| "rewards/rollout_reward_func/mean": 0.6121540665626526, |
| "rewards/rollout_reward_func/std": 0.5000297427177429, |
| "sampling/importance_sampling_ratio/max": 1.0420476198196411, |
| "sampling/importance_sampling_ratio/mean": 0.9965465068817139, |
| "sampling/importance_sampling_ratio/min": 0.8893992900848389, |
| "sampling/sampling_logp_difference/max": 0.11720812320709229, |
| "sampling/sampling_logp_difference/mean": 0.0017938524251803756, |
| "step": 53, |
| "step_time": 12.912300477000372 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 193.15625, |
| "completions/mean_terminated_length": 193.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.023224864701660408, |
| "epoch": 0.00108, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.04124576970934868, |
| "kl": 4.823809951543808, |
| "learning_rate": 9.914943862248966e-05, |
| "loss": 0.0063, |
| "num_tokens": 2024848.0, |
| "reward": 0.7362499833106995, |
| "reward_std": 0.4534528851509094, |
| "rewards/rollout_reward_func/mean": 0.7362499833106995, |
| "rewards/rollout_reward_func/std": 0.4570716321468353, |
| "sampling/importance_sampling_ratio/max": 1.0256677865982056, |
| "sampling/importance_sampling_ratio/mean": 0.9978145360946655, |
| "sampling/importance_sampling_ratio/min": 0.8835611939430237, |
| "sampling/sampling_logp_difference/max": 0.12363004684448242, |
| "sampling/sampling_logp_difference/mean": 0.0014609881909564137, |
| "step": 54, |
| "step_time": 12.811207481999872 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 199.0, |
| "completions/mean_terminated_length": 199.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.02716700900498381, |
| "epoch": 0.0011, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.01915285922586918, |
| "kl": 4.195980899035931, |
| "learning_rate": 9.905271752723088e-05, |
| "loss": 0.0038, |
| "num_tokens": 2060220.0, |
| "reward": 0.7684207558631897, |
| "reward_std": 0.4268929362297058, |
| "rewards/rollout_reward_func/mean": 0.7684207558631897, |
| "rewards/rollout_reward_func/std": 0.44239601492881775, |
| "sampling/importance_sampling_ratio/max": 1.024228811264038, |
| "sampling/importance_sampling_ratio/mean": 1.0001416206359863, |
| "sampling/importance_sampling_ratio/min": 0.9879773259162903, |
| "sampling/sampling_logp_difference/max": 0.023940302431583405, |
| "sampling/sampling_logp_difference/mean": 0.00064814742654562, |
| "step": 55, |
| "step_time": 13.081034389999559 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 213.0, |
| "completions/mean_terminated_length": 213.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.03577808013187678, |
| "epoch": 0.00112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14266906678676605, |
| "kl": 5.262552082538605, |
| "learning_rate": 9.895085927801542e-05, |
| "loss": -0.0074, |
| "num_tokens": 2097723.0, |
| "reward": 0.5815123319625854, |
| "reward_std": 0.5697444081306458, |
| "rewards/rollout_reward_func/mean": 0.5815123319625854, |
| "rewards/rollout_reward_func/std": 0.5674311518669128, |
| "sampling/importance_sampling_ratio/max": 1.1195484399795532, |
| "sampling/importance_sampling_ratio/mean": 1.004412055015564, |
| "sampling/importance_sampling_ratio/min": 0.9843540191650391, |
| "sampling/sampling_logp_difference/max": 0.11651325225830078, |
| "sampling/sampling_logp_difference/mean": 0.0017534032231196761, |
| "step": 56, |
| "step_time": 11.058078823999722 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 290.34375, |
| "completions/mean_terminated_length": 290.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.027698776671741143, |
| "epoch": 0.00114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0155794033780694, |
| "kl": 5.111991301178932, |
| "learning_rate": 9.884387819009922e-05, |
| "loss": -0.007, |
| "num_tokens": 2137901.0, |
| "reward": 0.7130915522575378, |
| "reward_std": 0.3993915915489197, |
| "rewards/rollout_reward_func/mean": 0.7130915522575378, |
| "rewards/rollout_reward_func/std": 0.4723998010158539, |
| "sampling/importance_sampling_ratio/max": 1.0058835744857788, |
| "sampling/importance_sampling_ratio/mean": 0.9984175562858582, |
| "sampling/importance_sampling_ratio/min": 0.9888657331466675, |
| "sampling/sampling_logp_difference/max": 0.01078471913933754, |
| "sampling/sampling_logp_difference/mean": 0.0008065314614214003, |
| "step": 57, |
| "step_time": 11.11290242799987 |
| }, |
| { |
| "clip_ratio/high_max": 0.010416666977107525, |
| "clip_ratio/high_mean": 0.0052083334885537624, |
| "clip_ratio/low_mean": 0.0052083334885537624, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.010416666977107525, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 223.625, |
| "completions/mean_terminated_length": 223.625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.02802778462881861, |
| "epoch": 0.00116, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.09351195394992828, |
| "kl": 4.502256289124489, |
| "learning_rate": 9.873178929870695e-05, |
| "loss": 0.0007, |
| "num_tokens": 2175660.0, |
| "reward": 0.6445872187614441, |
| "reward_std": 0.5539962649345398, |
| "rewards/rollout_reward_func/mean": 0.6445872187614441, |
| "rewards/rollout_reward_func/std": 0.5525522828102112, |
| "sampling/importance_sampling_ratio/max": 1.131099820137024, |
| "sampling/importance_sampling_ratio/mean": 0.9855086803436279, |
| "sampling/importance_sampling_ratio/min": 0.6534705758094788, |
| "sampling/sampling_logp_difference/max": 0.4254617691040039, |
| "sampling/sampling_logp_difference/mean": 0.007256423123180866, |
| "step": 58, |
| "step_time": 11.697415718000002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 286.8125, |
| "completions/mean_terminated_length": 286.8125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.014620267960026467, |
| "epoch": 0.00118, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.020578376948833466, |
| "kl": 5.063114173710346, |
| "learning_rate": 9.86146083569188e-05, |
| "loss": 0.007, |
| "num_tokens": 2215676.0, |
| "reward": 0.6477622985839844, |
| "reward_std": 0.4987693727016449, |
| "rewards/rollout_reward_func/mean": 0.6477622985839844, |
| "rewards/rollout_reward_func/std": 0.4924916923046112, |
| "sampling/importance_sampling_ratio/max": 1.0271024703979492, |
| "sampling/importance_sampling_ratio/mean": 0.9996525049209595, |
| "sampling/importance_sampling_ratio/min": 0.9902810454368591, |
| "sampling/sampling_logp_difference/max": 0.026741325855255127, |
| "sampling/sampling_logp_difference/mean": 0.0005729912081733346, |
| "step": 59, |
| "step_time": 10.581422125000472 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 232.03125, |
| "completions/mean_terminated_length": 232.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.006479981271610313, |
| "epoch": 0.0012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.007562472950667143, |
| "kl": 5.7793563306331635, |
| "learning_rate": 9.84923518334567e-05, |
| "loss": 0.0099, |
| "num_tokens": 2253158.0, |
| "reward": 0.579670786857605, |
| "reward_std": 0.4654841423034668, |
| "rewards/rollout_reward_func/mean": 0.579670786857605, |
| "rewards/rollout_reward_func/std": 0.501867949962616, |
| "sampling/importance_sampling_ratio/max": 1.016674280166626, |
| "sampling/importance_sampling_ratio/mean": 1.0001671314239502, |
| "sampling/importance_sampling_ratio/min": 0.9948341846466064, |
| "sampling/sampling_logp_difference/max": 0.0165358018130064, |
| "sampling/sampling_logp_difference/mean": 0.0005334613961167634, |
| "step": 60, |
| "step_time": 11.516399574999923 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 304.8125, |
| "completions/mean_terminated_length": 304.8125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.01002441996001835, |
| "epoch": 0.00122, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2283029556274414, |
| "kl": 5.093721926212311, |
| "learning_rate": 9.83650369103696e-05, |
| "loss": 0.0015, |
| "num_tokens": 2293842.0, |
| "reward": 0.7455748319625854, |
| "reward_std": 0.5360188484191895, |
| "rewards/rollout_reward_func/mean": 0.7455748319625854, |
| "rewards/rollout_reward_func/std": 0.5213343501091003, |
| "sampling/importance_sampling_ratio/max": 1.000532865524292, |
| "sampling/importance_sampling_ratio/mean": 0.9848321676254272, |
| "sampling/importance_sampling_ratio/min": 0.5504863858222961, |
| "sampling/sampling_logp_difference/max": 0.5969529151916504, |
| "sampling/sampling_logp_difference/mean": 0.004100325983017683, |
| "step": 61, |
| "step_time": 10.772246324000207 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 169.75, |
| "completions/mean_terminated_length": 169.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.005074412345948076, |
| "epoch": 0.00124, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.011714676395058632, |
| "kl": 4.614134136587381, |
| "learning_rate": 9.823268148061883e-05, |
| "loss": -0.0063, |
| "num_tokens": 2328593.0, |
| "reward": 0.7350000143051147, |
| "reward_std": 0.468832790851593, |
| "rewards/rollout_reward_func/mean": 0.7350000143051147, |
| "rewards/rollout_reward_func/std": 0.46188950538635254, |
| "sampling/importance_sampling_ratio/max": 1.0447978973388672, |
| "sampling/importance_sampling_ratio/mean": 1.0014455318450928, |
| "sampling/importance_sampling_ratio/min": 0.9953779578208923, |
| "sampling/sampling_logp_difference/max": 0.04382299259305, |
| "sampling/sampling_logp_difference/mean": 0.0006599511252716184, |
| "step": 62, |
| "step_time": 10.030213335000326 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 239.4375, |
| "completions/mean_terminated_length": 239.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0035096063779747055, |
| "epoch": 0.00126, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.002542179776355624, |
| "kl": 5.135300308465958, |
| "learning_rate": 9.809530414556335e-05, |
| "loss": 0.0115, |
| "num_tokens": 2366527.0, |
| "reward": 0.7065123319625854, |
| "reward_std": 0.47867923974990845, |
| "rewards/rollout_reward_func/mean": 0.7065123319625854, |
| "rewards/rollout_reward_func/std": 0.4686340391635895, |
| "sampling/importance_sampling_ratio/max": 1.0115028619766235, |
| "sampling/importance_sampling_ratio/mean": 1.0001249313354492, |
| "sampling/importance_sampling_ratio/min": 0.9947195053100586, |
| "sampling/sampling_logp_difference/max": 0.011437196284532547, |
| "sampling/sampling_logp_difference/mean": 0.00027378558297641575, |
| "step": 63, |
| "step_time": 11.122873709000032 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 232.09375, |
| "completions/mean_terminated_length": 232.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0033450635638700987, |
| "epoch": 0.00128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.005013711750507355, |
| "kl": 4.000352640869096, |
| "learning_rate": 9.79529242123455e-05, |
| "loss": -0.0043, |
| "num_tokens": 2403642.0, |
| "reward": 0.7711831331253052, |
| "reward_std": 0.41131600737571716, |
| "rewards/rollout_reward_func/mean": 0.7711831331253052, |
| "rewards/rollout_reward_func/std": 0.5058895945549011, |
| "sampling/importance_sampling_ratio/max": 1.0016292333602905, |
| "sampling/importance_sampling_ratio/mean": 0.9997192025184631, |
| "sampling/importance_sampling_ratio/min": 0.9981485605239868, |
| "sampling/sampling_logp_difference/max": 0.0019368196371942759, |
| "sampling/sampling_logp_difference/mean": 0.00012640230124816298, |
| "step": 64, |
| "step_time": 10.301682713000446 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 217.21875, |
| "completions/mean_terminated_length": 217.21875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0023427301867116057, |
| "epoch": 0.0013, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0033939932473003864, |
| "kl": 5.74190029501915, |
| "learning_rate": 9.780556169117757e-05, |
| "loss": -0.0099, |
| "num_tokens": 2442962.0, |
| "reward": 0.6796708106994629, |
| "reward_std": 0.5027361512184143, |
| "rewards/rollout_reward_func/mean": 0.6796708106994629, |
| "rewards/rollout_reward_func/std": 0.4920700490474701, |
| "sampling/importance_sampling_ratio/max": 1.0012661218643188, |
| "sampling/importance_sampling_ratio/mean": 0.9997900724411011, |
| "sampling/importance_sampling_ratio/min": 0.9958721399307251, |
| "sampling/sampling_logp_difference/max": 0.004124820698052645, |
| "sampling/sampling_logp_difference/mean": 0.00011377107148291543, |
| "step": 65, |
| "step_time": 11.305019645000584 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 274.75, |
| "completions/mean_terminated_length": 274.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0013343448747491493, |
| "epoch": 0.00132, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0006489777588285506, |
| "kl": 4.528398025780916, |
| "learning_rate": 9.765323729252955e-05, |
| "loss": 0.0078, |
| "num_tokens": 2482153.0, |
| "reward": 0.7402623295783997, |
| "reward_std": 0.3541877269744873, |
| "rewards/rollout_reward_func/mean": 0.7402623295783997, |
| "rewards/rollout_reward_func/std": 0.45108354091644287, |
| "sampling/importance_sampling_ratio/max": 1.0007672309875488, |
| "sampling/importance_sampling_ratio/mean": 0.9998499155044556, |
| "sampling/importance_sampling_ratio/min": 0.9979432821273804, |
| "sampling/sampling_logp_difference/max": 0.0020514577627182007, |
| "sampling/sampling_logp_difference/mean": 5.5366330343531445e-05, |
| "step": 66, |
| "step_time": 12.143032751000646 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 199.90625, |
| "completions/mean_terminated_length": 199.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0006031800883192773, |
| "epoch": 0.00134, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00118810695130378, |
| "kl": 4.914457447826862, |
| "learning_rate": 9.749597242421838e-05, |
| "loss": 0.0153, |
| "num_tokens": 2518141.0, |
| "reward": 0.6096707582473755, |
| "reward_std": 0.5082674622535706, |
| "rewards/rollout_reward_func/mean": 0.6096707582473755, |
| "rewards/rollout_reward_func/std": 0.4953238368034363, |
| "sampling/importance_sampling_ratio/max": 1.000567078590393, |
| "sampling/importance_sampling_ratio/mean": 1.000020980834961, |
| "sampling/importance_sampling_ratio/min": 0.9995952248573303, |
| "sampling/sampling_logp_difference/max": 0.0005287290550768375, |
| "sampling/sampling_logp_difference/mean": 2.2719808839610778e-05, |
| "step": 67, |
| "step_time": 13.364112795999972 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 196.71875, |
| "completions/mean_terminated_length": 196.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0006413085729377599, |
| "epoch": 0.00136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0004711895599029958, |
| "kl": 4.558197975158691, |
| "learning_rate": 9.733378918839942e-05, |
| "loss": 0.0142, |
| "num_tokens": 2553662.0, |
| "reward": 0.7346707582473755, |
| "reward_std": 0.4728625416755676, |
| "rewards/rollout_reward_func/mean": 0.7346707582473755, |
| "rewards/rollout_reward_func/std": 0.453955739736557, |
| "sampling/importance_sampling_ratio/max": 1.0002586841583252, |
| "sampling/importance_sampling_ratio/mean": 0.9999353289604187, |
| "sampling/importance_sampling_ratio/min": 0.9994540214538574, |
| "sampling/sampling_logp_difference/max": 0.0005491760093718767, |
| "sampling/sampling_logp_difference/mean": 2.3225420591188595e-05, |
| "step": 68, |
| "step_time": 12.225320441999429 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 276.5625, |
| "completions/mean_terminated_length": 276.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0007257235612314616, |
| "epoch": 0.00138, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00036485324380919337, |
| "kl": 4.824105702340603, |
| "learning_rate": 9.716671037846007e-05, |
| "loss": 0.0176, |
| "num_tokens": 2592569.0, |
| "reward": 0.7393414974212646, |
| "reward_std": 0.4584631323814392, |
| "rewards/rollout_reward_func/mean": 0.7393414974212646, |
| "rewards/rollout_reward_func/std": 0.45201003551483154, |
| "sampling/importance_sampling_ratio/max": 1.0006366968154907, |
| "sampling/importance_sampling_ratio/mean": 0.9999854564666748, |
| "sampling/importance_sampling_ratio/min": 0.9991105198860168, |
| "sampling/sampling_logp_difference/max": 0.0008880097884684801, |
| "sampling/sampling_logp_difference/mean": 3.5278186260256916e-05, |
| "step": 69, |
| "step_time": 13.64757463699948 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 212.9375, |
| "completions/mean_terminated_length": 212.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0004797347238962857, |
| "epoch": 0.0014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0005533385556191206, |
| "kl": 5.058364823460579, |
| "learning_rate": 9.699475947581644e-05, |
| "loss": -0.0099, |
| "num_tokens": 2629923.0, |
| "reward": 0.7068415284156799, |
| "reward_std": 0.3828880488872528, |
| "rewards/rollout_reward_func/mean": 0.7068415284156799, |
| "rewards/rollout_reward_func/std": 0.4743438959121704, |
| "sampling/importance_sampling_ratio/max": 1.000449538230896, |
| "sampling/importance_sampling_ratio/mean": 1.0000323057174683, |
| "sampling/importance_sampling_ratio/min": 0.9997754096984863, |
| "sampling/sampling_logp_difference/max": 0.00045936531387269497, |
| "sampling/sampling_logp_difference/mean": 1.7823947928263806e-05, |
| "step": 70, |
| "step_time": 12.950889879000442 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 178.4375, |
| "completions/mean_terminated_length": 178.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00032274034408885655, |
| "epoch": 0.00142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00046846744953654706, |
| "kl": 4.175679575651884, |
| "learning_rate": 9.681796064661319e-05, |
| "loss": -0.0097, |
| "num_tokens": 2664803.0, |
| "reward": 0.9249999523162842, |
| "reward_std": 0.21163278818130493, |
| "rewards/rollout_reward_func/mean": 0.9249999523162842, |
| "rewards/rollout_reward_func/std": 0.2942458689212799, |
| "sampling/importance_sampling_ratio/max": 1.0001081228256226, |
| "sampling/importance_sampling_ratio/mean": 0.9999797344207764, |
| "sampling/importance_sampling_ratio/min": 0.9997093081474304, |
| "sampling/sampling_logp_difference/max": 0.0002909002359956503, |
| "sampling/sampling_logp_difference/mean": 1.2190073903184384e-05, |
| "step": 71, |
| "step_time": 12.77516862999937 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 176.96875, |
| "completions/mean_terminated_length": 176.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00026604766155458037, |
| "epoch": 0.00144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00023387807596009225, |
| "kl": 5.009636074304581, |
| "learning_rate": 9.663633873832725e-05, |
| "loss": -0.0061, |
| "num_tokens": 2700946.0, |
| "reward": 0.7043415307998657, |
| "reward_std": 0.4547346234321594, |
| "rewards/rollout_reward_func/mean": 0.7043415307998657, |
| "rewards/rollout_reward_func/std": 0.47734788060188293, |
| "sampling/importance_sampling_ratio/max": 1.000250220298767, |
| "sampling/importance_sampling_ratio/mean": 1.0000195503234863, |
| "sampling/importance_sampling_ratio/min": 0.9998472929000854, |
| "sampling/sampling_logp_difference/max": 0.000253763806540519, |
| "sampling/sampling_logp_difference/mean": 9.226798283634707e-06, |
| "step": 72, |
| "step_time": 12.709148162000474 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 249.34375, |
| "completions/mean_terminated_length": 249.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00020393275298147273, |
| "epoch": 0.00146, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001513528113719076, |
| "kl": 4.2001279490068555, |
| "learning_rate": 9.644991927627566e-05, |
| "loss": 0.0157, |
| "num_tokens": 2738134.0, |
| "reward": 0.7374999523162842, |
| "reward_std": 0.4485630393028259, |
| "rewards/rollout_reward_func/mean": 0.7374999523162842, |
| "rewards/rollout_reward_func/std": 0.45219889283180237, |
| "sampling/importance_sampling_ratio/max": 1.0001038312911987, |
| "sampling/importance_sampling_ratio/mean": 1.000006914138794, |
| "sampling/importance_sampling_ratio/min": 0.9998868107795715, |
| "sampling/sampling_logp_difference/max": 0.0001100691151805222, |
| "sampling/sampling_logp_difference/mean": 4.743244971905369e-06, |
| "step": 73, |
| "step_time": 13.106646253999997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 263.4375, |
| "completions/mean_terminated_length": 263.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00018786232286061022, |
| "epoch": 0.00148, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001859942130977288, |
| "kl": 5.355363771319389, |
| "learning_rate": 9.625872846002834e-05, |
| "loss": 0.0084, |
| "num_tokens": 2777654.0, |
| "reward": 0.6146707534790039, |
| "reward_std": 0.5113980770111084, |
| "rewards/rollout_reward_func/mean": 0.6146707534790039, |
| "rewards/rollout_reward_func/std": 0.49887460470199585, |
| "sampling/importance_sampling_ratio/max": 1.0002917051315308, |
| "sampling/importance_sampling_ratio/mean": 1.0000147819519043, |
| "sampling/importance_sampling_ratio/min": 0.9998480081558228, |
| "sampling/sampling_logp_difference/max": 0.0002930494665633887, |
| "sampling/sampling_logp_difference/mean": 8.865463314577937e-06, |
| "step": 74, |
| "step_time": 13.656730500999174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 273.375, |
| "completions/mean_terminated_length": 273.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00020519902130899936, |
| "epoch": 0.0015, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001783866318874061, |
| "kl": 4.964824467897415, |
| "learning_rate": 9.606279315972582e-05, |
| "loss": 0.021, |
| "num_tokens": 2816342.0, |
| "reward": 0.5827623605728149, |
| "reward_std": 0.4826240539550781, |
| "rewards/rollout_reward_func/mean": 0.5827623605728149, |
| "rewards/rollout_reward_func/std": 0.49690043926239014, |
| "sampling/importance_sampling_ratio/max": 1.000037670135498, |
| "sampling/importance_sampling_ratio/mean": 0.9999814033508301, |
| "sampling/importance_sampling_ratio/min": 0.9998151063919067, |
| "sampling/sampling_logp_difference/max": 0.00018129183445125818, |
| "sampling/sampling_logp_difference/mean": 6.284242772380821e-06, |
| "step": 75, |
| "step_time": 12.500332886000024 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 276.46875, |
| "completions/mean_terminated_length": 276.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.000170203812402292, |
| "epoch": 0.00152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00013501929061021656, |
| "kl": 5.58905765414238, |
| "learning_rate": 9.586214091230304e-05, |
| "loss": 0.0046, |
| "num_tokens": 2856279.0, |
| "reward": 0.6765123605728149, |
| "reward_std": 0.4957999885082245, |
| "rewards/rollout_reward_func/mean": 0.6765123605728149, |
| "rewards/rollout_reward_func/std": 0.4851822853088379, |
| "sampling/importance_sampling_ratio/max": 1.0000401735305786, |
| "sampling/importance_sampling_ratio/mean": 0.99998939037323, |
| "sampling/importance_sampling_ratio/min": 0.9998947978019714, |
| "sampling/sampling_logp_difference/max": 8.464550046483055e-05, |
| "sampling/sampling_logp_difference/mean": 4.386237378639635e-06, |
| "step": 76, |
| "step_time": 12.885455153999828 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 227.0, |
| "completions/mean_terminated_length": 227.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00018892053580543688, |
| "epoch": 0.00154, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.705218013143167e-05, |
| "kl": 5.047797460108995, |
| "learning_rate": 9.565679991761914e-05, |
| "loss": 0.0011, |
| "num_tokens": 2894598.0, |
| "reward": 0.5208538770675659, |
| "reward_std": 0.6895849704742432, |
| "rewards/rollout_reward_func/mean": 0.5208538770675659, |
| "rewards/rollout_reward_func/std": 0.668798565864563, |
| "sampling/importance_sampling_ratio/max": 1.0001370906829834, |
| "sampling/importance_sampling_ratio/mean": 1.0000134706497192, |
| "sampling/importance_sampling_ratio/min": 0.9999558925628662, |
| "sampling/sampling_logp_difference/max": 0.0001458361221011728, |
| "sampling/sampling_logp_difference/mean": 6.068152288207784e-06, |
| "step": 77, |
| "step_time": 13.598546733000148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 236.6875, |
| "completions/mean_terminated_length": 236.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00014567816769783803, |
| "epoch": 0.00156, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.185791375581175e-05, |
| "kl": 4.4403251856565475, |
| "learning_rate": 9.544679903449437e-05, |
| "loss": -0.0022, |
| "num_tokens": 2932073.0, |
| "reward": 0.7705915570259094, |
| "reward_std": 0.45292043685913086, |
| "rewards/rollout_reward_func/mean": 0.7705915570259094, |
| "rewards/rollout_reward_func/std": 0.4436517357826233, |
| "sampling/importance_sampling_ratio/max": 1.0000391006469727, |
| "sampling/importance_sampling_ratio/mean": 0.999990701675415, |
| "sampling/importance_sampling_ratio/min": 0.9998592734336853, |
| "sampling/sampling_logp_difference/max": 0.00012469961075112224, |
| "sampling/sampling_logp_difference/mean": 3.843690137728117e-06, |
| "step": 78, |
| "step_time": 12.254022738000003 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 267.46875, |
| "completions/mean_terminated_length": 267.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00017591889431400887, |
| "epoch": 0.00158, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.5074844567570835e-05, |
| "kl": 5.288512364029884, |
| "learning_rate": 9.523216777665409e-05, |
| "loss": -0.0031, |
| "num_tokens": 2971826.0, |
| "reward": 0.6496707201004028, |
| "reward_std": 0.5136716365814209, |
| "rewards/rollout_reward_func/mean": 0.6496707201004028, |
| "rewards/rollout_reward_func/std": 0.49660006165504456, |
| "sampling/importance_sampling_ratio/max": 1.0001001358032227, |
| "sampling/importance_sampling_ratio/mean": 0.9999943375587463, |
| "sampling/importance_sampling_ratio/min": 0.9999160766601562, |
| "sampling/sampling_logp_difference/max": 0.0001301814045291394, |
| "sampling/sampling_logp_difference/mean": 6.416719315893715e-06, |
| "step": 79, |
| "step_time": 13.015393361999713 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 229.65625, |
| "completions/mean_terminated_length": 229.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.0001409291044751626, |
| "epoch": 0.0016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00011761792848119512, |
| "kl": 4.374786123633385, |
| "learning_rate": 9.501293630858103e-05, |
| "loss": -0.0023, |
| "num_tokens": 3009012.0, |
| "reward": 0.7709207534790039, |
| "reward_std": 0.4299285411834717, |
| "rewards/rollout_reward_func/mean": 0.7709207534790039, |
| "rewards/rollout_reward_func/std": 0.44675153493881226, |
| "sampling/importance_sampling_ratio/max": 1.0000734329223633, |
| "sampling/importance_sampling_ratio/mean": 0.9999964237213135, |
| "sampling/importance_sampling_ratio/min": 0.9997733235359192, |
| "sampling/sampling_logp_difference/max": 0.0001550959568703547, |
| "sampling/sampling_logp_difference/mean": 5.282415258989204e-06, |
| "step": 80, |
| "step_time": 12.882759786000634 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 231.40625, |
| "completions/mean_terminated_length": 231.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.748665153352931e-05, |
| "epoch": 0.00162, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.834749077213928e-05, |
| "kl": 4.857630208134651, |
| "learning_rate": 9.478913544127583e-05, |
| "loss": 0.0111, |
| "num_tokens": 3045676.0, |
| "reward": 0.5168415307998657, |
| "reward_std": 0.5236372947692871, |
| "rewards/rollout_reward_func/mean": 0.5168415307998657, |
| "rewards/rollout_reward_func/std": 0.5058324337005615, |
| "sampling/importance_sampling_ratio/max": 1.000028133392334, |
| "sampling/importance_sampling_ratio/mean": 0.9999990463256836, |
| "sampling/importance_sampling_ratio/min": 0.9999288320541382, |
| "sampling/sampling_logp_difference/max": 6.878793647047132e-05, |
| "sampling/sampling_logp_difference/mean": 3.0579262784158345e-06, |
| "step": 81, |
| "step_time": 12.754008866000504 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 213.03125, |
| "completions/mean_terminated_length": 213.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.73558894056714e-05, |
| "epoch": 0.00164, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.405829481082037e-05, |
| "kl": 5.079872742295265, |
| "learning_rate": 9.45607966279269e-05, |
| "loss": -0.0035, |
| "num_tokens": 3082215.0, |
| "reward": 0.6434207558631897, |
| "reward_std": 0.4949180483818054, |
| "rewards/rollout_reward_func/mean": 0.6434207558631897, |
| "rewards/rollout_reward_func/std": 0.49626046419143677, |
| "sampling/importance_sampling_ratio/max": 1.0000373125076294, |
| "sampling/importance_sampling_ratio/mean": 0.9999943971633911, |
| "sampling/importance_sampling_ratio/min": 0.9998199939727783, |
| "sampling/sampling_logp_difference/max": 0.00016332250379491597, |
| "sampling/sampling_logp_difference/mean": 3.4263287034264067e-06, |
| "step": 82, |
| "step_time": 13.59404965099975 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 179.40625, |
| "completions/mean_terminated_length": 179.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.360899558113033e-05, |
| "epoch": 0.00166, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.702681952854618e-05, |
| "kl": 4.518397552892566, |
| "learning_rate": 9.432795195948994e-05, |
| "loss": 0.0149, |
| "num_tokens": 3116089.0, |
| "reward": 0.608420729637146, |
| "reward_std": 0.5539922714233398, |
| "rewards/rollout_reward_func/mean": 0.608420729637146, |
| "rewards/rollout_reward_func/std": 0.5539858341217041, |
| "sampling/importance_sampling_ratio/max": 1.00007164478302, |
| "sampling/importance_sampling_ratio/mean": 0.999993085861206, |
| "sampling/importance_sampling_ratio/min": 0.9998717308044434, |
| "sampling/sampling_logp_difference/max": 0.000115759510663338, |
| "sampling/sampling_logp_difference/mean": 3.67949360224884e-06, |
| "step": 83, |
| "step_time": 12.737816991999352 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 225.125, |
| "completions/mean_terminated_length": 225.125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011713192122897453, |
| "epoch": 0.00168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1548224797006696e-05, |
| "kl": 5.284573458135128, |
| "learning_rate": 9.409063416017778e-05, |
| "loss": 0.0012, |
| "num_tokens": 3154571.0, |
| "reward": 0.7405915260314941, |
| "reward_std": 0.46528252959251404, |
| "rewards/rollout_reward_func/mean": 0.7405915260314941, |
| "rewards/rollout_reward_func/std": 0.4598024785518646, |
| "sampling/importance_sampling_ratio/max": 1.0000370740890503, |
| "sampling/importance_sampling_ratio/mean": 0.9999953508377075, |
| "sampling/importance_sampling_ratio/min": 0.9999337792396545, |
| "sampling/sampling_logp_difference/max": 4.374391573946923e-05, |
| "sampling/sampling_logp_difference/mean": 3.3529299798829015e-06, |
| "step": 84, |
| "step_time": 12.951694035000628 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 265.6875, |
| "completions/mean_terminated_length": 265.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.799402398869006e-05, |
| "epoch": 0.0017, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.943003321182914e-05, |
| "kl": 5.07642325758934, |
| "learning_rate": 9.384887658286146e-05, |
| "loss": 0.0153, |
| "num_tokens": 3193105.0, |
| "reward": 0.7087500095367432, |
| "reward_std": 0.486086905002594, |
| "rewards/rollout_reward_func/mean": 0.7087500095367432, |
| "rewards/rollout_reward_func/std": 0.4667232632637024, |
| "sampling/importance_sampling_ratio/max": 1.0000450611114502, |
| "sampling/importance_sampling_ratio/mean": 1.000002145767212, |
| "sampling/importance_sampling_ratio/min": 0.9999704360961914, |
| "sampling/sampling_logp_difference/max": 6.615445454372093e-05, |
| "sampling/sampling_logp_difference/mean": 3.7237202832329785e-06, |
| "step": 85, |
| "step_time": 12.888392118999946 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 198.28125, |
| "completions/mean_terminated_length": 198.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010015473532121177, |
| "epoch": 0.00172, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.718404569663107e-05, |
| "kl": 5.136753097176552, |
| "learning_rate": 9.360271320438257e-05, |
| "loss": 0.0072, |
| "num_tokens": 3228951.0, |
| "reward": 0.5455915331840515, |
| "reward_std": 0.5534976124763489, |
| "rewards/rollout_reward_func/mean": 0.5455915331840515, |
| "rewards/rollout_reward_func/std": 0.5654960870742798, |
| "sampling/importance_sampling_ratio/max": 1.000040888786316, |
| "sampling/importance_sampling_ratio/mean": 0.9999889135360718, |
| "sampling/importance_sampling_ratio/min": 0.9997588992118835, |
| "sampling/sampling_logp_difference/max": 0.00016045953088905662, |
| "sampling/sampling_logp_difference/mean": 4.547407570498763e-06, |
| "step": 86, |
| "step_time": 12.209375244000285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 168.5625, |
| "completions/mean_terminated_length": 168.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.546316439606017e-05, |
| "epoch": 0.00174, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.010289169149473e-05, |
| "kl": 4.654582601040602, |
| "learning_rate": 9.33521786207783e-05, |
| "loss": -0.0081, |
| "num_tokens": 3264282.0, |
| "reward": 0.8309207558631897, |
| "reward_std": 0.3539258539676666, |
| "rewards/rollout_reward_func/mean": 0.8309207558631897, |
| "rewards/rollout_reward_func/std": 0.3995343744754791, |
| "sampling/importance_sampling_ratio/max": 1.0000208616256714, |
| "sampling/importance_sampling_ratio/mean": 0.9999948143959045, |
| "sampling/importance_sampling_ratio/min": 0.9999562501907349, |
| "sampling/sampling_logp_difference/max": 2.860301174223423e-05, |
| "sampling/sampling_logp_difference/mean": 2.7131702609040076e-06, |
| "step": 87, |
| "step_time": 13.400294012999666 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 199.28125, |
| "completions/mean_terminated_length": 199.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.928002892754193e-05, |
| "epoch": 0.00176, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.472797470749356e-05, |
| "kl": 5.448563948273659, |
| "learning_rate": 9.309730804241916e-05, |
| "loss": -0.0036, |
| "num_tokens": 3301625.0, |
| "reward": 0.767762303352356, |
| "reward_std": 0.43076157569885254, |
| "rewards/rollout_reward_func/mean": 0.767762303352356, |
| "rewards/rollout_reward_func/std": 0.44489601254463196, |
| "sampling/importance_sampling_ratio/max": 1.000015377998352, |
| "sampling/importance_sampling_ratio/mean": 0.9999972581863403, |
| "sampling/importance_sampling_ratio/min": 0.9999665021896362, |
| "sampling/sampling_logp_difference/max": 2.5505985831841826e-05, |
| "sampling/sampling_logp_difference/mean": 2.6097468435182236e-06, |
| "step": 88, |
| "step_time": 12.85785657000065 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 182.53125, |
| "completions/mean_terminated_length": 182.53125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.7089729438075665e-05, |
| "epoch": 0.00178, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0934026906616054e-05, |
| "kl": 5.0993258729577065, |
| "learning_rate": 9.283813728906054e-05, |
| "loss": 0.0125, |
| "num_tokens": 3337039.0, |
| "reward": 0.6074999570846558, |
| "reward_std": 0.49121958017349243, |
| "rewards/rollout_reward_func/mean": 0.6074999570846558, |
| "rewards/rollout_reward_func/std": 0.4953916370868683, |
| "sampling/importance_sampling_ratio/max": 1.0000306367874146, |
| "sampling/importance_sampling_ratio/mean": 0.9999954700469971, |
| "sampling/importance_sampling_ratio/min": 0.9999397993087769, |
| "sampling/sampling_logp_difference/max": 4.13682937505655e-05, |
| "sampling/sampling_logp_difference/mean": 2.6327832074457547e-06, |
| "step": 89, |
| "step_time": 12.809418414000447 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 270.0, |
| "completions/mean_terminated_length": 270.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011780626084600954, |
| "epoch": 0.0018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4906221117125824e-05, |
| "kl": 4.517446521669626, |
| "learning_rate": 9.257470278480848e-05, |
| "loss": 0.0146, |
| "num_tokens": 3374681.0, |
| "reward": 0.7090123891830444, |
| "reward_std": 0.4307096004486084, |
| "rewards/rollout_reward_func/mean": 0.7090123891830444, |
| "rewards/rollout_reward_func/std": 0.46687158942222595, |
| "sampling/importance_sampling_ratio/max": 1.0000447034835815, |
| "sampling/importance_sampling_ratio/mean": 0.9999927282333374, |
| "sampling/importance_sampling_ratio/min": 0.9999160170555115, |
| "sampling/sampling_logp_difference/max": 5.054080247646198e-05, |
| "sampling/sampling_logp_difference/mean": 3.6263454603613354e-06, |
| "step": 90, |
| "step_time": 11.027926876000038 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 212.03125, |
| "completions/mean_terminated_length": 212.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.198726852981508e-05, |
| "epoch": 0.00182, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.5594642617506906e-05, |
| "kl": 4.633478656411171, |
| "learning_rate": 9.230704155300075e-05, |
| "loss": 0.002, |
| "num_tokens": 3411569.0, |
| "reward": 0.7068415880203247, |
| "reward_std": 0.4687871038913727, |
| "rewards/rollout_reward_func/mean": 0.7068415880203247, |
| "rewards/rollout_reward_func/std": 0.4736269414424896, |
| "sampling/importance_sampling_ratio/max": 1.0000184774398804, |
| "sampling/importance_sampling_ratio/mean": 0.9999944567680359, |
| "sampling/importance_sampling_ratio/min": 0.9999277591705322, |
| "sampling/sampling_logp_difference/max": 3.6478690162766725e-05, |
| "sampling/sampling_logp_difference/mean": 3.097064563917229e-06, |
| "step": 91, |
| "step_time": 10.657987567999498 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 309.34375, |
| "completions/mean_terminated_length": 309.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011667752028188261, |
| "epoch": 0.00184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.534836575156078e-05, |
| "kl": 5.510342627763748, |
| "learning_rate": 9.20351912110034e-05, |
| "loss": 0.0023, |
| "num_tokens": 3452832.0, |
| "reward": 0.7443415522575378, |
| "reward_std": 0.4789533019065857, |
| "rewards/rollout_reward_func/mean": 0.7443415522575378, |
| "rewards/rollout_reward_func/std": 0.45933249592781067, |
| "sampling/importance_sampling_ratio/max": 1.0000141859054565, |
| "sampling/importance_sampling_ratio/mean": 0.9999915361404419, |
| "sampling/importance_sampling_ratio/min": 0.9998993873596191, |
| "sampling/sampling_logp_difference/max": 0.00012767789303325117, |
| "sampling/sampling_logp_difference/mean": 3.875496531691169e-06, |
| "step": 92, |
| "step_time": 11.005892613000015 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 241.125, |
| "completions/mean_terminated_length": 241.125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.778749490862992e-05, |
| "epoch": 0.00186, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.869422653224319e-05, |
| "kl": 4.655470006167889, |
| "learning_rate": 9.175918996492408e-05, |
| "loss": 0.0075, |
| "num_tokens": 3490161.0, |
| "reward": 0.8015123009681702, |
| "reward_std": 0.4226231575012207, |
| "rewards/rollout_reward_func/mean": 0.8015123009681702, |
| "rewards/rollout_reward_func/std": 0.4206917881965637, |
| "sampling/importance_sampling_ratio/max": 1.0000184774398804, |
| "sampling/importance_sampling_ratio/mean": 0.9999892711639404, |
| "sampling/importance_sampling_ratio/min": 0.9999106526374817, |
| "sampling/sampling_logp_difference/max": 5.662531839334406e-05, |
| "sampling/sampling_logp_difference/mean": 4.012344561488135e-06, |
| "step": 93, |
| "step_time": 10.772258454000166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 142.40625, |
| "completions/mean_terminated_length": 142.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.749645379182766e-05, |
| "epoch": 0.00188, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7582062355359085e-05, |
| "kl": 4.700259050820023, |
| "learning_rate": 9.147907660424242e-05, |
| "loss": 0.007, |
| "num_tokens": 3523365.0, |
| "reward": 0.7009207606315613, |
| "reward_std": 0.4892513155937195, |
| "rewards/rollout_reward_func/mean": 0.7009207606315613, |
| "rewards/rollout_reward_func/std": 0.4695405960083008, |
| "sampling/importance_sampling_ratio/max": 1.0000180006027222, |
| "sampling/importance_sampling_ratio/mean": 0.9999983906745911, |
| "sampling/importance_sampling_ratio/min": 0.9999682307243347, |
| "sampling/sampling_logp_difference/max": 2.3959782993188128e-05, |
| "sampling/sampling_logp_difference/mean": 1.8251080291520339e-06, |
| "step": 94, |
| "step_time": 10.420490644999745 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 629.0, |
| "completions/max_terminated_length": 629.0, |
| "completions/mean_length": 247.65625, |
| "completions/mean_terminated_length": 247.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010689802249430613, |
| "epoch": 0.0019, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.2199332054005936e-05, |
| "kl": 4.485873177647591, |
| "learning_rate": 9.119489049635865e-05, |
| "loss": 0.0058, |
| "num_tokens": 3561189.0, |
| "reward": 0.8030915260314941, |
| "reward_std": 0.4217682480812073, |
| "rewards/rollout_reward_func/mean": 0.8030915260314941, |
| "rewards/rollout_reward_func/std": 0.41993096470832825, |
| "sampling/importance_sampling_ratio/max": 1.000044345855713, |
| "sampling/importance_sampling_ratio/mean": 0.9999996423721313, |
| "sampling/importance_sampling_ratio/min": 0.9999737739562988, |
| "sampling/sampling_logp_difference/max": 6.555848085554317e-05, |
| "sampling/sampling_logp_difference/mean": 3.4570114166854182e-06, |
| "step": 95, |
| "step_time": 10.547222818000137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 229.625, |
| "completions/mean_terminated_length": 229.625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.808376419404794e-05, |
| "epoch": 0.00192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.205911707482301e-05, |
| "kl": 5.648331835865974, |
| "learning_rate": 9.090667158106077e-05, |
| "loss": -0.002, |
| "num_tokens": 3599155.0, |
| "reward": 0.579670786857605, |
| "reward_std": 0.5128026008605957, |
| "rewards/rollout_reward_func/mean": 0.579670786857605, |
| "rewards/rollout_reward_func/std": 0.5076538324356079, |
| "sampling/importance_sampling_ratio/max": 1.0000417232513428, |
| "sampling/importance_sampling_ratio/mean": 0.9999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999645948410034, |
| "sampling/sampling_logp_difference/max": 3.5042918170802295e-05, |
| "sampling/sampling_logp_difference/mean": 2.5084548269660445e-06, |
| "step": 96, |
| "step_time": 10.931292486000302 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 271.1875, |
| "completions/mean_terminated_length": 271.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010380294239098475, |
| "epoch": 0.00194, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.427110878983513e-05, |
| "kl": 4.673744417726994, |
| "learning_rate": 9.061446036491135e-05, |
| "loss": 0.0141, |
| "num_tokens": 3637695.0, |
| "reward": 0.6771038770675659, |
| "reward_std": 0.54509037733078, |
| "rewards/rollout_reward_func/mean": 0.6771038770675659, |
| "rewards/rollout_reward_func/std": 0.5397720336914062, |
| "sampling/importance_sampling_ratio/max": 1.0000097751617432, |
| "sampling/importance_sampling_ratio/mean": 0.9999973773956299, |
| "sampling/importance_sampling_ratio/min": 0.9999740123748779, |
| "sampling/sampling_logp_difference/max": 3.063724943785928e-05, |
| "sampling/sampling_logp_difference/mean": 2.753982471404015e-06, |
| "step": 97, |
| "step_time": 10.892550134999283 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 215.09375, |
| "completions/mean_terminated_length": 215.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.612215438257408e-05, |
| "epoch": 0.00196, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.8458416383946314e-05, |
| "kl": 4.893975809216499, |
| "learning_rate": 9.03182979155548e-05, |
| "loss": 0.0011, |
| "num_tokens": 3675048.0, |
| "reward": 0.6752623319625854, |
| "reward_std": 0.47441476583480835, |
| "rewards/rollout_reward_func/mean": 0.6752623319625854, |
| "rewards/rollout_reward_func/std": 0.48496752977371216, |
| "sampling/importance_sampling_ratio/max": 1.0000221729278564, |
| "sampling/importance_sampling_ratio/mean": 0.9999983310699463, |
| "sampling/importance_sampling_ratio/min": 0.999973475933075, |
| "sampling/sampling_logp_difference/max": 1.9550898286979645e-05, |
| "sampling/sampling_logp_difference/mean": 1.8493761899662786e-06, |
| "step": 98, |
| "step_time": 10.74181418300077 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 291.96875, |
| "completions/mean_terminated_length": 291.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011381667358989489, |
| "epoch": 0.00198, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.915198562433943e-05, |
| "kl": 5.3638990968465805, |
| "learning_rate": 9.001822585594566e-05, |
| "loss": 0.0212, |
| "num_tokens": 3714784.0, |
| "reward": 0.5840122699737549, |
| "reward_std": 0.5640621185302734, |
| "rewards/rollout_reward_func/mean": 0.5840122699737549, |
| "rewards/rollout_reward_func/std": 0.5568755269050598, |
| "sampling/importance_sampling_ratio/max": 1.0000113248825073, |
| "sampling/importance_sampling_ratio/mean": 0.9999927878379822, |
| "sampling/importance_sampling_ratio/min": 0.9999212026596069, |
| "sampling/sampling_logp_difference/max": 5.3409283282235265e-05, |
| "sampling/sampling_logp_difference/mean": 3.0009853162482614e-06, |
| "step": 99, |
| "step_time": 10.928057961001741 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 302.25, |
| "completions/mean_terminated_length": 302.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010083374922942312, |
| "epoch": 0.002, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.809072536067106e-05, |
| "kl": 5.571109913289547, |
| "learning_rate": 8.971428635849876e-05, |
| "loss": -0.0019, |
| "num_tokens": 3755324.0, |
| "reward": 0.5843415260314941, |
| "reward_std": 0.4848282039165497, |
| "rewards/rollout_reward_func/mean": 0.5843415260314941, |
| "rewards/rollout_reward_func/std": 0.5085917711257935, |
| "sampling/importance_sampling_ratio/max": 1.0000355243682861, |
| "sampling/importance_sampling_ratio/mean": 0.9999938011169434, |
| "sampling/importance_sampling_ratio/min": 0.999915599822998, |
| "sampling/sampling_logp_difference/max": 6.294687045738101e-05, |
| "sampling/sampling_logp_difference/mean": 3.964934421674116e-06, |
| "step": 100, |
| "step_time": 11.060910541000794 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 150.3125, |
| "completions/mean_terminated_length": 150.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.915687395623536e-05, |
| "epoch": 0.00202, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.978915290441364e-05, |
| "kl": 5.539727114140987, |
| "learning_rate": 8.940652213916242e-05, |
| "loss": 0.0078, |
| "num_tokens": 3789997.0, |
| "reward": 0.5434207916259766, |
| "reward_std": 0.49296581745147705, |
| "rewards/rollout_reward_func/mean": 0.5434207916259766, |
| "rewards/rollout_reward_func/std": 0.5062321424484253, |
| "sampling/importance_sampling_ratio/max": 1.0000144243240356, |
| "sampling/importance_sampling_ratio/mean": 0.999999463558197, |
| "sampling/importance_sampling_ratio/min": 0.9999815225601196, |
| "sampling/sampling_logp_difference/max": 2.300502819707617e-05, |
| "sampling/sampling_logp_difference/mean": 1.5629962035745848e-06, |
| "step": 101, |
| "step_time": 10.688564576999852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 659.0, |
| "completions/max_terminated_length": 659.0, |
| "completions/mean_length": 208.1875, |
| "completions/mean_terminated_length": 208.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.830299895772441e-05, |
| "epoch": 0.00204, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.874409958370961e-05, |
| "kl": 4.853120140731335, |
| "learning_rate": 8.9094976451415e-05, |
| "loss": 0.0016, |
| "num_tokens": 3826670.0, |
| "reward": 0.7693415284156799, |
| "reward_std": 0.4373759627342224, |
| "rewards/rollout_reward_func/mean": 0.7693415284156799, |
| "rewards/rollout_reward_func/std": 0.44000110030174255, |
| "sampling/importance_sampling_ratio/max": 1.0000156164169312, |
| "sampling/importance_sampling_ratio/mean": 0.9999966621398926, |
| "sampling/importance_sampling_ratio/min": 0.9999758005142212, |
| "sampling/sampling_logp_difference/max": 2.5749490305315703e-05, |
| "sampling/sampling_logp_difference/mean": 1.909265620270162e-06, |
| "step": 102, |
| "step_time": 10.570236231998933 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 190.90625, |
| "completions/mean_terminated_length": 190.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010004358325943485, |
| "epoch": 0.00206, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00010144019324798137, |
| "kl": 4.018396964296699, |
| "learning_rate": 8.877969308018608e-05, |
| "loss": -0.0118, |
| "num_tokens": 3862077.0, |
| "reward": 0.8630915880203247, |
| "reward_std": 0.3141759932041168, |
| "rewards/rollout_reward_func/mean": 0.8630915880203247, |
| "rewards/rollout_reward_func/std": 0.3708895444869995, |
| "sampling/importance_sampling_ratio/max": 1.0000416040420532, |
| "sampling/importance_sampling_ratio/mean": 0.9999942779541016, |
| "sampling/importance_sampling_ratio/min": 0.9999680519104004, |
| "sampling/sampling_logp_difference/max": 8.009921293705702e-05, |
| "sampling/sampling_logp_difference/mean": 3.6002747947350144e-06, |
| "step": 103, |
| "step_time": 10.594433905999722 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 589.0, |
| "completions/max_terminated_length": 589.0, |
| "completions/mean_length": 147.15625, |
| "completions/mean_terminated_length": 147.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.807541271314221e-05, |
| "epoch": 0.00208, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.578004856943153e-05, |
| "kl": 4.207340374588966, |
| "learning_rate": 8.846071633570285e-05, |
| "loss": -0.0112, |
| "num_tokens": 3895407.0, |
| "reward": 0.890591561794281, |
| "reward_std": 0.30010443925857544, |
| "rewards/rollout_reward_func/mean": 0.890591561794281, |
| "rewards/rollout_reward_func/std": 0.3387019634246826, |
| "sampling/importance_sampling_ratio/max": 1.0000135898590088, |
| "sampling/importance_sampling_ratio/mean": 0.9999985098838806, |
| "sampling/importance_sampling_ratio/min": 0.9999701380729675, |
| "sampling/sampling_logp_difference/max": 2.5631481548771262e-05, |
| "sampling/sampling_logp_difference/mean": 2.3388356567011215e-06, |
| "step": 104, |
| "step_time": 10.663827376000881 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 243.15625, |
| "completions/mean_terminated_length": 243.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.019029707246773e-05, |
| "epoch": 0.0021, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.9671542506548576e-05, |
| "kl": 5.539078265428543, |
| "learning_rate": 8.81380910472627e-05, |
| "loss": 0.0103, |
| "num_tokens": 3933754.0, |
| "reward": 0.5509207844734192, |
| "reward_std": 0.5520662665367126, |
| "rewards/rollout_reward_func/mean": 0.5509207844734192, |
| "rewards/rollout_reward_func/std": 0.5649293065071106, |
| "sampling/importance_sampling_ratio/max": 1.0000245571136475, |
| "sampling/importance_sampling_ratio/mean": 0.999995768070221, |
| "sampling/importance_sampling_ratio/min": 0.9999408721923828, |
| "sampling/sampling_logp_difference/max": 4.3273397750454023e-05, |
| "sampling/sampling_logp_difference/mean": 3.680256440929952e-06, |
| "step": 105, |
| "step_time": 10.16665372999978 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 243.53125, |
| "completions/mean_terminated_length": 243.53125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.68805940715356e-05, |
| "epoch": 0.00212, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.179349939979147e-05, |
| "kl": 4.557143405079842, |
| "learning_rate": 8.78118625569329e-05, |
| "loss": 0.0137, |
| "num_tokens": 3971886.0, |
| "reward": 0.8346039056777954, |
| "reward_std": 0.33073580265045166, |
| "rewards/rollout_reward_func/mean": 0.8346039056777954, |
| "rewards/rollout_reward_func/std": 0.3947785496711731, |
| "sampling/importance_sampling_ratio/max": 1.0000149011611938, |
| "sampling/importance_sampling_ratio/mean": 0.9999919533729553, |
| "sampling/importance_sampling_ratio/min": 0.9999665021896362, |
| "sampling/sampling_logp_difference/max": 3.433300298638642e-05, |
| "sampling/sampling_logp_difference/mean": 2.8295610263739945e-06, |
| "step": 106, |
| "step_time": 11.465477359998204 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 244.5, |
| "completions/mean_terminated_length": 244.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.225094723319671e-05, |
| "epoch": 0.00214, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.9931542889680713e-05, |
| "kl": 5.053082346916199, |
| "learning_rate": 8.748207671317818e-05, |
| "loss": -0.001, |
| "num_tokens": 4010493.0, |
| "reward": 0.8046707510948181, |
| "reward_std": 0.43914103507995605, |
| "rewards/rollout_reward_func/mean": 0.8046707510948181, |
| "rewards/rollout_reward_func/std": 0.4238363802433014, |
| "sampling/importance_sampling_ratio/max": 1.0000298023223877, |
| "sampling/importance_sampling_ratio/mean": 0.9999991655349731, |
| "sampling/importance_sampling_ratio/min": 0.9999284744262695, |
| "sampling/sampling_logp_difference/max": 4.7092456952668726e-05, |
| "sampling/sampling_logp_difference/mean": 3.722727342392318e-06, |
| "step": 107, |
| "step_time": 10.213472404999266 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 233.71875, |
| "completions/mean_terminated_length": 233.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.666757216564292e-05, |
| "epoch": 0.00216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.740045839454979e-05, |
| "kl": 4.794837590306997, |
| "learning_rate": 8.714877986441713e-05, |
| "loss": -0.009, |
| "num_tokens": 4047992.0, |
| "reward": 0.8311830759048462, |
| "reward_std": 0.3405177891254425, |
| "rewards/rollout_reward_func/mean": 0.8311830759048462, |
| "rewards/rollout_reward_func/std": 0.3996248245239258, |
| "sampling/importance_sampling_ratio/max": 1.00002121925354, |
| "sampling/importance_sampling_ratio/mean": 0.9999939203262329, |
| "sampling/importance_sampling_ratio/min": 0.9999217987060547, |
| "sampling/sampling_logp_difference/max": 8.25024617370218e-05, |
| "sampling/sampling_logp_difference/mean": 3.614798288253951e-06, |
| "step": 108, |
| "step_time": 11.064788341000622 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 299.90625, |
| "completions/mean_terminated_length": 299.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011450917008914985, |
| "epoch": 0.00218, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.716579420957714e-05, |
| "kl": 5.604592651128769, |
| "learning_rate": 8.681201885250831e-05, |
| "loss": 0.0054, |
| "num_tokens": 4089355.0, |
| "reward": 0.7436830997467041, |
| "reward_std": 0.4545275866985321, |
| "rewards/rollout_reward_func/mean": 0.7436830997467041, |
| "rewards/rollout_reward_func/std": 0.45964205265045166, |
| "sampling/importance_sampling_ratio/max": 1.0000195503234863, |
| "sampling/importance_sampling_ratio/mean": 0.9999915361404419, |
| "sampling/importance_sampling_ratio/min": 0.9999645948410034, |
| "sampling/sampling_logp_difference/max": 3.063714029849507e-05, |
| "sampling/sampling_logp_difference/mean": 3.1274298635253217e-06, |
| "step": 109, |
| "step_time": 10.834878306000064 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 248.375, |
| "completions/mean_terminated_length": 248.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010261963420532538, |
| "epoch": 0.0022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.549662480712868e-05, |
| "kl": 4.847600609064102, |
| "learning_rate": 8.647184100616712e-05, |
| "loss": 0.0029, |
| "num_tokens": 4127050.0, |
| "reward": 0.7702623605728149, |
| "reward_std": 0.45045793056488037, |
| "rewards/rollout_reward_func/mean": 0.7702623605728149, |
| "rewards/rollout_reward_func/std": 0.44052988290786743, |
| "sampling/importance_sampling_ratio/max": 1.000023603439331, |
| "sampling/importance_sampling_ratio/mean": 0.9999868869781494, |
| "sampling/importance_sampling_ratio/min": 0.999665379524231, |
| "sampling/sampling_logp_difference/max": 0.0003045859048143029, |
| "sampling/sampling_logp_difference/mean": 4.818009529117262e-06, |
| "step": 110, |
| "step_time": 12.846799683999507 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 242.1875, |
| "completions/mean_terminated_length": 242.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.59633990586417e-05, |
| "epoch": 0.00222, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0465465897577815e-05, |
| "kl": 4.926107108592987, |
| "learning_rate": 8.612829413431418e-05, |
| "loss": 0.0183, |
| "num_tokens": 4165291.0, |
| "reward": 0.6452623605728149, |
| "reward_std": 0.48441076278686523, |
| "rewards/rollout_reward_func/mean": 0.6452623605728149, |
| "rewards/rollout_reward_func/std": 0.48789316415786743, |
| "sampling/importance_sampling_ratio/max": 1.0000079870224, |
| "sampling/importance_sampling_ratio/mean": 0.9999897480010986, |
| "sampling/importance_sampling_ratio/min": 0.9999393224716187, |
| "sampling/sampling_logp_difference/max": 5.018767842557281e-05, |
| "sampling/sampling_logp_difference/mean": 3.0540300031134393e-06, |
| "step": 111, |
| "step_time": 12.97736248299907 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 163.46875, |
| "completions/mean_terminated_length": 163.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.3820546966676375e-05, |
| "epoch": 0.00224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5189647456281818e-05, |
| "kl": 5.1571872271597385, |
| "learning_rate": 8.578142651935609e-05, |
| "loss": 0.0077, |
| "num_tokens": 4199763.0, |
| "reward": 0.7949999570846558, |
| "reward_std": 0.43465879559516907, |
| "rewards/rollout_reward_func/mean": 0.7949999570846558, |
| "rewards/rollout_reward_func/std": 0.4186306297779083, |
| "sampling/importance_sampling_ratio/max": 1.000018835067749, |
| "sampling/importance_sampling_ratio/mean": 0.9999989867210388, |
| "sampling/importance_sampling_ratio/min": 0.9999740123748779, |
| "sampling/sampling_logp_difference/max": 2.9564285796368495e-05, |
| "sampling/sampling_logp_difference/mean": 2.1823295810463605e-06, |
| "step": 112, |
| "step_time": 12.87057027900073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 639.0, |
| "completions/max_terminated_length": 639.0, |
| "completions/mean_length": 123.46875, |
| "completions/mean_terminated_length": 123.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.431075787498685e-05, |
| "epoch": 0.00226, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2450723918154836e-05, |
| "kl": 4.872902058064938, |
| "learning_rate": 8.543128691039995e-05, |
| "loss": -0.0007, |
| "num_tokens": 4232839.0, |
| "reward": 0.6693415641784668, |
| "reward_std": 0.46633830666542053, |
| "rewards/rollout_reward_func/mean": 0.6693415641784668, |
| "rewards/rollout_reward_func/std": 0.4878684878349304, |
| "sampling/importance_sampling_ratio/max": 1.0000355243682861, |
| "sampling/importance_sampling_ratio/mean": 0.9999990463256836, |
| "sampling/importance_sampling_ratio/min": 0.9999709725379944, |
| "sampling/sampling_logp_difference/max": 4.2314342863392085e-05, |
| "sampling/sampling_logp_difference/mean": 2.0633228814403992e-06, |
| "step": 113, |
| "step_time": 10.490328707000117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 269.5625, |
| "completions/mean_terminated_length": 269.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.317064824188037e-05, |
| "epoch": 0.00228, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6714060368249193e-05, |
| "kl": 4.687108241021633, |
| "learning_rate": 8.507792451640202e-05, |
| "loss": 0.0053, |
| "num_tokens": 4271575.0, |
| "reward": 0.6162499785423279, |
| "reward_std": 0.5130974054336548, |
| "rewards/rollout_reward_func/mean": 0.6162499785423279, |
| "rewards/rollout_reward_func/std": 0.5002048015594482, |
| "sampling/importance_sampling_ratio/max": 1.0000026226043701, |
| "sampling/importance_sampling_ratio/mean": 0.9999919533729553, |
| "sampling/importance_sampling_ratio/min": 0.9999685287475586, |
| "sampling/sampling_logp_difference/max": 3.0636969313491136e-05, |
| "sampling/sampling_logp_difference/mean": 3.275076778663788e-06, |
| "step": 114, |
| "step_time": 11.029661345999557 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.0, |
| "completions/max_terminated_length": 649.0, |
| "completions/mean_length": 199.90625, |
| "completions/mean_terminated_length": 199.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.060472408682017e-05, |
| "epoch": 0.0023, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.878362960880622e-05, |
| "kl": 4.994293176879637, |
| "learning_rate": 8.472138899925184e-05, |
| "loss": -0.0077, |
| "num_tokens": 4308105.0, |
| "reward": 0.7680915594100952, |
| "reward_std": 0.43255943059921265, |
| "rewards/rollout_reward_func/mean": 0.7680915594100952, |
| "rewards/rollout_reward_func/std": 0.44509974122047424, |
| "sampling/importance_sampling_ratio/max": 1.0000261068344116, |
| "sampling/importance_sampling_ratio/mean": 0.9999980926513672, |
| "sampling/importance_sampling_ratio/min": 0.9999677538871765, |
| "sampling/sampling_logp_difference/max": 2.5987903427449055e-05, |
| "sampling/sampling_logp_difference/mean": 2.0921900159009965e-06, |
| "step": 115, |
| "step_time": 12.655563860999791 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 168.40625, |
| "completions/mean_terminated_length": 168.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.414693930561043e-05, |
| "epoch": 0.00232, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.48902283096686e-05, |
| "kl": 4.02911851555109, |
| "learning_rate": 8.43617304667927e-05, |
| "loss": 0.0119, |
| "num_tokens": 4341444.0, |
| "reward": 0.6993415355682373, |
| "reward_std": 0.4643377363681793, |
| "rewards/rollout_reward_func/mean": 0.6993415355682373, |
| "rewards/rollout_reward_func/std": 0.4691430330276489, |
| "sampling/importance_sampling_ratio/max": 1.0000131130218506, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.9999437928199768, |
| "sampling/sampling_logp_difference/max": 5.137961852597073e-05, |
| "sampling/sampling_logp_difference/mean": 2.4830555958033074e-06, |
| "step": 116, |
| "step_time": 12.967430730000615 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 169.0625, |
| "completions/mean_terminated_length": 169.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.054653645534927e-05, |
| "epoch": 0.00234, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.826282333349809e-05, |
| "kl": 5.1541957119479775, |
| "learning_rate": 8.399899946577953e-05, |
| "loss": -0.0054, |
| "num_tokens": 4376644.0, |
| "reward": 0.7355915307998657, |
| "reward_std": 0.48186829686164856, |
| "rewards/rollout_reward_func/mean": 0.7355915307998657, |
| "rewards/rollout_reward_func/std": 0.46224409341812134, |
| "sampling/importance_sampling_ratio/max": 1.000015377998352, |
| "sampling/importance_sampling_ratio/mean": 0.9999917149543762, |
| "sampling/importance_sampling_ratio/min": 0.9998599290847778, |
| "sampling/sampling_logp_difference/max": 0.00014258069859351963, |
| "sampling/sampling_logp_difference/mean": 3.6177268611936597e-06, |
| "step": 117, |
| "step_time": 12.858041905000846 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 221.46875, |
| "completions/mean_terminated_length": 221.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.304556062592837e-05, |
| "epoch": 0.00236, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.4655668312334456e-05, |
| "kl": 4.768407866358757, |
| "learning_rate": 8.363324697477484e-05, |
| "loss": 0.0086, |
| "num_tokens": 4413547.0, |
| "reward": 0.7377623319625854, |
| "reward_std": 0.4202122390270233, |
| "rewards/rollout_reward_func/mean": 0.7377623319625854, |
| "rewards/rollout_reward_func/std": 0.455178439617157, |
| "sampling/importance_sampling_ratio/max": 1.000011682510376, |
| "sampling/importance_sampling_ratio/mean": 0.9999964833259583, |
| "sampling/importance_sampling_ratio/min": 0.9999688267707825, |
| "sampling/sampling_logp_difference/max": 4.255828389432281e-05, |
| "sampling/sampling_logp_difference/mean": 2.690314659048454e-06, |
| "step": 118, |
| "step_time": 10.272596567999699 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 196.96875, |
| "completions/mean_terminated_length": 196.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.680693799727578e-05, |
| "epoch": 0.00238, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0536966985673644e-05, |
| "kl": 4.59810471534729, |
| "learning_rate": 8.326452439698433e-05, |
| "loss": 0.0101, |
| "num_tokens": 4448788.0, |
| "reward": 0.642170786857605, |
| "reward_std": 0.5499499440193176, |
| "rewards/rollout_reward_func/mean": 0.642170786857605, |
| "rewards/rollout_reward_func/std": 0.5485014915466309, |
| "sampling/importance_sampling_ratio/max": 1.0000146627426147, |
| "sampling/importance_sampling_ratio/mean": 0.9999949932098389, |
| "sampling/importance_sampling_ratio/min": 0.9999575018882751, |
| "sampling/sampling_logp_difference/max": 3.0994589906185865e-05, |
| "sampling/sampling_logp_difference/mean": 2.7388678063289262e-06, |
| "step": 119, |
| "step_time": 11.08815586799983 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 242.3125, |
| "completions/mean_terminated_length": 242.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.888474092178967e-05, |
| "epoch": 0.0024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.178436499089003e-05, |
| "kl": 5.197234347462654, |
| "learning_rate": 8.289288355303245e-05, |
| "loss": 0.0035, |
| "num_tokens": 4487700.0, |
| "reward": 0.7096707820892334, |
| "reward_std": 0.4924471080303192, |
| "rewards/rollout_reward_func/mean": 0.7096707820892334, |
| "rewards/rollout_reward_func/std": 0.473552942276001, |
| "sampling/importance_sampling_ratio/max": 1.0000423192977905, |
| "sampling/importance_sampling_ratio/mean": 0.9999993443489075, |
| "sampling/importance_sampling_ratio/min": 0.9999532699584961, |
| "sampling/sampling_logp_difference/max": 5.496320954989642e-05, |
| "sampling/sampling_logp_difference/mean": 3.605607162171509e-06, |
| "step": 120, |
| "step_time": 10.934329372000775 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 277.25, |
| "completions/mean_terminated_length": 277.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.522849164194213e-05, |
| "epoch": 0.00242, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.63542005995987e-05, |
| "kl": 4.920810654759407, |
| "learning_rate": 8.251837667367966e-05, |
| "loss": 0.0075, |
| "num_tokens": 4526577.0, |
| "reward": 0.7402623891830444, |
| "reward_std": 0.4214465022087097, |
| "rewards/rollout_reward_func/mean": 0.7402623891830444, |
| "rewards/rollout_reward_func/std": 0.4546830654144287, |
| "sampling/importance_sampling_ratio/max": 1.0000230073928833, |
| "sampling/importance_sampling_ratio/mean": 0.9999945759773254, |
| "sampling/importance_sampling_ratio/min": 0.9999340772628784, |
| "sampling/sampling_logp_difference/max": 5.650547973345965e-05, |
| "sampling/sampling_logp_difference/mean": 3.201313347744872e-06, |
| "step": 121, |
| "step_time": 11.145089898000151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 253.9375, |
| "completions/mean_terminated_length": 253.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011383450387825178, |
| "epoch": 0.00244, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001262296427739784, |
| "kl": 4.25481122918427, |
| "learning_rate": 8.214105639248173e-05, |
| "loss": -0.008, |
| "num_tokens": 4564592.0, |
| "reward": 0.7399331331253052, |
| "reward_std": 0.3940573036670685, |
| "rewards/rollout_reward_func/mean": 0.7399331331253052, |
| "rewards/rollout_reward_func/std": 0.45654943585395813, |
| "sampling/importance_sampling_ratio/max": 1.0000048875808716, |
| "sampling/importance_sampling_ratio/mean": 0.999985933303833, |
| "sampling/importance_sampling_ratio/min": 0.9998538494110107, |
| "sampling/sampling_logp_difference/max": 0.0001305415207752958, |
| "sampling/sampling_logp_difference/mean": 4.2265378397132736e-06, |
| "step": 122, |
| "step_time": 11.035577829000886 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 219.96875, |
| "completions/mean_terminated_length": 219.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.981711987132712e-05, |
| "epoch": 0.00246, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0030358175281435e-05, |
| "kl": 4.89252844452858, |
| "learning_rate": 8.176097573839265e-05, |
| "loss": -0.0091, |
| "num_tokens": 4601658.0, |
| "reward": 0.5161831378936768, |
| "reward_std": 0.4359714388847351, |
| "rewards/rollout_reward_func/mean": 0.5161831378936768, |
| "rewards/rollout_reward_func/std": 0.5666016936302185, |
| "sampling/importance_sampling_ratio/max": 1.0000238418579102, |
| "sampling/importance_sampling_ratio/mean": 0.999997079372406, |
| "sampling/importance_sampling_ratio/min": 0.9999577403068542, |
| "sampling/sampling_logp_difference/max": 4.756472844746895e-05, |
| "sampling/sampling_logp_difference/mean": 3.610257408581674e-06, |
| "step": 123, |
| "step_time": 10.925135986000441 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 212.90625, |
| "completions/mean_terminated_length": 212.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.236761294710959e-05, |
| "epoch": 0.00248, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.092013452667743e-05, |
| "kl": 5.190757237374783, |
| "learning_rate": 8.137818812831182e-05, |
| "loss": 0.001, |
| "num_tokens": 4638502.0, |
| "reward": 0.6746707558631897, |
| "reward_std": 0.46534547209739685, |
| "rewards/rollout_reward_func/mean": 0.6746707558631897, |
| "rewards/rollout_reward_func/std": 0.48581212759017944, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999908208847046, |
| "sampling/importance_sampling_ratio/min": 0.9998648762702942, |
| "sampling/sampling_logp_difference/max": 0.0001305415207752958, |
| "sampling/sampling_logp_difference/mean": 3.2529164855077397e-06, |
| "step": 124, |
| "step_time": 10.93318752300047 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 230.78125, |
| "completions/mean_terminated_length": 230.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.626912174885092e-05, |
| "epoch": 0.0025, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5847192116780207e-05, |
| "kl": 5.291407283395529, |
| "learning_rate": 8.09927473595769e-05, |
| "loss": 0.0025, |
| "num_tokens": 4675749.0, |
| "reward": 0.642170786857605, |
| "reward_std": 0.5106069445610046, |
| "rewards/rollout_reward_func/mean": 0.642170786857605, |
| "rewards/rollout_reward_func/std": 0.4933558702468872, |
| "sampling/importance_sampling_ratio/max": 1.0000139474868774, |
| "sampling/importance_sampling_ratio/mean": 0.9999961256980896, |
| "sampling/importance_sampling_ratio/min": 0.9999502301216125, |
| "sampling/sampling_logp_difference/max": 2.9326114599825814e-05, |
| "sampling/sampling_logp_difference/mean": 2.8105505407438613e-06, |
| "step": 125, |
| "step_time": 11.311041927999213 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 265.96875, |
| "completions/mean_terminated_length": 265.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00011288215048921302, |
| "epoch": 0.00252, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.9042354071862064e-05, |
| "kl": 4.39733462408185, |
| "learning_rate": 8.060470760240294e-05, |
| "loss": 0.0024, |
| "num_tokens": 4714650.0, |
| "reward": 0.7424331307411194, |
| "reward_std": 0.5192468166351318, |
| "rewards/rollout_reward_func/mean": 0.7424331307411194, |
| "rewards/rollout_reward_func/std": 0.5208361744880676, |
| "sampling/importance_sampling_ratio/max": 1.0000265836715698, |
| "sampling/importance_sampling_ratio/mean": 0.9999986886978149, |
| "sampling/importance_sampling_ratio/min": 0.9999682307243347, |
| "sampling/sampling_logp_difference/max": 2.979917189804837e-05, |
| "sampling/sampling_logp_difference/mean": 2.8954166282346705e-06, |
| "step": 126, |
| "step_time": 11.478165510000053 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 230.59375, |
| "completions/mean_terminated_length": 230.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.604452432336984e-05, |
| "epoch": 0.00254, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.739314292441122e-05, |
| "kl": 5.027340762317181, |
| "learning_rate": 8.021412339226936e-05, |
| "loss": -0.003, |
| "num_tokens": 4752416.0, |
| "reward": 0.8005915880203247, |
| "reward_std": 0.4260421693325043, |
| "rewards/rollout_reward_func/mean": 0.8005915880203247, |
| "rewards/rollout_reward_func/std": 0.4247084856033325, |
| "sampling/importance_sampling_ratio/max": 1.0000354051589966, |
| "sampling/importance_sampling_ratio/mean": 0.9999985694885254, |
| "sampling/importance_sampling_ratio/min": 0.9999394416809082, |
| "sampling/sampling_logp_difference/max": 4.720142896985635e-05, |
| "sampling/sampling_logp_difference/mean": 3.4468034755263943e-06, |
| "step": 127, |
| "step_time": 10.348821588999726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 259.0625, |
| "completions/mean_terminated_length": 259.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010858213161668573, |
| "epoch": 0.00256, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.580048673436977e-05, |
| "kl": 5.223805829882622, |
| "learning_rate": 7.982104962225541e-05, |
| "loss": -0.0116, |
| "num_tokens": 4791731.0, |
| "reward": 0.7111831307411194, |
| "reward_std": 0.43767261505126953, |
| "rewards/rollout_reward_func/mean": 0.7111831307411194, |
| "rewards/rollout_reward_func/std": 0.5364737510681152, |
| "sampling/importance_sampling_ratio/max": 1.0000152587890625, |
| "sampling/importance_sampling_ratio/mean": 0.9999892711639404, |
| "sampling/importance_sampling_ratio/min": 0.9999455213546753, |
| "sampling/sampling_logp_difference/max": 4.4942185922991484e-05, |
| "sampling/sampling_logp_difference/mean": 3.922757969121449e-06, |
| "step": 128, |
| "step_time": 11.726808570999765 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 237.65625, |
| "completions/mean_terminated_length": 237.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.304604658666449e-05, |
| "epoch": 0.00258, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.209320468362421e-05, |
| "kl": 4.889099486870691, |
| "learning_rate": 7.94255415353255e-05, |
| "loss": -0.0002, |
| "num_tokens": 4829324.0, |
| "reward": 0.7699999809265137, |
| "reward_std": 0.4524250626564026, |
| "rewards/rollout_reward_func/mean": 0.7699999809265137, |
| "rewards/rollout_reward_func/std": 0.4433304965496063, |
| "sampling/importance_sampling_ratio/max": 1.0000271797180176, |
| "sampling/importance_sampling_ratio/mean": 0.9999895691871643, |
| "sampling/importance_sampling_ratio/min": 0.9998538494110107, |
| "sampling/sampling_logp_difference/max": 0.0001435376179870218, |
| "sampling/sampling_logp_difference/mean": 4.068881935381796e-06, |
| "step": 129, |
| "step_time": 10.755176805000701 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 208.875, |
| "completions/mean_terminated_length": 208.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.984883568847636e-05, |
| "epoch": 0.0026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.964017691439949e-05, |
| "kl": 4.550747729837894, |
| "learning_rate": 7.902765471656524e-05, |
| "loss": -0.0008, |
| "num_tokens": 4865040.0, |
| "reward": 0.7049331665039062, |
| "reward_std": 0.48293593525886536, |
| "rewards/rollout_reward_func/mean": 0.7049331665039062, |
| "rewards/rollout_reward_func/std": 0.4722973704338074, |
| "sampling/importance_sampling_ratio/max": 1.0000436305999756, |
| "sampling/importance_sampling_ratio/mean": 0.9999958276748657, |
| "sampling/importance_sampling_ratio/min": 0.999826967716217, |
| "sampling/sampling_logp_difference/max": 0.00013281148858368397, |
| "sampling/sampling_logp_difference/mean": 3.5681759982253425e-06, |
| "step": 130, |
| "step_time": 10.911314888001016 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 257.78125, |
| "completions/mean_terminated_length": 257.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010051704717284338, |
| "epoch": 0.00262, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.389891445403919e-05, |
| "kl": 5.61663281917572, |
| "learning_rate": 7.862744508536953e-05, |
| "loss": -0.0145, |
| "num_tokens": 4905023.0, |
| "reward": 0.8059207797050476, |
| "reward_std": 0.3718266785144806, |
| "rewards/rollout_reward_func/mean": 0.8059207797050476, |
| "rewards/rollout_reward_func/std": 0.42751839756965637, |
| "sampling/importance_sampling_ratio/max": 1.0000563859939575, |
| "sampling/importance_sampling_ratio/mean": 0.9999959468841553, |
| "sampling/importance_sampling_ratio/min": 0.9999661445617676, |
| "sampling/sampling_logp_difference/max": 6.972985283937305e-05, |
| "sampling/sampling_logp_difference/mean": 3.5875398225471145e-06, |
| "step": 131, |
| "step_time": 11.427704178000113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 251.46875, |
| "completions/mean_terminated_length": 251.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010139288470156771, |
| "epoch": 0.00264, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.9901922213612124e-05, |
| "kl": 5.053040578961372, |
| "learning_rate": 7.822496888758351e-05, |
| "loss": -0.0125, |
| "num_tokens": 4943228.0, |
| "reward": 0.7086831331253052, |
| "reward_std": 0.45560967922210693, |
| "rewards/rollout_reward_func/mean": 0.7086831331253052, |
| "rewards/rollout_reward_func/std": 0.5349801182746887, |
| "sampling/importance_sampling_ratio/max": 1.0000299215316772, |
| "sampling/importance_sampling_ratio/mean": 0.9999942779541016, |
| "sampling/importance_sampling_ratio/min": 0.999942421913147, |
| "sampling/sampling_logp_difference/max": 4.243637522449717e-05, |
| "sampling/sampling_logp_difference/mean": 3.3543090012244647e-06, |
| "step": 132, |
| "step_time": 10.83788310799946 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 166.5, |
| "completions/mean_terminated_length": 166.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.230003301117449e-05, |
| "epoch": 0.00266, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2420093955588527e-05, |
| "kl": 4.749419122934341, |
| "learning_rate": 7.782028268759781e-05, |
| "loss": 0.0022, |
| "num_tokens": 4978386.0, |
| "reward": 0.7043415307998657, |
| "reward_std": 0.5322943925857544, |
| "rewards/rollout_reward_func/mean": 0.7043415307998657, |
| "rewards/rollout_reward_func/std": 0.5347846150398254, |
| "sampling/importance_sampling_ratio/max": 1.000034213066101, |
| "sampling/importance_sampling_ratio/mean": 0.9999971389770508, |
| "sampling/importance_sampling_ratio/min": 0.9999690651893616, |
| "sampling/sampling_logp_difference/max": 5.220848834142089e-05, |
| "sampling/sampling_logp_difference/mean": 2.8167501113784965e-06, |
| "step": 133, |
| "step_time": 10.881401281999842 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 147.5, |
| "completions/mean_terminated_length": 147.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.72966548304521e-05, |
| "epoch": 0.00268, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.369628368294798e-05, |
| "kl": 5.001729235053062, |
| "learning_rate": 7.741344336039886e-05, |
| "loss": -0.0074, |
| "num_tokens": 5013351.0, |
| "reward": 0.7330915927886963, |
| "reward_std": 0.4458242356777191, |
| "rewards/rollout_reward_func/mean": 0.7330915927886963, |
| "rewards/rollout_reward_func/std": 0.4634391963481903, |
| "sampling/importance_sampling_ratio/max": 1.0000064373016357, |
| "sampling/importance_sampling_ratio/mean": 0.9999979734420776, |
| "sampling/importance_sampling_ratio/min": 0.9999818801879883, |
| "sampling/sampling_logp_difference/max": 2.2650128812529147e-05, |
| "sampling/sampling_logp_difference/mean": 1.603030568730901e-06, |
| "step": 134, |
| "step_time": 10.908871880999868 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 305.875, |
| "completions/mean_terminated_length": 305.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.725786842944672e-05, |
| "epoch": 0.0027, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.0055830797646195e-05, |
| "kl": 5.2666705548763275, |
| "learning_rate": 7.700450808357564e-05, |
| "loss": 0.0223, |
| "num_tokens": 5053684.0, |
| "reward": 0.6452623605728149, |
| "reward_std": 0.46077656745910645, |
| "rewards/rollout_reward_func/mean": 0.6452623605728149, |
| "rewards/rollout_reward_func/std": 0.4859411120414734, |
| "sampling/importance_sampling_ratio/max": 1.000022053718567, |
| "sampling/importance_sampling_ratio/mean": 0.9999962449073792, |
| "sampling/importance_sampling_ratio/min": 0.999959409236908, |
| "sampling/sampling_logp_difference/max": 3.814770025201142e-05, |
| "sampling/sampling_logp_difference/mean": 2.4756843686191132e-06, |
| "step": 135, |
| "step_time": 10.426622540999233 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 254.15625, |
| "completions/mean_terminated_length": 254.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.59571040479068e-05, |
| "epoch": 0.00272, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.841377712087706e-05, |
| "kl": 4.508408114314079, |
| "learning_rate": 7.659353432928393e-05, |
| "loss": -0.0018, |
| "num_tokens": 5091253.0, |
| "reward": 0.8015123605728149, |
| "reward_std": 0.36576953530311584, |
| "rewards/rollout_reward_func/mean": 0.8015123605728149, |
| "rewards/rollout_reward_func/std": 0.4206917881965637, |
| "sampling/importance_sampling_ratio/max": 1.0000181198120117, |
| "sampling/importance_sampling_ratio/mean": 0.999996542930603, |
| "sampling/importance_sampling_ratio/min": 0.9999696016311646, |
| "sampling/sampling_logp_difference/max": 3.4213535400340334e-05, |
| "sampling/sampling_logp_difference/mean": 2.4184487301681656e-06, |
| "step": 136, |
| "step_time": 11.579949559998568 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 217.71875, |
| "completions/mean_terminated_length": 217.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.95244829624653e-05, |
| "epoch": 0.00274, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.87391875544563e-05, |
| "kl": 4.599743388593197, |
| "learning_rate": 7.618057985616908e-05, |
| "loss": 0.0095, |
| "num_tokens": 5127521.0, |
| "reward": 0.6746707558631897, |
| "reward_std": 0.5018748044967651, |
| "rewards/rollout_reward_func/mean": 0.6746707558631897, |
| "rewards/rollout_reward_func/std": 0.48047077655792236, |
| "sampling/importance_sampling_ratio/max": 1.0000325441360474, |
| "sampling/importance_sampling_ratio/mean": 0.9999880790710449, |
| "sampling/importance_sampling_ratio/min": 0.9998170137405396, |
| "sampling/sampling_logp_difference/max": 0.00016952332225628197, |
| "sampling/sampling_logp_difference/mean": 4.749942490889225e-06, |
| "step": 137, |
| "step_time": 11.03167146700116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 259.0, |
| "completions/mean_terminated_length": 259.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.630876831399291e-05, |
| "epoch": 0.00276, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7878466426045634e-05, |
| "kl": 4.510915584862232, |
| "learning_rate": 7.576570270124853e-05, |
| "loss": 0.0126, |
| "num_tokens": 5165374.0, |
| "reward": 0.7396707534790039, |
| "reward_std": 0.4501022398471832, |
| "rewards/rollout_reward_func/mean": 0.7396707534790039, |
| "rewards/rollout_reward_func/std": 0.45358023047447205, |
| "sampling/importance_sampling_ratio/max": 1.0000441074371338, |
| "sampling/importance_sampling_ratio/mean": 0.9999974370002747, |
| "sampling/importance_sampling_ratio/min": 0.9999577403068542, |
| "sampling/sampling_logp_difference/max": 4.2433915950823575e-05, |
| "sampling/sampling_logp_difference/mean": 2.6292764232493937e-06, |
| "step": 138, |
| "step_time": 10.937990653000725 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 269.28125, |
| "completions/mean_terminated_length": 269.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.21896111915521e-05, |
| "epoch": 0.00278, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.454186975839548e-05, |
| "kl": 5.0575916320085526, |
| "learning_rate": 7.53489611717553e-05, |
| "loss": 0.0222, |
| "num_tokens": 5204102.0, |
| "reward": 0.7071039080619812, |
| "reward_std": 0.46030738949775696, |
| "rewards/rollout_reward_func/mean": 0.7071039080619812, |
| "rewards/rollout_reward_func/std": 0.46494990587234497, |
| "sampling/importance_sampling_ratio/max": 1.0000200271606445, |
| "sampling/importance_sampling_ratio/mean": 0.9999970197677612, |
| "sampling/importance_sampling_ratio/min": 0.999968409538269, |
| "sampling/sampling_logp_difference/max": 3.2663992897141725e-05, |
| "sampling/sampling_logp_difference/mean": 3.1937224775901996e-06, |
| "step": 139, |
| "step_time": 10.921687967998878 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 149.65625, |
| "completions/mean_terminated_length": 149.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.3915793799319545e-05, |
| "epoch": 0.0028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.388841625768691e-05, |
| "kl": 5.497643873095512, |
| "learning_rate": 7.49304138369434e-05, |
| "loss": -0.0117, |
| "num_tokens": 5238438.0, |
| "reward": 0.7018415927886963, |
| "reward_std": 0.49856051802635193, |
| "rewards/rollout_reward_func/mean": 0.7018415927886963, |
| "rewards/rollout_reward_func/std": 0.4783366620540619, |
| "sampling/importance_sampling_ratio/max": 1.0000146627426147, |
| "sampling/importance_sampling_ratio/mean": 0.9999945759773254, |
| "sampling/importance_sampling_ratio/min": 0.9999581575393677, |
| "sampling/sampling_logp_difference/max": 5.030676402384415e-05, |
| "sampling/sampling_logp_difference/mean": 2.4461330667691072e-06, |
| "step": 140, |
| "step_time": 10.575326776998736 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 244.78125, |
| "completions/mean_terminated_length": 244.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.234672896241136e-05, |
| "epoch": 0.00282, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.646247882395983e-05, |
| "kl": 5.654919885098934, |
| "learning_rate": 7.45101195198564e-05, |
| "loss": -0.0082, |
| "num_tokens": 5277978.0, |
| "reward": 0.49026232957839966, |
| "reward_std": 0.5400711297988892, |
| "rewards/rollout_reward_func/mean": 0.49026232957839966, |
| "rewards/rollout_reward_func/std": 0.5695264935493469, |
| "sampling/importance_sampling_ratio/max": 1.0000135898590088, |
| "sampling/importance_sampling_ratio/mean": 0.9999983310699463, |
| "sampling/importance_sampling_ratio/min": 0.9999774098396301, |
| "sampling/sampling_logp_difference/max": 2.241195397800766e-05, |
| "sampling/sampling_logp_difference/mean": 2.3845029772928683e-06, |
| "step": 141, |
| "step_time": 11.646618943000249 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 232.78125, |
| "completions/mean_terminated_length": 232.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010328506519385883, |
| "epoch": 0.00284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.90917952649761e-05, |
| "kl": 5.274715817227843, |
| "learning_rate": 7.408813728906053e-05, |
| "loss": -0.0046, |
| "num_tokens": 5315780.0, |
| "reward": 0.6771707534790039, |
| "reward_std": 0.49981045722961426, |
| "rewards/rollout_reward_func/mean": 0.6771707534790039, |
| "rewards/rollout_reward_func/std": 0.4876364767551422, |
| "sampling/importance_sampling_ratio/max": 1.0000065565109253, |
| "sampling/importance_sampling_ratio/mean": 0.9999916553497314, |
| "sampling/importance_sampling_ratio/min": 0.9999037981033325, |
| "sampling/sampling_logp_difference/max": 0.00010908614785876125, |
| "sampling/sampling_logp_difference/mean": 3.764534085348714e-06, |
| "step": 142, |
| "step_time": 10.2494408320008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 192.40625, |
| "completions/mean_terminated_length": 192.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.100872606396024e-05, |
| "epoch": 0.00286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.320547668612562e-05, |
| "kl": 4.761083319783211, |
| "learning_rate": 7.366452645034293e-05, |
| "loss": -0.0062, |
| "num_tokens": 5351444.0, |
| "reward": 0.703091561794281, |
| "reward_std": 0.44097042083740234, |
| "rewards/rollout_reward_func/mean": 0.703091561794281, |
| "rewards/rollout_reward_func/std": 0.4737764596939087, |
| "sampling/importance_sampling_ratio/max": 1.000017523765564, |
| "sampling/importance_sampling_ratio/mean": 0.9999960064888, |
| "sampling/importance_sampling_ratio/min": 0.9999603033065796, |
| "sampling/sampling_logp_difference/max": 3.778961399802938e-05, |
| "sampling/sampling_logp_difference/mean": 2.097834112646524e-06, |
| "step": 143, |
| "step_time": 11.270066180999493 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 218.09375, |
| "completions/mean_terminated_length": 218.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.329525671982083e-05, |
| "epoch": 0.00288, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.2181723731337115e-05, |
| "kl": 5.640114210546017, |
| "learning_rate": 7.32393465383769e-05, |
| "loss": 0.016, |
| "num_tokens": 5388860.0, |
| "reward": 0.5475000143051147, |
| "reward_std": 0.5258115530014038, |
| "rewards/rollout_reward_func/mean": 0.5475000143051147, |
| "rewards/rollout_reward_func/std": 0.5025290250778198, |
| "sampling/importance_sampling_ratio/max": 1.0000290870666504, |
| "sampling/importance_sampling_ratio/mean": 0.9999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999827742576599, |
| "sampling/sampling_logp_difference/max": 4.2435844079591334e-05, |
| "sampling/sampling_logp_difference/mean": 1.7509868257548078e-06, |
| "step": 144, |
| "step_time": 10.207922464999228 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 247.71875, |
| "completions/mean_terminated_length": 247.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.949859710583041e-05, |
| "epoch": 0.0029, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.013792103156447e-05, |
| "kl": 5.88670526444912, |
| "learning_rate": 7.281265730835482e-05, |
| "loss": -0.0042, |
| "num_tokens": 5428231.0, |
| "reward": 0.5490123629570007, |
| "reward_std": 0.4685298204421997, |
| "rewards/rollout_reward_func/mean": 0.5490123629570007, |
| "rewards/rollout_reward_func/std": 0.5122498869895935, |
| "sampling/importance_sampling_ratio/max": 1.0000377893447876, |
| "sampling/importance_sampling_ratio/mean": 0.9999960660934448, |
| "sampling/importance_sampling_ratio/min": 0.9999233484268188, |
| "sampling/sampling_logp_difference/max": 6.354815559461713e-05, |
| "sampling/sampling_logp_difference/mean": 3.663993084046524e-06, |
| "step": 145, |
| "step_time": 11.4028208120003 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 213.125, |
| "completions/mean_terminated_length": 213.125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.420744133275093e-05, |
| "epoch": 0.00292, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.134711005259305e-05, |
| "kl": 5.1818161606788635, |
| "learning_rate": 7.238451872759005e-05, |
| "loss": 0.0084, |
| "num_tokens": 5464823.0, |
| "reward": 0.8305915594100952, |
| "reward_std": 0.40696918964385986, |
| "rewards/rollout_reward_func/mean": 0.8305915594100952, |
| "rewards/rollout_reward_func/std": 0.396116703748703, |
| "sampling/importance_sampling_ratio/max": 1.0000261068344116, |
| "sampling/importance_sampling_ratio/mean": 0.9999996423721313, |
| "sampling/importance_sampling_ratio/min": 0.999976396560669, |
| "sampling/sampling_logp_difference/max": 2.4913773813750595e-05, |
| "sampling/sampling_logp_difference/mean": 2.3334573597821873e-06, |
| "step": 146, |
| "step_time": 10.321809514000051 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 303.0, |
| "completions/mean_terminated_length": 303.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010614871865755049, |
| "epoch": 0.00294, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.479826697614044e-05, |
| "kl": 5.353869080543518, |
| "learning_rate": 7.195499096708908e-05, |
| "loss": 0.0023, |
| "num_tokens": 5505107.0, |
| "reward": 0.6477623581886292, |
| "reward_std": 0.3948605954647064, |
| "rewards/rollout_reward_func/mean": 0.6477623581886292, |
| "rewards/rollout_reward_func/std": 0.489864706993103, |
| "sampling/importance_sampling_ratio/max": 1.0000009536743164, |
| "sampling/importance_sampling_ratio/mean": 0.9999884963035583, |
| "sampling/importance_sampling_ratio/min": 0.9998813271522522, |
| "sampling/sampling_logp_difference/max": 7.403669587802142e-05, |
| "sampling/sampling_logp_difference/mean": 3.598812554628239e-06, |
| "step": 147, |
| "step_time": 11.450413204000597 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 198.65625, |
| "completions/mean_terminated_length": 198.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.508765266095452e-05, |
| "epoch": 0.00296, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.576803909614682e-05, |
| "kl": 5.068997144699097, |
| "learning_rate": 7.152413439309507e-05, |
| "loss": -0.0066, |
| "num_tokens": 5540702.0, |
| "reward": 0.7034207582473755, |
| "reward_std": 0.3982081413269043, |
| "rewards/rollout_reward_func/mean": 0.7034207582473755, |
| "rewards/rollout_reward_func/std": 0.4712841808795929, |
| "sampling/importance_sampling_ratio/max": 1.000001072883606, |
| "sampling/importance_sampling_ratio/mean": 0.9999918937683105, |
| "sampling/importance_sampling_ratio/min": 0.9999278783798218, |
| "sampling/sampling_logp_difference/max": 3.9339749491773546e-05, |
| "sampling/sampling_logp_difference/mean": 2.7800087991636246e-06, |
| "step": 148, |
| "step_time": 10.281147137000517 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 182.40625, |
| "completions/mean_terminated_length": 182.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.207081082720833e-05, |
| "epoch": 0.00298, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.50255832017865e-05, |
| "kl": 5.564902618527412, |
| "learning_rate": 7.109200955860391e-05, |
| "loss": -0.0056, |
| "num_tokens": 5577184.0, |
| "reward": 0.7362499833106995, |
| "reward_std": 0.46878689527511597, |
| "rewards/rollout_reward_func/mean": 0.7362499833106995, |
| "rewards/rollout_reward_func/std": 0.4626832604408264, |
| "sampling/importance_sampling_ratio/max": 1.000007152557373, |
| "sampling/importance_sampling_ratio/mean": 0.9999955892562866, |
| "sampling/importance_sampling_ratio/min": 0.9999680519104004, |
| "sampling/sampling_logp_difference/max": 3.171010030200705e-05, |
| "sampling/sampling_logp_difference/mean": 2.3544655505247647e-06, |
| "step": 149, |
| "step_time": 11.345688200000495 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 292.65625, |
| "completions/mean_terminated_length": 292.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010053441897639459, |
| "epoch": 0.003, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2554990209755488e-05, |
| "kl": 4.625477809458971, |
| "learning_rate": 7.065867719485405e-05, |
| "loss": -0.004, |
| "num_tokens": 5617306.0, |
| "reward": 0.7446038722991943, |
| "reward_std": 0.5122137665748596, |
| "rewards/rollout_reward_func/mean": 0.7446038722991943, |
| "rewards/rollout_reward_func/std": 0.5244807600975037, |
| "sampling/importance_sampling_ratio/max": 1.00002920627594, |
| "sampling/importance_sampling_ratio/mean": 0.9999974370002747, |
| "sampling/importance_sampling_ratio/min": 0.9999731183052063, |
| "sampling/sampling_logp_difference/max": 3.337630914757028e-05, |
| "sampling/sampling_logp_difference/mean": 2.951200485767913e-06, |
| "step": 150, |
| "step_time": 10.506773899000109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 222.65625, |
| "completions/mean_terminated_length": 222.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.440468580772631e-05, |
| "epoch": 0.00302, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.207312304060906e-05, |
| "kl": 4.787262956320774, |
| "learning_rate": 7.022419820279119e-05, |
| "loss": 0.0178, |
| "num_tokens": 5654322.0, |
| "reward": 0.673420786857605, |
| "reward_std": 0.4894208014011383, |
| "rewards/rollout_reward_func/mean": 0.673420786857605, |
| "rewards/rollout_reward_func/std": 0.477554589509964, |
| "sampling/importance_sampling_ratio/max": 1.0000011920928955, |
| "sampling/importance_sampling_ratio/mean": 0.9999935626983643, |
| "sampling/importance_sampling_ratio/min": 0.9999630451202393, |
| "sampling/sampling_logp_difference/max": 3.206796827726066e-05, |
| "sampling/sampling_logp_difference/mean": 2.7845492240885505e-06, |
| "step": 151, |
| "step_time": 11.023018873000638 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 187.1875, |
| "completions/mean_terminated_length": 187.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.2811070446523445e-05, |
| "epoch": 0.00304, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.808218985388521e-05, |
| "kl": 5.205201223492622, |
| "learning_rate": 6.978863364450934e-05, |
| "loss": 0.0038, |
| "num_tokens": 5689557.0, |
| "reward": 0.6399999856948853, |
| "reward_std": 0.4861735701560974, |
| "rewards/rollout_reward_func/mean": 0.6399999856948853, |
| "rewards/rollout_reward_func/std": 0.49357154965400696, |
| "sampling/importance_sampling_ratio/max": 1.0000072717666626, |
| "sampling/importance_sampling_ratio/mean": 0.9999970197677612, |
| "sampling/importance_sampling_ratio/min": 0.9999759197235107, |
| "sampling/sampling_logp_difference/max": 2.753835724433884e-05, |
| "sampling/sampling_logp_difference/mean": 1.748717750160722e-06, |
| "step": 152, |
| "step_time": 10.42158465600005 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.0, |
| "completions/max_terminated_length": 649.0, |
| "completions/mean_length": 236.3125, |
| "completions/mean_terminated_length": 236.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.034046837224196e-05, |
| "epoch": 0.00306, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1351421057479456e-05, |
| "kl": 4.991829484701157, |
| "learning_rate": 6.935204473466904e-05, |
| "loss": 0.0026, |
| "num_tokens": 5727626.0, |
| "reward": 0.7721707820892334, |
| "reward_std": 0.4510332942008972, |
| "rewards/rollout_reward_func/mean": 0.7721707820892334, |
| "rewards/rollout_reward_func/std": 0.4416636824607849, |
| "sampling/importance_sampling_ratio/max": 1.0000555515289307, |
| "sampling/importance_sampling_ratio/mean": 0.9999988079071045, |
| "sampling/importance_sampling_ratio/min": 0.9999600648880005, |
| "sampling/sampling_logp_difference/max": 5.256607983028516e-05, |
| "sampling/sampling_logp_difference/mean": 2.8970951007067924e-06, |
| "step": 153, |
| "step_time": 11.246140894999826 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 285.09375, |
| "completions/mean_terminated_length": 285.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.816164853759801e-05, |
| "epoch": 0.00308, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.816457294509746e-05, |
| "kl": 5.398580640554428, |
| "learning_rate": 6.891449283189408e-05, |
| "loss": -0.0029, |
| "num_tokens": 5767888.0, |
| "reward": 0.7736831307411194, |
| "reward_std": 0.4541749954223633, |
| "rewards/rollout_reward_func/mean": 0.7736831307411194, |
| "rewards/rollout_reward_func/std": 0.4432695508003235, |
| "sampling/importance_sampling_ratio/max": 1.0000208616256714, |
| "sampling/importance_sampling_ratio/mean": 0.9999924302101135, |
| "sampling/importance_sampling_ratio/min": 0.9999641180038452, |
| "sampling/sampling_logp_difference/max": 4.196213558316231e-05, |
| "sampling/sampling_logp_difference/mean": 3.3789206099754665e-06, |
| "step": 154, |
| "step_time": 10.789620635000574 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 168.5, |
| "completions/mean_terminated_length": 168.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.32246807899628e-05, |
| "epoch": 0.0031, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.940055754967034e-05, |
| "kl": 3.8871512040495872, |
| "learning_rate": 6.847603943014831e-05, |
| "loss": -0.0094, |
| "num_tokens": 5801517.0, |
| "reward": 0.9227623343467712, |
| "reward_std": 0.2715562582015991, |
| "rewards/rollout_reward_func/mean": 0.9227623343467712, |
| "rewards/rollout_reward_func/std": 0.29780396819114685, |
| "sampling/importance_sampling_ratio/max": 1.000018835067749, |
| "sampling/importance_sampling_ratio/mean": 0.9999949932098389, |
| "sampling/importance_sampling_ratio/min": 0.9999276399612427, |
| "sampling/sampling_logp_difference/max": 7.188395102275535e-05, |
| "sampling/sampling_logp_difference/mean": 2.5055819605768193e-06, |
| "step": 155, |
| "step_time": 11.14235565700028 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 291.6875, |
| "completions/mean_terminated_length": 291.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.755855474224973e-05, |
| "epoch": 0.00312, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.6484132326440886e-05, |
| "kl": 5.283691808581352, |
| "learning_rate": 6.803674615009306e-05, |
| "loss": 0.0013, |
| "num_tokens": 5842047.0, |
| "reward": 0.6480915546417236, |
| "reward_std": 0.42874252796173096, |
| "rewards/rollout_reward_func/mean": 0.6480915546417236, |
| "rewards/rollout_reward_func/std": 0.49206840991973877, |
| "sampling/importance_sampling_ratio/max": 1.0000343322753906, |
| "sampling/importance_sampling_ratio/mean": 0.9999889135360718, |
| "sampling/importance_sampling_ratio/min": 0.9998984336853027, |
| "sampling/sampling_logp_difference/max": 5.508240428753197e-05, |
| "sampling/sampling_logp_difference/mean": 3.7744730434496887e-06, |
| "step": 156, |
| "step_time": 10.346119937999902 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 629.0, |
| "completions/max_terminated_length": 629.0, |
| "completions/mean_length": 192.3125, |
| "completions/mean_terminated_length": 192.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.816107470797306e-05, |
| "epoch": 0.00314, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.39569957961794e-05, |
| "kl": 5.069719024002552, |
| "learning_rate": 6.759667473042693e-05, |
| "loss": -0.0042, |
| "num_tokens": 5877952.0, |
| "reward": 0.6434208154678345, |
| "reward_std": 0.5129468441009521, |
| "rewards/rollout_reward_func/mean": 0.6434208154678345, |
| "rewards/rollout_reward_func/std": 0.49626046419143677, |
| "sampling/importance_sampling_ratio/max": 1.0000133514404297, |
| "sampling/importance_sampling_ratio/mean": 0.9999951124191284, |
| "sampling/importance_sampling_ratio/min": 0.9999496936798096, |
| "sampling/sampling_logp_difference/max": 3.862451922032051e-05, |
| "sampling/sampling_logp_difference/mean": 2.2066908513806993e-06, |
| "step": 157, |
| "step_time": 10.50308546999986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 155.96875, |
| "completions/mean_terminated_length": 155.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.922382314020979e-05, |
| "epoch": 0.00316, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.8020789108704776e-05, |
| "kl": 4.0720951023104135, |
| "learning_rate": 6.71558870192091e-05, |
| "loss": 0.0129, |
| "num_tokens": 5909654.0, |
| "reward": 0.7300000190734863, |
| "reward_std": 0.44941431283950806, |
| "rewards/rollout_reward_func/mean": 0.7300000190734863, |
| "rewards/rollout_reward_func/std": 0.45300430059432983, |
| "sampling/importance_sampling_ratio/max": 1.000025749206543, |
| "sampling/importance_sampling_ratio/mean": 0.9999992847442627, |
| "sampling/importance_sampling_ratio/min": 0.9999586343765259, |
| "sampling/sampling_logp_difference/max": 3.397205000510439e-05, |
| "sampling/sampling_logp_difference/mean": 2.13175098906504e-06, |
| "step": 158, |
| "step_time": 10.794820879999861 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 235.4375, |
| "completions/mean_terminated_length": 235.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.976622259775468e-05, |
| "epoch": 0.00318, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.966442353790626e-05, |
| "kl": 4.3015144392848015, |
| "learning_rate": 6.671444496516697e-05, |
| "loss": 0.0141, |
| "num_tokens": 5945688.0, |
| "reward": 0.7380915284156799, |
| "reward_std": 0.4598575234413147, |
| "rewards/rollout_reward_func/mean": 0.7380915284156799, |
| "rewards/rollout_reward_func/std": 0.45405706763267517, |
| "sampling/importance_sampling_ratio/max": 1.0000113248825073, |
| "sampling/importance_sampling_ratio/mean": 0.9999938011169434, |
| "sampling/importance_sampling_ratio/min": 0.9999215602874756, |
| "sampling/sampling_logp_difference/max": 8.154689567163587e-05, |
| "sampling/sampling_logp_difference/mean": 2.5779245333978906e-06, |
| "step": 159, |
| "step_time": 10.786944689001302 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 233.53125, |
| "completions/mean_terminated_length": 233.53125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.402424188602708e-05, |
| "epoch": 0.0032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.717422416433692e-05, |
| "kl": 5.396564111113548, |
| "learning_rate": 6.627241060898992e-05, |
| "loss": 0.0013, |
| "num_tokens": 5983663.0, |
| "reward": 0.6759207844734192, |
| "reward_std": 0.49697309732437134, |
| "rewards/rollout_reward_func/mean": 0.6759207844734192, |
| "rewards/rollout_reward_func/std": 0.48406854271888733, |
| "sampling/importance_sampling_ratio/max": 1.0000170469284058, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.9998974204063416, |
| "sampling/sampling_logp_difference/max": 0.00010955836478387937, |
| "sampling/sampling_logp_difference/mean": 2.9753630315099144e-06, |
| "step": 160, |
| "step_time": 10.427135486000225 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 659.0, |
| "completions/max_terminated_length": 659.0, |
| "completions/mean_length": 219.5625, |
| "completions/mean_terminated_length": 219.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.343402867832083e-05, |
| "epoch": 0.00322, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.0754632209427655e-05, |
| "kl": 4.563273504376411, |
| "learning_rate": 6.582984607461005e-05, |
| "loss": -0.0, |
| "num_tokens": 6020228.0, |
| "reward": 0.8640123605728149, |
| "reward_std": 0.32256150245666504, |
| "rewards/rollout_reward_func/mean": 0.8640123605728149, |
| "rewards/rollout_reward_func/std": 0.3677850067615509, |
| "sampling/importance_sampling_ratio/max": 1.0000296831130981, |
| "sampling/importance_sampling_ratio/mean": 0.9999971389770508, |
| "sampling/importance_sampling_ratio/min": 0.9999282360076904, |
| "sampling/sampling_logp_difference/max": 4.3036350689362735e-05, |
| "sampling/sampling_logp_difference/mean": 2.5196563910867553e-06, |
| "step": 161, |
| "step_time": 11.209509552999862 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 229.59375, |
| "completions/mean_terminated_length": 229.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.595711480869795e-05, |
| "epoch": 0.00324, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.432530790334567e-05, |
| "kl": 4.6679516807198524, |
| "learning_rate": 6.538681356047126e-05, |
| "loss": -0.0007, |
| "num_tokens": 6057489.0, |
| "reward": 0.7705915570259094, |
| "reward_std": 0.33682525157928467, |
| "rewards/rollout_reward_func/mean": 0.7705915570259094, |
| "rewards/rollout_reward_func/std": 0.4407337009906769, |
| "sampling/importance_sampling_ratio/max": 1.0000135898590088, |
| "sampling/importance_sampling_ratio/mean": 0.999994158744812, |
| "sampling/importance_sampling_ratio/min": 0.9999637603759766, |
| "sampling/sampling_logp_difference/max": 4.1843191866064444e-05, |
| "sampling/sampling_logp_difference/mean": 2.773166670522187e-06, |
| "step": 162, |
| "step_time": 10.208484574999602 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.0, |
| "completions/max_terminated_length": 649.0, |
| "completions/mean_length": 207.3125, |
| "completions/mean_terminated_length": 207.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.422831117764872e-05, |
| "epoch": 0.00326, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0685478375526145e-05, |
| "kl": 4.779223263263702, |
| "learning_rate": 6.494337533078768e-05, |
| "loss": 0.0052, |
| "num_tokens": 6093527.0, |
| "reward": 0.6440123319625854, |
| "reward_std": 0.5318774580955505, |
| "rewards/rollout_reward_func/mean": 0.6440123319625854, |
| "rewards/rollout_reward_func/std": 0.5527451038360596, |
| "sampling/importance_sampling_ratio/max": 1.0000094175338745, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.99996018409729, |
| "sampling/sampling_logp_difference/max": 2.408051113889087e-05, |
| "sampling/sampling_logp_difference/mean": 2.0797965589736123e-06, |
| "step": 163, |
| "step_time": 11.039077626001017 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 224.78125, |
| "completions/mean_terminated_length": 224.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010505592480569703, |
| "epoch": 0.00328, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.714211197802797e-05, |
| "kl": 4.768290672451258, |
| "learning_rate": 6.449959370679315e-05, |
| "loss": -0.0051, |
| "num_tokens": 6130535.0, |
| "reward": 0.6136831045150757, |
| "reward_std": 0.5993773937225342, |
| "rewards/rollout_reward_func/mean": 0.6136831045150757, |
| "rewards/rollout_reward_func/std": 0.6141568422317505, |
| "sampling/importance_sampling_ratio/max": 1.0000272989273071, |
| "sampling/importance_sampling_ratio/mean": 0.9999916553497314, |
| "sampling/importance_sampling_ratio/min": 0.9998797178268433, |
| "sampling/sampling_logp_difference/max": 0.00010908614785876125, |
| "sampling/sampling_logp_difference/mean": 3.312961098345113e-06, |
| "step": 164, |
| "step_time": 10.286591632999716 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 220.5625, |
| "completions/mean_terminated_length": 220.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.280044454522795e-05, |
| "epoch": 0.0033, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6241592422593385e-05, |
| "kl": 5.127960130572319, |
| "learning_rate": 6.40555310579825e-05, |
| "loss": 0.0136, |
| "num_tokens": 6167111.0, |
| "reward": 0.736512303352356, |
| "reward_std": 0.46028703451156616, |
| "rewards/rollout_reward_func/mean": 0.736512303352356, |
| "rewards/rollout_reward_func/std": 0.45586925745010376, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999931454658508, |
| "sampling/importance_sampling_ratio/min": 0.999862790107727, |
| "sampling/sampling_logp_difference/max": 0.00010932452278211713, |
| "sampling/sampling_logp_difference/mean": 3.687848220579326e-06, |
| "step": 165, |
| "step_time": 11.5190469510012 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 211.75, |
| "completions/mean_terminated_length": 211.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.565107993277252e-05, |
| "epoch": 0.00332, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7061549189966172e-05, |
| "kl": 5.664609462022781, |
| "learning_rate": 6.3611249793346e-05, |
| "loss": 0.0054, |
| "num_tokens": 6203530.0, |
| "reward": 0.5784207582473755, |
| "reward_std": 0.5085035562515259, |
| "rewards/rollout_reward_func/mean": 0.5784207582473755, |
| "rewards/rollout_reward_func/std": 0.5033032298088074, |
| "sampling/importance_sampling_ratio/max": 1.000006914138794, |
| "sampling/importance_sampling_ratio/mean": 0.9999963641166687, |
| "sampling/importance_sampling_ratio/min": 0.9999628663063049, |
| "sampling/sampling_logp_difference/max": 3.2067451684270054e-05, |
| "sampling/sampling_logp_difference/mean": 1.9461851934465813e-06, |
| "step": 166, |
| "step_time": 10.759499068000423 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 243.09375, |
| "completions/mean_terminated_length": 243.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.166922380643427e-05, |
| "epoch": 0.00334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.269746957812458e-05, |
| "kl": 5.353539019823074, |
| "learning_rate": 6.316681235259841e-05, |
| "loss": -0.0023, |
| "num_tokens": 6241522.0, |
| "reward": 0.7074331045150757, |
| "reward_std": 0.4500606656074524, |
| "rewards/rollout_reward_func/mean": 0.7074331045150757, |
| "rewards/rollout_reward_func/std": 0.5293745398521423, |
| "sampling/importance_sampling_ratio/max": 1.000022292137146, |
| "sampling/importance_sampling_ratio/mean": 0.9999964833259583, |
| "sampling/importance_sampling_ratio/min": 0.9999686479568481, |
| "sampling/sampling_logp_difference/max": 3.9577906136401e-05, |
| "sampling/sampling_logp_difference/mean": 2.4976732220238773e-06, |
| "step": 167, |
| "step_time": 10.744948780001323 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 316.25, |
| "completions/mean_terminated_length": 316.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.831737940795392e-05, |
| "epoch": 0.00336, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.081060589873232e-05, |
| "kl": 5.351280067116022, |
| "learning_rate": 6.272228119740365e-05, |
| "loss": -0.0001, |
| "num_tokens": 6283073.0, |
| "reward": 0.6187499761581421, |
| "reward_std": 0.5161277651786804, |
| "rewards/rollout_reward_func/mean": 0.6187499761581421, |
| "rewards/rollout_reward_func/std": 0.5022770762443542, |
| "sampling/importance_sampling_ratio/max": 1.0000226497650146, |
| "sampling/importance_sampling_ratio/mean": 0.9999954700469971, |
| "sampling/importance_sampling_ratio/min": 0.9999697208404541, |
| "sampling/sampling_logp_difference/max": 2.861044958990533e-05, |
| "sampling/sampling_logp_difference/mean": 2.5904687390720937e-06, |
| "step": 168, |
| "step_time": 10.471195426000122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 281.9375, |
| "completions/mean_terminated_length": 281.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.795457219434866e-05, |
| "epoch": 0.00338, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.220425009611063e-05, |
| "kl": 5.150362059473991, |
| "learning_rate": 6.227771880259637e-05, |
| "loss": 0.0149, |
| "num_tokens": 6323047.0, |
| "reward": 0.6793415546417236, |
| "reward_std": 0.5005303621292114, |
| "rewards/rollout_reward_func/mean": 0.6793415546417236, |
| "rewards/rollout_reward_func/std": 0.4792368412017822, |
| "sampling/importance_sampling_ratio/max": 1.00005042552948, |
| "sampling/importance_sampling_ratio/mean": 0.9999979138374329, |
| "sampling/importance_sampling_ratio/min": 0.9999761581420898, |
| "sampling/sampling_logp_difference/max": 4.2317173210904e-05, |
| "sampling/sampling_logp_difference/mean": 2.44653210756951e-06, |
| "step": 169, |
| "step_time": 11.402160383000137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 142.375, |
| "completions/mean_terminated_length": 142.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.836189982881933e-05, |
| "epoch": 0.0034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.064507109229453e-05, |
| "kl": 4.963020361959934, |
| "learning_rate": 6.183318764740161e-05, |
| "loss": -0.0042, |
| "num_tokens": 6356441.0, |
| "reward": 0.7005915641784668, |
| "reward_std": 0.48488616943359375, |
| "rewards/rollout_reward_func/mean": 0.7005915641784668, |
| "rewards/rollout_reward_func/std": 0.4747658967971802, |
| "sampling/importance_sampling_ratio/max": 1.000011682510376, |
| "sampling/importance_sampling_ratio/mean": 0.999997079372406, |
| "sampling/importance_sampling_ratio/min": 0.9999744892120361, |
| "sampling/sampling_logp_difference/max": 3.075617496506311e-05, |
| "sampling/sampling_logp_difference/mean": 1.6916058029892156e-06, |
| "step": 170, |
| "step_time": 10.331447902999571 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.0, |
| "completions/max_terminated_length": 649.0, |
| "completions/mean_length": 191.6875, |
| "completions/mean_terminated_length": 191.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.195598889353505e-05, |
| "epoch": 0.00342, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0986659112386405e-05, |
| "kl": 4.95799808204174, |
| "learning_rate": 6.138875020665402e-05, |
| "loss": -0.001, |
| "num_tokens": 6392772.0, |
| "reward": 0.7986831665039062, |
| "reward_std": 0.43947798013687134, |
| "rewards/rollout_reward_func/mean": 0.7986831665039062, |
| "rewards/rollout_reward_func/std": 0.4244628846645355, |
| "sampling/importance_sampling_ratio/max": 1.000011682510376, |
| "sampling/importance_sampling_ratio/mean": 0.9999972581863403, |
| "sampling/importance_sampling_ratio/min": 0.999981164932251, |
| "sampling/sampling_logp_difference/max": 2.6346318918513134e-05, |
| "sampling/sampling_logp_difference/mean": 2.3348138711298816e-06, |
| "step": 171, |
| "step_time": 10.487647148000178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 304.59375, |
| "completions/mean_terminated_length": 304.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.308483593566507e-05, |
| "epoch": 0.00344, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.993174686911516e-05, |
| "kl": 5.391621388494968, |
| "learning_rate": 6.0944468942017506e-05, |
| "loss": -0.0032, |
| "num_tokens": 6433785.0, |
| "reward": 0.6780915260314941, |
| "reward_std": 0.47621750831604004, |
| "rewards/rollout_reward_func/mean": 0.6780915260314941, |
| "rewards/rollout_reward_func/std": 0.4856433868408203, |
| "sampling/importance_sampling_ratio/max": 1.0000230073928833, |
| "sampling/importance_sampling_ratio/mean": 0.9999980330467224, |
| "sampling/importance_sampling_ratio/min": 0.9999696016311646, |
| "sampling/sampling_logp_difference/max": 3.433286474319175e-05, |
| "sampling/sampling_logp_difference/mean": 2.4288412987516494e-06, |
| "step": 172, |
| "step_time": 11.141727451000861 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 228.15625, |
| "completions/mean_terminated_length": 228.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.799076345487265e-05, |
| "epoch": 0.00346, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.5050336009589955e-05, |
| "kl": 5.1544738709926605, |
| "learning_rate": 6.050040629320685e-05, |
| "loss": -0.0035, |
| "num_tokens": 6471147.0, |
| "reward": 0.6752623319625854, |
| "reward_std": 0.36605003476142883, |
| "rewards/rollout_reward_func/mean": 0.6752623319625854, |
| "rewards/rollout_reward_func/std": 0.4842662811279297, |
| "sampling/importance_sampling_ratio/max": 1.0000275373458862, |
| "sampling/importance_sampling_ratio/mean": 0.9999979734420776, |
| "sampling/importance_sampling_ratio/min": 0.9999769926071167, |
| "sampling/sampling_logp_difference/max": 3.2067451684270054e-05, |
| "sampling/sampling_logp_difference/mean": 1.975222858163761e-06, |
| "step": 173, |
| "step_time": 10.814325706000545 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 216.21875, |
| "completions/mean_terminated_length": 216.21875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.060093649930877e-05, |
| "epoch": 0.00348, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.7824895116500556e-05, |
| "kl": 4.555030801682733, |
| "learning_rate": 6.0056624669212335e-05, |
| "loss": -0.0107, |
| "num_tokens": 6508087.0, |
| "reward": 0.8005247116088867, |
| "reward_std": 0.4093303382396698, |
| "rewards/rollout_reward_func/mean": 0.8005247116088867, |
| "rewards/rollout_reward_func/std": 0.4889991879463196, |
| "sampling/importance_sampling_ratio/max": 1.0000141859054565, |
| "sampling/importance_sampling_ratio/mean": 0.9999945163726807, |
| "sampling/importance_sampling_ratio/min": 0.9999269247055054, |
| "sampling/sampling_logp_difference/max": 6.58156059216708e-05, |
| "sampling/sampling_logp_difference/mean": 2.803930328809656e-06, |
| "step": 174, |
| "step_time": 10.91736506000052 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 224.6875, |
| "completions/mean_terminated_length": 224.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.575894181854892e-05, |
| "epoch": 0.0035, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.9673146855202504e-05, |
| "kl": 4.930005133152008, |
| "learning_rate": 5.961318643952876e-05, |
| "loss": -0.001, |
| "num_tokens": 6545080.0, |
| "reward": 0.5815123319625854, |
| "reward_std": 0.5429329872131348, |
| "rewards/rollout_reward_func/mean": 0.5815123319625854, |
| "rewards/rollout_reward_func/std": 0.5628648996353149, |
| "sampling/importance_sampling_ratio/max": 1.0000128746032715, |
| "sampling/importance_sampling_ratio/mean": 0.9999929666519165, |
| "sampling/importance_sampling_ratio/min": 0.9999529123306274, |
| "sampling/sampling_logp_difference/max": 5.090353079140186e-05, |
| "sampling/sampling_logp_difference/mean": 2.7515507099451497e-06, |
| "step": 175, |
| "step_time": 11.003749063999749 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 260.0, |
| "completions/mean_terminated_length": 260.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.6100228437449e-05, |
| "epoch": 0.00352, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7575541278347373e-05, |
| "kl": 4.587189957499504, |
| "learning_rate": 5.917015392538995e-05, |
| "loss": 0.0078, |
| "num_tokens": 6582632.0, |
| "reward": 0.7384207844734192, |
| "reward_std": 0.4618578553199768, |
| "rewards/rollout_reward_func/mean": 0.7384207844734192, |
| "rewards/rollout_reward_func/std": 0.4556211829185486, |
| "sampling/importance_sampling_ratio/max": 1.000009298324585, |
| "sampling/importance_sampling_ratio/mean": 0.9999972581863403, |
| "sampling/importance_sampling_ratio/min": 0.9999776482582092, |
| "sampling/sampling_logp_difference/max": 2.0385952666401863e-05, |
| "sampling/sampling_logp_difference/mean": 2.221093836851651e-06, |
| "step": 176, |
| "step_time": 10.7899746460007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 235.1875, |
| "completions/mean_terminated_length": 235.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.040411874153051e-05, |
| "epoch": 0.00354, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.335916335345246e-05, |
| "kl": 5.1722188803069, |
| "learning_rate": 5.872758939101011e-05, |
| "loss": -0.0138, |
| "num_tokens": 6620470.0, |
| "reward": 0.6118415594100952, |
| "reward_std": 0.3704405725002289, |
| "rewards/rollout_reward_func/mean": 0.6118415594100952, |
| "rewards/rollout_reward_func/std": 0.5016486644744873, |
| "sampling/importance_sampling_ratio/max": 1.0000079870224, |
| "sampling/importance_sampling_ratio/mean": 0.9999971389770508, |
| "sampling/importance_sampling_ratio/min": 0.9999728202819824, |
| "sampling/sampling_logp_difference/max": 2.1219901100266725e-05, |
| "sampling/sampling_logp_difference/mean": 1.7425376199753373e-06, |
| "step": 177, |
| "step_time": 10.707822592999946 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 255.03125, |
| "completions/mean_terminated_length": 255.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.830880042230092e-05, |
| "epoch": 0.00356, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.9521069993497804e-05, |
| "kl": 5.192914575338364, |
| "learning_rate": 5.828555503483305e-05, |
| "loss": -0.0072, |
| "num_tokens": 6659130.0, |
| "reward": 0.6134207844734192, |
| "reward_std": 0.48705360293388367, |
| "rewards/rollout_reward_func/mean": 0.6134207844734192, |
| "rewards/rollout_reward_func/std": 0.5055395364761353, |
| "sampling/importance_sampling_ratio/max": 1.0000206232070923, |
| "sampling/importance_sampling_ratio/mean": 0.9999992847442627, |
| "sampling/importance_sampling_ratio/min": 0.9999740123748779, |
| "sampling/sampling_logp_difference/max": 2.705884980969131e-05, |
| "sampling/sampling_logp_difference/mean": 2.5482090677542146e-06, |
| "step": 178, |
| "step_time": 10.992413979999128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 225.25, |
| "completions/mean_terminated_length": 225.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.845150574325999e-05, |
| "epoch": 0.00358, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7154950657859445e-05, |
| "kl": 4.836911082267761, |
| "learning_rate": 5.784411298079091e-05, |
| "loss": 0.001, |
| "num_tokens": 6696650.0, |
| "reward": 0.6777623891830444, |
| "reward_std": 0.5198129415512085, |
| "rewards/rollout_reward_func/mean": 0.6777623891830444, |
| "rewards/rollout_reward_func/std": 0.5467191338539124, |
| "sampling/importance_sampling_ratio/max": 1.0000206232070923, |
| "sampling/importance_sampling_ratio/mean": 0.9999957084655762, |
| "sampling/importance_sampling_ratio/min": 0.999954104423523, |
| "sampling/sampling_logp_difference/max": 3.7670324672944844e-05, |
| "sampling/sampling_logp_difference/mean": 3.4365023111604387e-06, |
| "step": 179, |
| "step_time": 10.803954972000156 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 295.0625, |
| "completions/mean_terminated_length": 295.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.146770800327886e-05, |
| "epoch": 0.0036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.348440052126534e-05, |
| "kl": 5.05779367685318, |
| "learning_rate": 5.740332526957307e-05, |
| "loss": 0.0057, |
| "num_tokens": 6736701.0, |
| "reward": 0.6796707510948181, |
| "reward_std": 0.46409231424331665, |
| "rewards/rollout_reward_func/mean": 0.6796707510948181, |
| "rewards/rollout_reward_func/std": 0.48484089970588684, |
| "sampling/importance_sampling_ratio/max": 1.0000330209732056, |
| "sampling/importance_sampling_ratio/mean": 0.999993622303009, |
| "sampling/importance_sampling_ratio/min": 0.9999408721923828, |
| "sampling/sampling_logp_difference/max": 5.281723861116916e-05, |
| "sampling/sampling_logp_difference/mean": 3.24379061567015e-06, |
| "step": 180, |
| "step_time": 11.004632821999621 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 307.5625, |
| "completions/mean_terminated_length": 307.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.063932723383459e-05, |
| "epoch": 0.00362, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0221406884957105e-05, |
| "kl": 5.286682575941086, |
| "learning_rate": 5.696325384990696e-05, |
| "loss": 0.0138, |
| "num_tokens": 6777629.0, |
| "reward": 0.5859208106994629, |
| "reward_std": 0.46358904242515564, |
| "rewards/rollout_reward_func/mean": 0.5859208106994629, |
| "rewards/rollout_reward_func/std": 0.5023459792137146, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999953508377075, |
| "sampling/importance_sampling_ratio/min": 0.9999548196792603, |
| "sampling/sampling_logp_difference/max": 4.208114114589989e-05, |
| "sampling/sampling_logp_difference/mean": 2.680135366972536e-06, |
| "step": 181, |
| "step_time": 11.124628821999067 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 659.0, |
| "completions/max_terminated_length": 659.0, |
| "completions/mean_length": 222.6875, |
| "completions/mean_terminated_length": 222.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.661826836624641e-05, |
| "epoch": 0.00364, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.2959258027840406e-05, |
| "kl": 5.0524120181798935, |
| "learning_rate": 5.6523960569851695e-05, |
| "loss": -0.0093, |
| "num_tokens": 6814561.0, |
| "reward": 0.7361831665039062, |
| "reward_std": 0.4711652994155884, |
| "rewards/rollout_reward_func/mean": 0.7361831665039062, |
| "rewards/rollout_reward_func/std": 0.4640645682811737, |
| "sampling/importance_sampling_ratio/max": 1.0000171661376953, |
| "sampling/importance_sampling_ratio/mean": 0.9999977350234985, |
| "sampling/importance_sampling_ratio/min": 0.9999733567237854, |
| "sampling/sampling_logp_difference/max": 3.1111223506741226e-05, |
| "sampling/sampling_logp_difference/mean": 2.562888994361856e-06, |
| "step": 182, |
| "step_time": 10.962344157000189 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 649.0, |
| "completions/max_terminated_length": 649.0, |
| "completions/mean_length": 253.375, |
| "completions/mean_terminated_length": 253.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.879004907351828e-05, |
| "epoch": 0.00366, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.388055094750598e-05, |
| "kl": 5.53626848757267, |
| "learning_rate": 5.6085507168105936e-05, |
| "loss": 0.0083, |
| "num_tokens": 6853636.0, |
| "reward": 0.6771707534790039, |
| "reward_std": 0.47990167140960693, |
| "rewards/rollout_reward_func/mean": 0.6771707534790039, |
| "rewards/rollout_reward_func/std": 0.4823152720928192, |
| "sampling/importance_sampling_ratio/max": 1.000022053718567, |
| "sampling/importance_sampling_ratio/mean": 0.999996542930603, |
| "sampling/importance_sampling_ratio/min": 0.9998987913131714, |
| "sampling/sampling_logp_difference/max": 8.393276948481798e-05, |
| "sampling/sampling_logp_difference/mean": 3.3215160328836646e-06, |
| "step": 183, |
| "step_time": 10.043853558999217 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 278.5, |
| "completions/mean_terminated_length": 278.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010487484094312549, |
| "epoch": 0.00368, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7909642085433006e-05, |
| "kl": 4.555940642952919, |
| "learning_rate": 5.5647955265330974e-05, |
| "loss": 0.0018, |
| "num_tokens": 6893156.0, |
| "reward": 0.6486831307411194, |
| "reward_std": 0.5506794452667236, |
| "rewards/rollout_reward_func/mean": 0.6486831307411194, |
| "rewards/rollout_reward_func/std": 0.5536602139472961, |
| "sampling/importance_sampling_ratio/max": 1.000025987625122, |
| "sampling/importance_sampling_ratio/mean": 0.9999922513961792, |
| "sampling/importance_sampling_ratio/min": 0.9998563528060913, |
| "sampling/sampling_logp_difference/max": 0.00014341842324938625, |
| "sampling/sampling_logp_difference/mean": 3.5290327105030883e-06, |
| "step": 184, |
| "step_time": 11.444932727999912 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 215.75, |
| "completions/mean_terminated_length": 215.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.568410264013892e-05, |
| "epoch": 0.0037, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.518177618389018e-05, |
| "kl": 4.69642661511898, |
| "learning_rate": 5.5211366355490666e-05, |
| "loss": -0.0085, |
| "num_tokens": 6930186.0, |
| "reward": 0.8646707534790039, |
| "reward_std": 0.3878023624420166, |
| "rewards/rollout_reward_func/mean": 0.8646707534790039, |
| "rewards/rollout_reward_func/std": 0.37504494190216064, |
| "sampling/importance_sampling_ratio/max": 1.0000090599060059, |
| "sampling/importance_sampling_ratio/mean": 0.9999953508377075, |
| "sampling/importance_sampling_ratio/min": 0.9999690651893616, |
| "sampling/sampling_logp_difference/max": 4.506219192990102e-05, |
| "sampling/sampling_logp_difference/mean": 2.9929221909696935e-06, |
| "step": 185, |
| "step_time": 10.029982729999574 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 193.375, |
| "completions/mean_terminated_length": 193.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.111554115761919e-05, |
| "epoch": 0.00372, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.515595744829625e-05, |
| "kl": 5.0036322847008705, |
| "learning_rate": 5.4775801797208824e-05, |
| "loss": -0.0062, |
| "num_tokens": 6966304.0, |
| "reward": 0.7059208154678345, |
| "reward_std": 0.4823508858680725, |
| "rewards/rollout_reward_func/mean": 0.7059208154678345, |
| "rewards/rollout_reward_func/std": 0.47572779655456543, |
| "sampling/importance_sampling_ratio/max": 1.0000250339508057, |
| "sampling/importance_sampling_ratio/mean": 0.9999973773956299, |
| "sampling/importance_sampling_ratio/min": 0.9999443292617798, |
| "sampling/sampling_logp_difference/max": 4.649218681151979e-05, |
| "sampling/sampling_logp_difference/mean": 2.706846771616256e-06, |
| "step": 186, |
| "step_time": 11.153666612000507 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 225.21875, |
| "completions/mean_terminated_length": 225.21875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.528406706886926e-05, |
| "epoch": 0.00374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1905819923849776e-05, |
| "kl": 4.318456851877272, |
| "learning_rate": 5.434132280514597e-05, |
| "loss": -0.0056, |
| "num_tokens": 7003007.0, |
| "reward": 0.7396707534790039, |
| "reward_std": 0.4821699857711792, |
| "rewards/rollout_reward_func/mean": 0.7396707534790039, |
| "rewards/rollout_reward_func/std": 0.4620356559753418, |
| "sampling/importance_sampling_ratio/max": 1.0000288486480713, |
| "sampling/importance_sampling_ratio/mean": 0.9999913573265076, |
| "sampling/importance_sampling_ratio/min": 0.9999553561210632, |
| "sampling/sampling_logp_difference/max": 4.5657627197215334e-05, |
| "sampling/sampling_logp_difference/mean": 3.5564712561608758e-06, |
| "step": 187, |
| "step_time": 10.209738572000333 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 156.25, |
| "completions/mean_terminated_length": 156.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.4102910081278424e-05, |
| "epoch": 0.00376, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1708140846167225e-05, |
| "kl": 4.209350232034922, |
| "learning_rate": 5.39079904413961e-05, |
| "loss": 0.008, |
| "num_tokens": 7036551.0, |
| "reward": 0.6680915355682373, |
| "reward_std": 0.49390894174575806, |
| "rewards/rollout_reward_func/mean": 0.6680915355682373, |
| "rewards/rollout_reward_func/std": 0.4823147654533386, |
| "sampling/importance_sampling_ratio/max": 1.000010371208191, |
| "sampling/importance_sampling_ratio/mean": 0.9999985694885254, |
| "sampling/importance_sampling_ratio/min": 0.9999769926071167, |
| "sampling/sampling_logp_difference/max": 2.1696479961974546e-05, |
| "sampling/sampling_logp_difference/mean": 1.3542378383135656e-06, |
| "step": 188, |
| "step_time": 11.282029985001373 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 211.25, |
| "completions/mean_terminated_length": 211.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.414088152302156e-05, |
| "epoch": 0.00378, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.6090696085011587e-05, |
| "kl": 4.265987630002201, |
| "learning_rate": 5.347586560690494e-05, |
| "loss": -0.0067, |
| "num_tokens": 7072324.0, |
| "reward": 0.7690123319625854, |
| "reward_std": 0.37984663248062134, |
| "rewards/rollout_reward_func/mean": 0.7690123319625854, |
| "rewards/rollout_reward_func/std": 0.43979594111442566, |
| "sampling/importance_sampling_ratio/max": 1.0000287294387817, |
| "sampling/importance_sampling_ratio/mean": 0.9999973177909851, |
| "sampling/importance_sampling_ratio/min": 0.9999397397041321, |
| "sampling/sampling_logp_difference/max": 4.434636502992362e-05, |
| "sampling/sampling_logp_difference/mean": 2.806390966725303e-06, |
| "step": 189, |
| "step_time": 10.240539944999455 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 217.5625, |
| "completions/mean_terminated_length": 217.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.151194306549314e-05, |
| "epoch": 0.0038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.1802683477289975e-05, |
| "kl": 5.158947452902794, |
| "learning_rate": 5.304500903291094e-05, |
| "loss": 0.0011, |
| "num_tokens": 7109869.0, |
| "reward": 0.7086831331253052, |
| "reward_std": 0.5108458399772644, |
| "rewards/rollout_reward_func/mean": 0.7086831331253052, |
| "rewards/rollout_reward_func/std": 0.5356149673461914, |
| "sampling/importance_sampling_ratio/max": 1.0000258684158325, |
| "sampling/importance_sampling_ratio/mean": 0.9999958872795105, |
| "sampling/importance_sampling_ratio/min": 0.9999440908432007, |
| "sampling/sampling_logp_difference/max": 2.7300320653012022e-05, |
| "sampling/sampling_logp_difference/mean": 2.319904979231069e-06, |
| "step": 190, |
| "step_time": 11.589442052000777 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 167.09375, |
| "completions/mean_terminated_length": 167.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.836062217199014e-05, |
| "epoch": 0.00382, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.9179961933987215e-05, |
| "kl": 3.6461506029590964, |
| "learning_rate": 5.261548127240997e-05, |
| "loss": -0.0062, |
| "num_tokens": 7142827.0, |
| "reward": 0.7652623057365417, |
| "reward_std": 0.3935619592666626, |
| "rewards/rollout_reward_func/mean": 0.7652623057365417, |
| "rewards/rollout_reward_func/std": 0.5054081082344055, |
| "sampling/importance_sampling_ratio/max": 1.0000618696212769, |
| "sampling/importance_sampling_ratio/mean": 0.9999986290931702, |
| "sampling/importance_sampling_ratio/min": 0.9999737739562988, |
| "sampling/sampling_logp_difference/max": 9.068677900359035e-05, |
| "sampling/sampling_logp_difference/mean": 2.8852020932390587e-06, |
| "step": 191, |
| "step_time": 10.1199132500019 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 295.90625, |
| "completions/mean_terminated_length": 295.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.60994704762652e-05, |
| "epoch": 0.00384, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.605450547183864e-05, |
| "kl": 4.935833718627691, |
| "learning_rate": 5.218734269164519e-05, |
| "loss": 0.0128, |
| "num_tokens": 7183293.0, |
| "reward": 0.5855915546417236, |
| "reward_std": 0.5059728026390076, |
| "rewards/rollout_reward_func/mean": 0.5855915546417236, |
| "rewards/rollout_reward_func/std": 0.5033941864967346, |
| "sampling/importance_sampling_ratio/max": 1.0000156164169312, |
| "sampling/importance_sampling_ratio/mean": 0.9999943971633911, |
| "sampling/importance_sampling_ratio/min": 0.9999115467071533, |
| "sampling/sampling_logp_difference/max": 8.84538603713736e-05, |
| "sampling/sampling_logp_difference/mean": 2.977001713588834e-06, |
| "step": 192, |
| "step_time": 11.594122014000277 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 224.625, |
| "completions/mean_terminated_length": 224.625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.150646666786997e-05, |
| "epoch": 0.00386, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4954373909858987e-05, |
| "kl": 4.582384366542101, |
| "learning_rate": 5.176065346162311e-05, |
| "loss": -0.0057, |
| "num_tokens": 7220905.0, |
| "reward": 0.6765123009681702, |
| "reward_std": 0.5298900604248047, |
| "rewards/rollout_reward_func/mean": 0.6765123009681702, |
| "rewards/rollout_reward_func/std": 0.5459093451499939, |
| "sampling/importance_sampling_ratio/max": 1.0000386238098145, |
| "sampling/importance_sampling_ratio/mean": 0.9999974370002747, |
| "sampling/importance_sampling_ratio/min": 0.9999622106552124, |
| "sampling/sampling_logp_difference/max": 3.5402728826738894e-05, |
| "sampling/sampling_logp_difference/mean": 2.2574688500753837e-06, |
| "step": 193, |
| "step_time": 10.853508633000274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 350.03125, |
| "completions/mean_terminated_length": 350.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.001271817192901e-05, |
| "epoch": 0.00388, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.810703492490575e-05, |
| "kl": 5.076992951333523, |
| "learning_rate": 5.1335473549657084e-05, |
| "loss": 0.0259, |
| "num_tokens": 7262276.0, |
| "reward": 0.6180915236473083, |
| "reward_std": 0.4720328152179718, |
| "rewards/rollout_reward_func/mean": 0.6180915236473083, |
| "rewards/rollout_reward_func/std": 0.49200955033302307, |
| "sampling/importance_sampling_ratio/max": 1.0000076293945312, |
| "sampling/importance_sampling_ratio/mean": 0.999992847442627, |
| "sampling/importance_sampling_ratio/min": 0.9999713897705078, |
| "sampling/sampling_logp_difference/max": 2.563028465374373e-05, |
| "sampling/sampling_logp_difference/mean": 2.8041777113685384e-06, |
| "step": 194, |
| "step_time": 10.52614910399916 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 172.90625, |
| "completions/mean_terminated_length": 172.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.115065123106888e-05, |
| "epoch": 0.0039, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.7086392189376056e-05, |
| "kl": 5.180005930364132, |
| "learning_rate": 5.0911862710939485e-05, |
| "loss": -0.0109, |
| "num_tokens": 7298179.0, |
| "reward": 0.578091561794281, |
| "reward_std": 0.4882219433784485, |
| "rewards/rollout_reward_func/mean": 0.578091561794281, |
| "rewards/rollout_reward_func/std": 0.5080995559692383, |
| "sampling/importance_sampling_ratio/max": 1.0000019073486328, |
| "sampling/importance_sampling_ratio/mean": 0.9999942183494568, |
| "sampling/importance_sampling_ratio/min": 0.9999573230743408, |
| "sampling/sampling_logp_difference/max": 3.194832243025303e-05, |
| "sampling/sampling_logp_difference/mean": 2.1487360299943248e-06, |
| "step": 195, |
| "step_time": 11.431366815999809 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 247.6875, |
| "completions/mean_terminated_length": 247.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.820983522004553e-05, |
| "epoch": 0.00392, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9182505639037117e-05, |
| "kl": 4.828364592045546, |
| "learning_rate": 5.0489880480143605e-05, |
| "loss": 0.0046, |
| "num_tokens": 7336132.0, |
| "reward": 0.706183135509491, |
| "reward_std": 0.4439047574996948, |
| "rewards/rollout_reward_func/mean": 0.706183135509491, |
| "rewards/rollout_reward_func/std": 0.473155677318573, |
| "sampling/importance_sampling_ratio/max": 1.000019907951355, |
| "sampling/importance_sampling_ratio/mean": 0.9999977350234985, |
| "sampling/importance_sampling_ratio/min": 0.9999704360961914, |
| "sampling/sampling_logp_difference/max": 4.1127488657366484e-05, |
| "sampling/sampling_logp_difference/mean": 1.965895080502378e-06, |
| "step": 196, |
| "step_time": 10.270143649001056 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 243.25, |
| "completions/mean_terminated_length": 243.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.612786135448914e-05, |
| "epoch": 0.00394, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.341598393395543e-05, |
| "kl": 5.151996046304703, |
| "learning_rate": 5.0069586163056615e-05, |
| "loss": -0.0132, |
| "num_tokens": 7375208.0, |
| "reward": 0.6177623271942139, |
| "reward_std": 0.5074785947799683, |
| "rewards/rollout_reward_func/mean": 0.6177623271942139, |
| "rewards/rollout_reward_func/std": 0.6169306039810181, |
| "sampling/importance_sampling_ratio/max": 1.0000261068344116, |
| "sampling/importance_sampling_ratio/mean": 0.9999943971633911, |
| "sampling/importance_sampling_ratio/min": 0.9998465776443481, |
| "sampling/sampling_logp_difference/max": 0.00015128619270399213, |
| "sampling/sampling_logp_difference/mean": 3.3790561246860307e-06, |
| "step": 197, |
| "step_time": 10.915793876999032 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 254.09375, |
| "completions/mean_terminated_length": 254.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.961041109500911e-05, |
| "epoch": 0.00396, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.339655060903169e-05, |
| "kl": 5.318888820707798, |
| "learning_rate": 4.96510388282447e-05, |
| "loss": 0.0028, |
| "num_tokens": 7413139.0, |
| "reward": 0.6443415880203247, |
| "reward_std": 0.4514845907688141, |
| "rewards/rollout_reward_func/mean": 0.6443415880203247, |
| "rewards/rollout_reward_func/std": 0.49244067072868347, |
| "sampling/importance_sampling_ratio/max": 1.0000090599060059, |
| "sampling/importance_sampling_ratio/mean": 0.9999953508377075, |
| "sampling/importance_sampling_ratio/min": 0.9998794198036194, |
| "sampling/sampling_logp_difference/max": 9.40619720495306e-05, |
| "sampling/sampling_logp_difference/mean": 2.942206265288405e-06, |
| "step": 198, |
| "step_time": 10.681011859999671 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 242.28125, |
| "completions/mean_terminated_length": 242.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.84905787330581e-05, |
| "epoch": 0.00398, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2986649128142744e-05, |
| "kl": 5.349530559033155, |
| "learning_rate": 4.9234297298751484e-05, |
| "loss": 0.0035, |
| "num_tokens": 7451845.0, |
| "reward": 0.5530915260314941, |
| "reward_std": 0.5549728870391846, |
| "rewards/rollout_reward_func/mean": 0.5530915260314941, |
| "rewards/rollout_reward_func/std": 0.5644925236701965, |
| "sampling/importance_sampling_ratio/max": 1.000011682510376, |
| "sampling/importance_sampling_ratio/mean": 0.9999925494194031, |
| "sampling/importance_sampling_ratio/min": 0.9999537467956543, |
| "sampling/sampling_logp_difference/max": 3.862451922032051e-05, |
| "sampling/sampling_logp_difference/mean": 2.9472043934219982e-06, |
| "step": 199, |
| "step_time": 10.867003692000253 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 271.6875, |
| "completions/mean_terminated_length": 271.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.556861286024287e-05, |
| "epoch": 0.004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.539603105513379e-05, |
| "kl": 5.291132360696793, |
| "learning_rate": 4.881942014383094e-05, |
| "loss": 0.0038, |
| "num_tokens": 7491724.0, |
| "reward": 0.7399331331253052, |
| "reward_std": 0.4636785089969635, |
| "rewards/rollout_reward_func/mean": 0.7399331331253052, |
| "rewards/rollout_reward_func/std": 0.4593670070171356, |
| "sampling/importance_sampling_ratio/max": 1.000006079673767, |
| "sampling/importance_sampling_ratio/mean": 0.9999874830245972, |
| "sampling/importance_sampling_ratio/min": 0.9997283816337585, |
| "sampling/sampling_logp_difference/max": 0.00025416005519218743, |
| "sampling/sampling_logp_difference/mean": 3.923657914128853e-06, |
| "step": 200, |
| "step_time": 11.005329466000148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 221.09375, |
| "completions/mean_terminated_length": 221.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.380929744726927e-05, |
| "epoch": 0.00402, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.2547788072843105e-05, |
| "kl": 5.06697791069746, |
| "learning_rate": 4.8406465670716076e-05, |
| "loss": -0.0034, |
| "num_tokens": 7529039.0, |
| "reward": 0.7384207844734192, |
| "reward_std": 0.48120927810668945, |
| "rewards/rollout_reward_func/mean": 0.7384207844734192, |
| "rewards/rollout_reward_func/std": 0.4612503945827484, |
| "sampling/importance_sampling_ratio/max": 1.000004529953003, |
| "sampling/importance_sampling_ratio/mean": 0.9999886155128479, |
| "sampling/importance_sampling_ratio/min": 0.999925434589386, |
| "sampling/sampling_logp_difference/max": 8.261243056040257e-05, |
| "sampling/sampling_logp_difference/mean": 3.649963218776975e-06, |
| "step": 201, |
| "step_time": 11.135630174998823 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 218.53125, |
| "completions/mean_terminated_length": 218.53125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.144260962694716e-05, |
| "epoch": 0.00404, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.080134284682572e-05, |
| "kl": 4.722892113029957, |
| "learning_rate": 4.799549191642437e-05, |
| "loss": -0.0093, |
| "num_tokens": 7565841.0, |
| "reward": 0.8027622699737549, |
| "reward_std": 0.27080461382865906, |
| "rewards/rollout_reward_func/mean": 0.8027622699737549, |
| "rewards/rollout_reward_func/std": 0.42055216431617737, |
| "sampling/importance_sampling_ratio/max": 1.0000193119049072, |
| "sampling/importance_sampling_ratio/mean": 0.9999892711639404, |
| "sampling/importance_sampling_ratio/min": 0.9997215867042542, |
| "sampling/sampling_logp_difference/max": 0.0002580939617473632, |
| "sampling/sampling_logp_difference/mean": 4.462499873625347e-06, |
| "step": 202, |
| "step_time": 10.46230581699956 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 295.09375, |
| "completions/mean_terminated_length": 295.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.879004566608728e-05, |
| "epoch": 0.00406, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.695196548709646e-05, |
| "kl": 4.424702264368534, |
| "learning_rate": 4.7586556639601154e-05, |
| "loss": -0.0063, |
| "num_tokens": 7605323.0, |
| "reward": 0.8055915832519531, |
| "reward_std": 0.3435424566268921, |
| "rewards/rollout_reward_func/mean": 0.8055915832519531, |
| "rewards/rollout_reward_func/std": 0.4181799292564392, |
| "sampling/importance_sampling_ratio/max": 1.0000042915344238, |
| "sampling/importance_sampling_ratio/mean": 0.9999926090240479, |
| "sampling/importance_sampling_ratio/min": 0.9999536871910095, |
| "sampling/sampling_logp_difference/max": 4.494252061704174e-05, |
| "sampling/sampling_logp_difference/mean": 2.7982086976408027e-06, |
| "step": 203, |
| "step_time": 11.714568361000147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 238.375, |
| "completions/mean_terminated_length": 238.375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.828146078419195e-05, |
| "epoch": 0.00408, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.576986611937173e-05, |
| "kl": 4.861797966063023, |
| "learning_rate": 4.71797173124022e-05, |
| "loss": 0.002, |
| "num_tokens": 7642942.0, |
| "reward": 0.7380915284156799, |
| "reward_std": 0.4791000783443451, |
| "rewards/rollout_reward_func/mean": 0.7380915284156799, |
| "rewards/rollout_reward_func/std": 0.45896562933921814, |
| "sampling/importance_sampling_ratio/max": 1.0000057220458984, |
| "sampling/importance_sampling_ratio/mean": 0.9999947547912598, |
| "sampling/importance_sampling_ratio/min": 0.9999723434448242, |
| "sampling/sampling_logp_difference/max": 3.445236143306829e-05, |
| "sampling/sampling_logp_difference/mean": 2.5225040189980064e-06, |
| "step": 204, |
| "step_time": 10.50498939000181 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 218.1875, |
| "completions/mean_terminated_length": 218.1875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.999684122708459e-05, |
| "epoch": 0.0041, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3910033885622397e-05, |
| "kl": 5.3215382900089025, |
| "learning_rate": 4.67750311124165e-05, |
| "loss": 0.0004, |
| "num_tokens": 7679934.0, |
| "reward": 0.6737499833106995, |
| "reward_std": 0.381944477558136, |
| "rewards/rollout_reward_func/mean": 0.6737499833106995, |
| "rewards/rollout_reward_func/std": 0.477099746465683, |
| "sampling/importance_sampling_ratio/max": 1.000003457069397, |
| "sampling/importance_sampling_ratio/mean": 0.9999958276748657, |
| "sampling/importance_sampling_ratio/min": 0.9999797344207764, |
| "sampling/sampling_logp_difference/max": 2.3484337361878715e-05, |
| "sampling/sampling_logp_difference/mean": 1.814927827581414e-06, |
| "step": 205, |
| "step_time": 11.114921720999519 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 248.03125, |
| "completions/mean_terminated_length": 248.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.976049931812668e-05, |
| "epoch": 0.00412, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.381532613071613e-05, |
| "kl": 4.561372932046652, |
| "learning_rate": 4.63725549146305e-05, |
| "loss": 0.0129, |
| "num_tokens": 7718197.0, |
| "reward": 0.8037499785423279, |
| "reward_std": 0.43239593505859375, |
| "rewards/rollout_reward_func/mean": 0.8037499785423279, |
| "rewards/rollout_reward_func/std": 0.4172239303588867, |
| "sampling/importance_sampling_ratio/max": 1.0000351667404175, |
| "sampling/importance_sampling_ratio/mean": 0.9999947547912598, |
| "sampling/importance_sampling_ratio/min": 0.9998741149902344, |
| "sampling/sampling_logp_difference/max": 0.0001127816503867507, |
| "sampling/sampling_logp_difference/mean": 2.9079683372401632e-06, |
| "step": 206, |
| "step_time": 10.3316515169995 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 218.03125, |
| "completions/mean_terminated_length": 218.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.278466614162426e-05, |
| "epoch": 0.00414, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.739985888591036e-05, |
| "kl": 4.738906025886536, |
| "learning_rate": 4.597234528343477e-05, |
| "loss": -0.0071, |
| "num_tokens": 7754920.0, |
| "reward": 0.7693415880203247, |
| "reward_std": 0.3719741702079773, |
| "rewards/rollout_reward_func/mean": 0.7693415880203247, |
| "rewards/rollout_reward_func/std": 0.44077280163764954, |
| "sampling/importance_sampling_ratio/max": 1.0000015497207642, |
| "sampling/importance_sampling_ratio/mean": 0.9999948740005493, |
| "sampling/importance_sampling_ratio/min": 0.9999737739562988, |
| "sampling/sampling_logp_difference/max": 2.1100262529216707e-05, |
| "sampling/sampling_logp_difference/mean": 2.0317766029620543e-06, |
| "step": 207, |
| "step_time": 11.240013908000037 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 214.9375, |
| "completions/mean_terminated_length": 214.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.187105017829708e-05, |
| "epoch": 0.00416, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3428838428808376e-05, |
| "kl": 4.688595987856388, |
| "learning_rate": 4.557445846467452e-05, |
| "loss": -0.0029, |
| "num_tokens": 7791160.0, |
| "reward": 0.7068415284156799, |
| "reward_std": 0.5244907140731812, |
| "rewards/rollout_reward_func/mean": 0.7068415284156799, |
| "rewards/rollout_reward_func/std": 0.5362996459007263, |
| "sampling/importance_sampling_ratio/max": 1.0000072717666626, |
| "sampling/importance_sampling_ratio/mean": 0.9999967813491821, |
| "sampling/importance_sampling_ratio/min": 0.9999647736549377, |
| "sampling/sampling_logp_difference/max": 3.254435432609171e-05, |
| "sampling/sampling_logp_difference/mean": 1.924766593219829e-06, |
| "step": 208, |
| "step_time": 10.564521855001658 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 270.09375, |
| "completions/mean_terminated_length": 270.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.67929339673401e-05, |
| "epoch": 0.00418, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7854244763148017e-05, |
| "kl": 5.123309299349785, |
| "learning_rate": 4.517895037774461e-05, |
| "loss": -0.0006, |
| "num_tokens": 7830838.0, |
| "reward": 0.7405915260314941, |
| "reward_std": 0.3506433665752411, |
| "rewards/rollout_reward_func/mean": 0.7405915260314941, |
| "rewards/rollout_reward_func/std": 0.4569876194000244, |
| "sampling/importance_sampling_ratio/max": 1.000009298324585, |
| "sampling/importance_sampling_ratio/mean": 0.9999930262565613, |
| "sampling/importance_sampling_ratio/min": 0.9999588131904602, |
| "sampling/sampling_logp_difference/max": 5.2810326451435685e-05, |
| "sampling/sampling_logp_difference/mean": 3.302905042801285e-06, |
| "step": 209, |
| "step_time": 11.46979012699967 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 250.09375, |
| "completions/mean_terminated_length": 250.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.607456470599573e-05, |
| "epoch": 0.0042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.8605783882085234e-05, |
| "kl": 5.365914463996887, |
| "learning_rate": 4.478587660773065e-05, |
| "loss": 0.0039, |
| "num_tokens": 7869530.0, |
| "reward": 0.7415122985839844, |
| "reward_std": 0.4781748354434967, |
| "rewards/rollout_reward_func/mean": 0.7415122985839844, |
| "rewards/rollout_reward_func/std": 0.45829862356185913, |
| "sampling/importance_sampling_ratio/max": 1.0000122785568237, |
| "sampling/importance_sampling_ratio/mean": 0.999998152256012, |
| "sampling/importance_sampling_ratio/min": 0.9999728202819824, |
| "sampling/sampling_logp_difference/max": 2.2650128812529147e-05, |
| "sampling/sampling_logp_difference/mean": 2.2131084733700845e-06, |
| "step": 210, |
| "step_time": 10.400364204000653 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 196.0625, |
| "completions/mean_terminated_length": 196.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.971475469834786e-05, |
| "epoch": 0.00422, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.156006303266622e-05, |
| "kl": 5.336601361632347, |
| "learning_rate": 4.439529239759708e-05, |
| "loss": 0.01, |
| "num_tokens": 7905392.0, |
| "reward": 0.6737499833106995, |
| "reward_std": 0.4696081280708313, |
| "rewards/rollout_reward_func/mean": 0.6737499833106995, |
| "rewards/rollout_reward_func/std": 0.48247846961021423, |
| "sampling/importance_sampling_ratio/max": 1.0000094175338745, |
| "sampling/importance_sampling_ratio/mean": 0.999995768070221, |
| "sampling/importance_sampling_ratio/min": 0.9999589920043945, |
| "sampling/sampling_logp_difference/max": 4.8401976528111845e-05, |
| "sampling/sampling_logp_difference/mean": 2.4310015760420356e-06, |
| "step": 211, |
| "step_time": 11.25777174300083 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 215.5625, |
| "completions/mean_terminated_length": 215.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.468110858610544e-05, |
| "epoch": 0.00424, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.0994404116645455e-05, |
| "kl": 5.020519495010376, |
| "learning_rate": 4.4007252640423116e-05, |
| "loss": -0.0123, |
| "num_tokens": 7943157.0, |
| "reward": 0.7718415260314941, |
| "reward_std": 0.4587705433368683, |
| "rewards/rollout_reward_func/mean": 0.7718415260314941, |
| "rewards/rollout_reward_func/std": 0.4480281174182892, |
| "sampling/importance_sampling_ratio/max": 1.0000126361846924, |
| "sampling/importance_sampling_ratio/mean": 0.9999984502792358, |
| "sampling/importance_sampling_ratio/min": 0.999977707862854, |
| "sampling/sampling_logp_difference/max": 2.8371961889206432e-05, |
| "sampling/sampling_logp_difference/mean": 2.205347755079856e-06, |
| "step": 212, |
| "step_time": 10.416242442000112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 250.625, |
| "completions/mean_terminated_length": 250.625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.606689459294103e-05, |
| "epoch": 0.00426, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3088554624118842e-05, |
| "kl": 4.440442498773336, |
| "learning_rate": 4.3621811871688186e-05, |
| "loss": 0.0017, |
| "num_tokens": 7980729.0, |
| "reward": 0.6758539080619812, |
| "reward_std": 0.4955633580684662, |
| "rewards/rollout_reward_func/mean": 0.6758539080619812, |
| "rewards/rollout_reward_func/std": 0.5430920124053955, |
| "sampling/importance_sampling_ratio/max": 1.0000171661376953, |
| "sampling/importance_sampling_ratio/mean": 0.9999970197677612, |
| "sampling/importance_sampling_ratio/min": 0.9999763369560242, |
| "sampling/sampling_logp_difference/max": 1.823993807192892e-05, |
| "sampling/sampling_logp_difference/mean": 2.135067461495055e-06, |
| "step": 213, |
| "step_time": 11.03314153600013 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 130.3125, |
| "completions/mean_terminated_length": 130.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.7448618119005914e-05, |
| "epoch": 0.00428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5431114661623724e-05, |
| "kl": 4.4410514533519745, |
| "learning_rate": 4.323902426160737e-05, |
| "loss": 0.005, |
| "num_tokens": 8013047.0, |
| "reward": 0.8249331116676331, |
| "reward_std": 0.40843328833580017, |
| "rewards/rollout_reward_func/mean": 0.8249331116676331, |
| "rewards/rollout_reward_func/std": 0.3973701298236847, |
| "sampling/importance_sampling_ratio/max": 1.0000035762786865, |
| "sampling/importance_sampling_ratio/mean": 0.9999982118606567, |
| "sampling/importance_sampling_ratio/min": 0.9999793171882629, |
| "sampling/sampling_logp_difference/max": 2.1219655536697246e-05, |
| "sampling/sampling_logp_difference/mean": 1.4284883036452811e-06, |
| "step": 214, |
| "step_time": 10.193582248000894 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 221.90625, |
| "completions/mean_terminated_length": 221.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.51787586636965e-05, |
| "epoch": 0.0043, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6926014470518567e-05, |
| "kl": 3.983197882771492, |
| "learning_rate": 4.285894360751829e-05, |
| "loss": -0.0012, |
| "num_tokens": 8048381.0, |
| "reward": 0.8018415570259094, |
| "reward_std": 0.4044482707977295, |
| "rewards/rollout_reward_func/mean": 0.8018415570259094, |
| "rewards/rollout_reward_func/std": 0.4863073229789734, |
| "sampling/importance_sampling_ratio/max": 1.0000040531158447, |
| "sampling/importance_sampling_ratio/mean": 0.9999951124191284, |
| "sampling/importance_sampling_ratio/min": 0.9999686479568481, |
| "sampling/sampling_logp_difference/max": 3.111379555775784e-05, |
| "sampling/sampling_logp_difference/mean": 2.425235834380146e-06, |
| "step": 215, |
| "step_time": 10.813712388001022 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 284.3125, |
| "completions/mean_terminated_length": 284.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.533502838758977e-05, |
| "epoch": 0.00432, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.587458483409137e-05, |
| "kl": 4.82944827247411, |
| "learning_rate": 4.2481623326320364e-05, |
| "loss": 0.0117, |
| "num_tokens": 8087461.0, |
| "reward": 0.5843415260314941, |
| "reward_std": 0.5055504441261292, |
| "rewards/rollout_reward_func/mean": 0.5843415260314941, |
| "rewards/rollout_reward_func/std": 0.5016008019447327, |
| "sampling/importance_sampling_ratio/max": 1.0000051259994507, |
| "sampling/importance_sampling_ratio/mean": 0.9999934434890747, |
| "sampling/importance_sampling_ratio/min": 0.999961793422699, |
| "sampling/sampling_logp_difference/max": 3.683606337290257e-05, |
| "sampling/sampling_logp_difference/mean": 3.0676592359668575e-06, |
| "step": 216, |
| "step_time": 10.44111612099914 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 235.59375, |
| "completions/mean_terminated_length": 235.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 0.00010566368675313242, |
| "epoch": 0.00434, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.359190618037246e-05, |
| "kl": 5.429783314466476, |
| "learning_rate": 4.210711644696756e-05, |
| "loss": -0.0041, |
| "num_tokens": 8126489.0, |
| "reward": 0.7111831307411194, |
| "reward_std": 0.525292158126831, |
| "rewards/rollout_reward_func/mean": 0.7111831307411194, |
| "rewards/rollout_reward_func/std": 0.5364737510681152, |
| "sampling/importance_sampling_ratio/max": 1.0000098943710327, |
| "sampling/importance_sampling_ratio/mean": 0.9999901056289673, |
| "sampling/importance_sampling_ratio/min": 0.9998805522918701, |
| "sampling/sampling_logp_difference/max": 9.84744110610336e-05, |
| "sampling/sampling_logp_difference/mean": 3.6738169910677243e-06, |
| "step": 217, |
| "step_time": 11.342535938998935 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 299.75, |
| "completions/mean_terminated_length": 299.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.36404399251478e-05, |
| "epoch": 0.00436, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6273870389559306e-05, |
| "kl": 4.871564254164696, |
| "learning_rate": 4.1735475603015697e-05, |
| "loss": 0.0077, |
| "num_tokens": 8166512.0, |
| "reward": 0.6790122985839844, |
| "reward_std": 0.4945233464241028, |
| "rewards/rollout_reward_func/mean": 0.6790122985839844, |
| "rewards/rollout_reward_func/std": 0.4823760390281677, |
| "sampling/importance_sampling_ratio/max": 1.0000158548355103, |
| "sampling/importance_sampling_ratio/mean": 0.999993085861206, |
| "sampling/importance_sampling_ratio/min": 0.9999461770057678, |
| "sampling/sampling_logp_difference/max": 3.6239842302165926e-05, |
| "sampling/sampling_logp_difference/mean": 2.806388010867522e-06, |
| "step": 218, |
| "step_time": 10.86233605400048 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 252.75, |
| "completions/mean_terminated_length": 252.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.950771052695927e-05, |
| "epoch": 0.00438, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0681065254611894e-05, |
| "kl": 4.676598273217678, |
| "learning_rate": 4.136675302522517e-05, |
| "loss": -0.0047, |
| "num_tokens": 8204946.0, |
| "reward": 0.6799330711364746, |
| "reward_std": 0.5543248653411865, |
| "rewards/rollout_reward_func/mean": 0.6799330711364746, |
| "rewards/rollout_reward_func/std": 0.5487261414527893, |
| "sampling/importance_sampling_ratio/max": 1.0000182390213013, |
| "sampling/importance_sampling_ratio/mean": 0.9999966025352478, |
| "sampling/importance_sampling_ratio/min": 0.9999750852584839, |
| "sampling/sampling_logp_difference/max": 2.6107110898010433e-05, |
| "sampling/sampling_logp_difference/mean": 2.1445025595312472e-06, |
| "step": 219, |
| "step_time": 10.943576970999402 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 247.71875, |
| "completions/mean_terminated_length": 247.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.87366666941125e-05, |
| "epoch": 0.0044, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.677894761087373e-05, |
| "kl": 5.05719730257988, |
| "learning_rate": 4.1001000534220484e-05, |
| "loss": 0.0043, |
| "num_tokens": 8243453.0, |
| "reward": 0.4586831331253052, |
| "reward_std": 0.6123698949813843, |
| "rewards/rollout_reward_func/mean": 0.4586831331253052, |
| "rewards/rollout_reward_func/std": 0.616819441318512, |
| "sampling/importance_sampling_ratio/max": 1.0000048875808716, |
| "sampling/importance_sampling_ratio/mean": 0.999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999604821205139, |
| "sampling/sampling_logp_difference/max": 3.4690252505242825e-05, |
| "sampling/sampling_logp_difference/mean": 2.33208902500337e-06, |
| "step": 220, |
| "step_time": 11.04217300100072 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 261.25, |
| "completions/mean_terminated_length": 261.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.492780096323258e-05, |
| "epoch": 0.00442, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8693634046940133e-05, |
| "kl": 5.244952633976936, |
| "learning_rate": 4.063826953320731e-05, |
| "loss": 0.0021, |
| "num_tokens": 8282852.0, |
| "reward": 0.6468415260314941, |
| "reward_std": 0.510591983795166, |
| "rewards/rollout_reward_func/mean": 0.6468415260314941, |
| "rewards/rollout_reward_func/std": 0.49439889192581177, |
| "sampling/importance_sampling_ratio/max": 1.0000091791152954, |
| "sampling/importance_sampling_ratio/mean": 0.9999966621398926, |
| "sampling/importance_sampling_ratio/min": 0.9999747276306152, |
| "sampling/sampling_logp_difference/max": 2.1458068658830598e-05, |
| "sampling/sampling_logp_difference/mean": 1.838789557950804e-06, |
| "step": 221, |
| "step_time": 11.379838227001073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 231.0, |
| "completions/mean_terminated_length": 231.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.502011356701587e-05, |
| "epoch": 0.00444, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6576681193546392e-05, |
| "kl": 5.2187200747430325, |
| "learning_rate": 4.027861100074818e-05, |
| "loss": 0.0005, |
| "num_tokens": 8320490.0, |
| "reward": 0.642762303352356, |
| "reward_std": 0.501641035079956, |
| "rewards/rollout_reward_func/mean": 0.642762303352356, |
| "rewards/rollout_reward_func/std": 0.4938026964664459, |
| "sampling/importance_sampling_ratio/max": 1.0000317096710205, |
| "sampling/importance_sampling_ratio/mean": 0.9999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999289512634277, |
| "sampling/sampling_logp_difference/max": 6.830880738561973e-05, |
| "sampling/sampling_logp_difference/mean": 3.455873411439825e-06, |
| "step": 222, |
| "step_time": 10.861577565999141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 249.0, |
| "completions/mean_terminated_length": 249.0, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.842954086039299e-05, |
| "epoch": 0.00446, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4798371012147982e-05, |
| "kl": 4.837060697376728, |
| "learning_rate": 3.9922075483597984e-05, |
| "loss": 0.0038, |
| "num_tokens": 8358120.0, |
| "reward": 0.7390123605728149, |
| "reward_std": 0.4545978307723999, |
| "rewards/rollout_reward_func/mean": 0.7390123605728149, |
| "rewards/rollout_reward_func/std": 0.4567206799983978, |
| "sampling/importance_sampling_ratio/max": 1.0000172853469849, |
| "sampling/importance_sampling_ratio/mean": 0.9999964833259583, |
| "sampling/importance_sampling_ratio/min": 0.9999744892120361, |
| "sampling/sampling_logp_difference/max": 3.647844278020784e-05, |
| "sampling/sampling_logp_difference/mean": 2.729870857365313e-06, |
| "step": 223, |
| "step_time": 11.067154321999169 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 220.34375, |
| "completions/mean_terminated_length": 220.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.793645808045312e-05, |
| "epoch": 0.00448, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.114427843480371e-05, |
| "kl": 4.609769219532609, |
| "learning_rate": 3.956871308960006e-05, |
| "loss": -0.0024, |
| "num_tokens": 8395318.0, |
| "reward": 0.7071707844734192, |
| "reward_std": 0.37846216559410095, |
| "rewards/rollout_reward_func/mean": 0.7071707844734192, |
| "rewards/rollout_reward_func/std": 0.47386202216148376, |
| "sampling/importance_sampling_ratio/max": 1.0000035762786865, |
| "sampling/importance_sampling_ratio/mean": 0.9999899864196777, |
| "sampling/importance_sampling_ratio/min": 0.9998646378517151, |
| "sampling/sampling_logp_difference/max": 0.0001251772919204086, |
| "sampling/sampling_logp_difference/mean": 2.8750114324793685e-06, |
| "step": 224, |
| "step_time": 10.648732830999961 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 194.875, |
| "completions/mean_terminated_length": 194.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.4564909845566945e-05, |
| "epoch": 0.0045, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.99508774332935e-05, |
| "kl": 4.295398809015751, |
| "learning_rate": 3.921857348064393e-05, |
| "loss": -0.0028, |
| "num_tokens": 8429759.0, |
| "reward": 0.735920786857605, |
| "reward_std": 0.3834129571914673, |
| "rewards/rollout_reward_func/mean": 0.735920786857605, |
| "rewards/rollout_reward_func/std": 0.45476430654525757, |
| "sampling/importance_sampling_ratio/max": 1.0000051259994507, |
| "sampling/importance_sampling_ratio/mean": 0.9999962449073792, |
| "sampling/importance_sampling_ratio/min": 0.99997878074646, |
| "sampling/sampling_logp_difference/max": 2.408070577075705e-05, |
| "sampling/sampling_logp_difference/mean": 2.0818633856833912e-06, |
| "step": 225, |
| "step_time": 10.618216946999382 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 213.4375, |
| "completions/mean_terminated_length": 213.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.468187297059558e-05, |
| "epoch": 0.00452, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.314897210453637e-05, |
| "kl": 4.907812386751175, |
| "learning_rate": 3.8871705865685835e-05, |
| "loss": -0.0071, |
| "num_tokens": 8466101.0, |
| "reward": 0.7993415594100952, |
| "reward_std": 0.35336440801620483, |
| "rewards/rollout_reward_func/mean": 0.7993415594100952, |
| "rewards/rollout_reward_func/std": 0.42098554968833923, |
| "sampling/importance_sampling_ratio/max": 1.000029444694519, |
| "sampling/importance_sampling_ratio/mean": 0.9999977350234985, |
| "sampling/importance_sampling_ratio/min": 0.9999707937240601, |
| "sampling/sampling_logp_difference/max": 3.9220289181685075e-05, |
| "sampling/sampling_logp_difference/mean": 2.529017365304753e-06, |
| "step": 226, |
| "step_time": 10.741557626999565 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 223.03125, |
| "completions/mean_terminated_length": 223.03125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.120953486923099e-05, |
| "epoch": 0.00454, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.4079289839137346e-05, |
| "kl": 4.519463703036308, |
| "learning_rate": 3.852815899383288e-05, |
| "loss": -0.0039, |
| "num_tokens": 8502768.0, |
| "reward": 0.642762303352356, |
| "reward_std": 0.43429845571517944, |
| "rewards/rollout_reward_func/mean": 0.642762303352356, |
| "rewards/rollout_reward_func/std": 0.55186527967453, |
| "sampling/importance_sampling_ratio/max": 1.000009536743164, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.9999439120292664, |
| "sampling/sampling_logp_difference/max": 6.046371709089726e-05, |
| "sampling/sampling_logp_difference/mean": 2.2524543510371586e-06, |
| "step": 227, |
| "step_time": 10.909075683000992 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 238.4375, |
| "completions/mean_terminated_length": 238.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.815549295424717e-05, |
| "epoch": 0.00456, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.8888143788208254e-05, |
| "kl": 5.122124448418617, |
| "learning_rate": 3.81879811474917e-05, |
| "loss": 0.0128, |
| "num_tokens": 8540887.0, |
| "reward": 0.6134207248687744, |
| "reward_std": 0.4781406819820404, |
| "rewards/rollout_reward_func/mean": 0.6134207248687744, |
| "rewards/rollout_reward_func/std": 0.49850568175315857, |
| "sampling/importance_sampling_ratio/max": 1.0000128746032715, |
| "sampling/importance_sampling_ratio/mean": 0.9999950528144836, |
| "sampling/importance_sampling_ratio/min": 0.9999350309371948, |
| "sampling/sampling_logp_difference/max": 5.8532186812954023e-05, |
| "sampling/sampling_logp_difference/mean": 2.7186433726456016e-06, |
| "step": 228, |
| "step_time": 10.663067908001267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 255.21875, |
| "completions/mean_terminated_length": 255.21875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.427226666185561e-05, |
| "epoch": 0.00458, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2373380488716066e-05, |
| "kl": 4.516269013285637, |
| "learning_rate": 3.785122013558288e-05, |
| "loss": 0.0073, |
| "num_tokens": 8578873.0, |
| "reward": 0.7715123891830444, |
| "reward_std": 0.4250052869319916, |
| "rewards/rollout_reward_func/mean": 0.7715123891830444, |
| "rewards/rollout_reward_func/std": 0.44125890731811523, |
| "sampling/importance_sampling_ratio/max": 1.0000226497650146, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.9999725818634033, |
| "sampling/sampling_logp_difference/max": 2.968106127809733e-05, |
| "sampling/sampling_logp_difference/mean": 2.688705535547342e-06, |
| "step": 229, |
| "step_time": 10.989937573000134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 261.40625, |
| "completions/mean_terminated_length": 261.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.805091253487717e-05, |
| "epoch": 0.0046, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7097617930849083e-05, |
| "kl": 4.258571729063988, |
| "learning_rate": 3.751792328682183e-05, |
| "loss": 0.0047, |
| "num_tokens": 8617352.0, |
| "reward": 0.8043415546417236, |
| "reward_std": 0.33875972032546997, |
| "rewards/rollout_reward_func/mean": 0.8043415546417236, |
| "rewards/rollout_reward_func/std": 0.41523313522338867, |
| "sampling/importance_sampling_ratio/max": 1.0000033378601074, |
| "sampling/importance_sampling_ratio/mean": 0.9999909400939941, |
| "sampling/importance_sampling_ratio/min": 0.9999553561210632, |
| "sampling/sampling_logp_difference/max": 3.457104321569204e-05, |
| "sampling/sampling_logp_difference/mean": 2.853767227861681e-06, |
| "step": 230, |
| "step_time": 10.89791804999868 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 185.6875, |
| "completions/mean_terminated_length": 185.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.286828362396136e-05, |
| "epoch": 0.00462, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.8086083804955706e-05, |
| "kl": 5.058218787424266, |
| "learning_rate": 3.718813744306712e-05, |
| "loss": -0.0022, |
| "num_tokens": 8653215.0, |
| "reward": 0.7049999833106995, |
| "reward_std": 0.49435633420944214, |
| "rewards/rollout_reward_func/mean": 0.7049999833106995, |
| "rewards/rollout_reward_func/std": 0.4751094877719879, |
| "sampling/importance_sampling_ratio/max": 1.0000234842300415, |
| "sampling/importance_sampling_ratio/mean": 0.9999891519546509, |
| "sampling/importance_sampling_ratio/min": 0.9998546838760376, |
| "sampling/sampling_logp_difference/max": 0.00014317341265268624, |
| "sampling/sampling_logp_difference/mean": 3.555632019924815e-06, |
| "step": 231, |
| "step_time": 10.807909351999115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 250.9375, |
| "completions/mean_terminated_length": 250.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.626304455106037e-05, |
| "epoch": 0.00464, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0001353168481728062, |
| "kl": 5.0585897117853165, |
| "learning_rate": 3.686190895273733e-05, |
| "loss": 0.0117, |
| "num_tokens": 8691158.0, |
| "reward": 0.7077623605728149, |
| "reward_std": 0.44103604555130005, |
| "rewards/rollout_reward_func/mean": 0.7077623605728149, |
| "rewards/rollout_reward_func/std": 0.468770295381546, |
| "sampling/importance_sampling_ratio/max": 1.0000251531600952, |
| "sampling/importance_sampling_ratio/mean": 0.9999915957450867, |
| "sampling/importance_sampling_ratio/min": 0.999932050704956, |
| "sampling/sampling_logp_difference/max": 7.951720181154087e-05, |
| "sampling/sampling_logp_difference/mean": 3.854770056932466e-06, |
| "step": 232, |
| "step_time": 11.1020100679998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 227.96875, |
| "completions/mean_terminated_length": 227.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.657698354535114e-05, |
| "epoch": 0.00466, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.930617771809921e-05, |
| "kl": 5.246766164898872, |
| "learning_rate": 3.653928366429717e-05, |
| "loss": -0.0081, |
| "num_tokens": 8729310.0, |
| "reward": 0.6768415570259094, |
| "reward_std": 0.5108159184455872, |
| "rewards/rollout_reward_func/mean": 0.6768415570259094, |
| "rewards/rollout_reward_func/std": 0.49002739787101746, |
| "sampling/importance_sampling_ratio/max": 1.0000180006027222, |
| "sampling/importance_sampling_ratio/mean": 0.9999940395355225, |
| "sampling/importance_sampling_ratio/min": 0.9999337792396545, |
| "sampling/sampling_logp_difference/max": 4.6849683712935075e-05, |
| "sampling/sampling_logp_difference/mean": 2.78436164080631e-06, |
| "step": 233, |
| "step_time": 11.074858544001472 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 248.59375, |
| "completions/mean_terminated_length": 248.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.97915432571017e-05, |
| "epoch": 0.00468, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8650109268492088e-05, |
| "kl": 4.891276657581329, |
| "learning_rate": 3.6220306919813934e-05, |
| "loss": 0.0071, |
| "num_tokens": 8767949.0, |
| "reward": 0.5517747402191162, |
| "reward_std": 0.501990556716919, |
| "rewards/rollout_reward_func/mean": 0.5517747402191162, |
| "rewards/rollout_reward_func/std": 0.5072534680366516, |
| "sampling/importance_sampling_ratio/max": 1.0000141859054565, |
| "sampling/importance_sampling_ratio/mean": 0.9999977946281433, |
| "sampling/importance_sampling_ratio/min": 0.9999755024909973, |
| "sampling/sampling_logp_difference/max": 3.135273800580762e-05, |
| "sampling/sampling_logp_difference/mean": 2.4732930796744768e-06, |
| "step": 234, |
| "step_time": 11.137182370001028 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 224.40625, |
| "completions/mean_terminated_length": 224.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.745976586193137e-05, |
| "epoch": 0.0047, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0286239052657038e-05, |
| "kl": 5.41153359413147, |
| "learning_rate": 3.590502354858501e-05, |
| "loss": -0.0122, |
| "num_tokens": 8805306.0, |
| "reward": 0.6440123319625854, |
| "reward_std": 0.4558776617050171, |
| "rewards/rollout_reward_func/mean": 0.6440123319625854, |
| "rewards/rollout_reward_func/std": 0.5544620156288147, |
| "sampling/importance_sampling_ratio/max": 1.0000075101852417, |
| "sampling/importance_sampling_ratio/mean": 0.9999954104423523, |
| "sampling/importance_sampling_ratio/min": 0.9999374151229858, |
| "sampling/sampling_logp_difference/max": 5.960933049209416e-05, |
| "sampling/sampling_logp_difference/mean": 2.1439589090732625e-06, |
| "step": 235, |
| "step_time": 11.217590768001173 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 179.5, |
| "completions/mean_terminated_length": 179.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.373513450341761e-05, |
| "epoch": 0.00472, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1967592803994194e-05, |
| "kl": 5.293322175741196, |
| "learning_rate": 3.559347786083758e-05, |
| "loss": -0.0122, |
| "num_tokens": 8841204.0, |
| "reward": 0.579012393951416, |
| "reward_std": 0.5713227987289429, |
| "rewards/rollout_reward_func/mean": 0.579012393951416, |
| "rewards/rollout_reward_func/std": 0.5699735283851624, |
| "sampling/importance_sampling_ratio/max": 1.0000178813934326, |
| "sampling/importance_sampling_ratio/mean": 0.9999982118606567, |
| "sampling/importance_sampling_ratio/min": 0.999977171421051, |
| "sampling/sampling_logp_difference/max": 3.111419209744781e-05, |
| "sampling/sampling_logp_difference/mean": 2.388872417213861e-06, |
| "step": 236, |
| "step_time": 11.031479144000059 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 220.28125, |
| "completions/mean_terminated_length": 220.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.39780113640154e-05, |
| "epoch": 0.00474, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.93100511044031e-05, |
| "kl": 4.748119935393333, |
| "learning_rate": 3.528571364150124e-05, |
| "loss": -0.0089, |
| "num_tokens": 8879050.0, |
| "reward": 0.7405247092247009, |
| "reward_std": 0.43948662281036377, |
| "rewards/rollout_reward_func/mean": 0.7405247092247009, |
| "rewards/rollout_reward_func/std": 0.5215878486633301, |
| "sampling/importance_sampling_ratio/max": 1.0000333786010742, |
| "sampling/importance_sampling_ratio/mean": 0.9999979734420776, |
| "sampling/importance_sampling_ratio/min": 0.9999390840530396, |
| "sampling/sampling_logp_difference/max": 4.208123209537007e-05, |
| "sampling/sampling_logp_difference/mean": 3.0910557597962907e-06, |
| "step": 237, |
| "step_time": 11.245172328000535 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 221.4375, |
| "completions/mean_terminated_length": 221.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.091031755772747e-05, |
| "epoch": 0.00476, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5538744264631532e-05, |
| "kl": 4.544229738414288, |
| "learning_rate": 3.4981774144054344e-05, |
| "loss": 0.0045, |
| "num_tokens": 8915726.0, |
| "reward": 0.6730915904045105, |
| "reward_std": 0.4944014549255371, |
| "rewards/rollout_reward_func/mean": 0.6730915904045105, |
| "rewards/rollout_reward_func/std": 0.4819692075252533, |
| "sampling/importance_sampling_ratio/max": 1.0000104904174805, |
| "sampling/importance_sampling_ratio/mean": 0.9999960064888, |
| "sampling/importance_sampling_ratio/min": 0.9999375939369202, |
| "sampling/sampling_logp_difference/max": 3.111407204414718e-05, |
| "sampling/sampling_logp_difference/mean": 2.156180244128336e-06, |
| "step": 238, |
| "step_time": 11.206029839998337 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 220.9375, |
| "completions/mean_terminated_length": 220.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.803557426986572e-05, |
| "epoch": 0.00478, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.782759449677542e-05, |
| "kl": 5.248299717903137, |
| "learning_rate": 3.46817020844452e-05, |
| "loss": 0.0036, |
| "num_tokens": 8952836.0, |
| "reward": 0.5790123343467712, |
| "reward_std": 0.5091844797134399, |
| "rewards/rollout_reward_func/mean": 0.5790123343467712, |
| "rewards/rollout_reward_func/std": 0.5044925808906555, |
| "sampling/importance_sampling_ratio/max": 1.000012755393982, |
| "sampling/importance_sampling_ratio/mean": 0.9999973773956299, |
| "sampling/importance_sampling_ratio/min": 0.9999673366546631, |
| "sampling/sampling_logp_difference/max": 2.6941650503431447e-05, |
| "sampling/sampling_logp_difference/mean": 1.8537682535679778e-06, |
| "step": 239, |
| "step_time": 11.89933117200053 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 200.9375, |
| "completions/mean_terminated_length": 200.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.659828022748115e-05, |
| "epoch": 0.0048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.700514520867728e-05, |
| "kl": 4.181779149919748, |
| "learning_rate": 3.438553963508866e-05, |
| "loss": -0.0069, |
| "num_tokens": 8988037.0, |
| "reward": 0.7680915594100952, |
| "reward_std": 0.3802240490913391, |
| "rewards/rollout_reward_func/mean": 0.7680915594100952, |
| "rewards/rollout_reward_func/std": 0.43926358222961426, |
| "sampling/importance_sampling_ratio/max": 1.0000133514404297, |
| "sampling/importance_sampling_ratio/mean": 0.9999954700469971, |
| "sampling/importance_sampling_ratio/min": 0.9999701976776123, |
| "sampling/sampling_logp_difference/max": 3.194832243025303e-05, |
| "sampling/sampling_logp_difference/mean": 2.628037236718228e-06, |
| "step": 240, |
| "step_time": 10.429010280000512 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 190.59375, |
| "completions/mean_terminated_length": 190.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.779884174079598e-05, |
| "epoch": 0.00482, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.3270024030352943e-05, |
| "kl": 4.673696734011173, |
| "learning_rate": 3.409332841893925e-05, |
| "loss": 0.0024, |
| "num_tokens": 9023151.0, |
| "reward": 0.735920786857605, |
| "reward_std": 0.4644585847854614, |
| "rewards/rollout_reward_func/mean": 0.735920786857605, |
| "rewards/rollout_reward_func/std": 0.4568495452404022, |
| "sampling/importance_sampling_ratio/max": 1.0000066757202148, |
| "sampling/importance_sampling_ratio/mean": 0.9999940991401672, |
| "sampling/importance_sampling_ratio/min": 0.9998875260353088, |
| "sampling/sampling_logp_difference/max": 7.773219840601087e-05, |
| "sampling/sampling_logp_difference/mean": 2.4395403670496307e-06, |
| "step": 241, |
| "step_time": 12.248209440999744 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 276.96875, |
| "completions/mean_terminated_length": 276.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.92703500422931e-05, |
| "epoch": 0.00484, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.4162562112906016e-05, |
| "kl": 4.796989947557449, |
| "learning_rate": 3.3805109503641356e-05, |
| "loss": 0.0094, |
| "num_tokens": 9062474.0, |
| "reward": 0.7093415260314941, |
| "reward_std": 0.4662177562713623, |
| "rewards/rollout_reward_func/mean": 0.7093415260314941, |
| "rewards/rollout_reward_func/std": 0.4678354859352112, |
| "sampling/importance_sampling_ratio/max": 1.000012993812561, |
| "sampling/importance_sampling_ratio/mean": 0.9999938011169434, |
| "sampling/importance_sampling_ratio/min": 0.9999693036079407, |
| "sampling/sampling_logp_difference/max": 3.433253732509911e-05, |
| "sampling/sampling_logp_difference/mean": 2.6973182229994563e-06, |
| "step": 242, |
| "step_time": 11.291534194999713 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 307.125, |
| "completions/mean_terminated_length": 307.125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.851263379166085e-05, |
| "epoch": 0.00486, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.707106457615737e-05, |
| "kl": 5.56565897166729, |
| "learning_rate": 3.352092339575757e-05, |
| "loss": 0.0095, |
| "num_tokens": 9103556.0, |
| "reward": 0.5833538770675659, |
| "reward_std": 0.5242961645126343, |
| "rewards/rollout_reward_func/mean": 0.5833538770675659, |
| "rewards/rollout_reward_func/std": 0.5032540559768677, |
| "sampling/importance_sampling_ratio/max": 1.000014305114746, |
| "sampling/importance_sampling_ratio/mean": 0.9999957084655762, |
| "sampling/importance_sampling_ratio/min": 0.9999706745147705, |
| "sampling/sampling_logp_difference/max": 3.0160426831571385e-05, |
| "sampling/sampling_logp_difference/mean": 2.1681942143914057e-06, |
| "step": 243, |
| "step_time": 11.62106315400024 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 231.46875, |
| "completions/mean_terminated_length": 231.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.005396872410529e-05, |
| "epoch": 0.00488, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6445228538941592e-05, |
| "kl": 5.512866705656052, |
| "learning_rate": 3.324081003507593e-05, |
| "loss": 0.0098, |
| "num_tokens": 9141136.0, |
| "reward": 0.6134207844734192, |
| "reward_std": 0.4750402569770813, |
| "rewards/rollout_reward_func/mean": 0.6134207844734192, |
| "rewards/rollout_reward_func/std": 0.5004087090492249, |
| "sampling/importance_sampling_ratio/max": 1.0000059604644775, |
| "sampling/importance_sampling_ratio/mean": 0.9999967813491821, |
| "sampling/importance_sampling_ratio/min": 0.999974250793457, |
| "sampling/sampling_logp_difference/max": 2.849124211934395e-05, |
| "sampling/sampling_logp_difference/mean": 2.1164394183870172e-06, |
| "step": 244, |
| "step_time": 11.151449438999407 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 201.875, |
| "completions/mean_terminated_length": 201.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.096377369464335e-05, |
| "epoch": 0.0049, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.411771311017219e-05, |
| "kl": 4.767334572970867, |
| "learning_rate": 3.29648087889966e-05, |
| "loss": 0.0072, |
| "num_tokens": 9177427.0, |
| "reward": 0.5493415594100952, |
| "reward_std": 0.5647812485694885, |
| "rewards/rollout_reward_func/mean": 0.5493415594100952, |
| "rewards/rollout_reward_func/std": 0.5635629296302795, |
| "sampling/importance_sampling_ratio/max": 1.000023365020752, |
| "sampling/importance_sampling_ratio/mean": 0.999997615814209, |
| "sampling/importance_sampling_ratio/min": 0.9999508857727051, |
| "sampling/sampling_logp_difference/max": 5.1029317546635866e-05, |
| "sampling/sampling_logp_difference/mean": 2.1633254618791398e-06, |
| "step": 245, |
| "step_time": 11.763949329999832 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 160.0625, |
| "completions/mean_terminated_length": 160.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.01286881839269e-05, |
| "epoch": 0.00492, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5201369023998268e-05, |
| "kl": 4.452401959657436, |
| "learning_rate": 3.269295844699925e-05, |
| "loss": 0.0038, |
| "num_tokens": 9211181.0, |
| "reward": 0.7321707606315613, |
| "reward_std": 0.44021376967430115, |
| "rewards/rollout_reward_func/mean": 0.7321707606315613, |
| "rewards/rollout_reward_func/std": 0.4572508633136749, |
| "sampling/importance_sampling_ratio/max": 1.0000144243240356, |
| "sampling/importance_sampling_ratio/mean": 0.999998927116394, |
| "sampling/importance_sampling_ratio/min": 0.9999858140945435, |
| "sampling/sampling_logp_difference/max": 1.8358508896199055e-05, |
| "sampling/sampling_logp_difference/mean": 1.4965582977310987e-06, |
| "step": 246, |
| "step_time": 11.372876166001333 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 639.0, |
| "completions/max_terminated_length": 639.0, |
| "completions/mean_length": 150.46875, |
| "completions/mean_terminated_length": 150.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.801312877906639e-05, |
| "epoch": 0.00494, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5779214663780294e-05, |
| "kl": 5.292075231671333, |
| "learning_rate": 3.242529721519152e-05, |
| "loss": -0.0026, |
| "num_tokens": 9245647.0, |
| "reward": 0.5455915927886963, |
| "reward_std": 0.5671359300613403, |
| "rewards/rollout_reward_func/mean": 0.5455915927886963, |
| "rewards/rollout_reward_func/std": 0.5694448947906494, |
| "sampling/importance_sampling_ratio/max": 1.0000114440917969, |
| "sampling/importance_sampling_ratio/mean": 0.9999982118606567, |
| "sampling/importance_sampling_ratio/min": 0.9999804496765137, |
| "sampling/sampling_logp_difference/max": 2.324626620975323e-05, |
| "sampling/sampling_logp_difference/mean": 1.994621015910525e-06, |
| "step": 247, |
| "step_time": 10.116187335000177 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 316.5, |
| "completions/mean_terminated_length": 316.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.037800134592544e-05, |
| "epoch": 0.00496, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.4790740503231063e-05, |
| "kl": 4.973954036831856, |
| "learning_rate": 3.2161862710939476e-05, |
| "loss": 0.0204, |
| "num_tokens": 9286864.0, |
| "reward": 0.6830915212631226, |
| "reward_std": 0.4971539378166199, |
| "rewards/rollout_reward_func/mean": 0.6830915212631226, |
| "rewards/rollout_reward_func/std": 0.47586745023727417, |
| "sampling/importance_sampling_ratio/max": 1.0000207424163818, |
| "sampling/importance_sampling_ratio/mean": 0.9999940395355225, |
| "sampling/importance_sampling_ratio/min": 0.9999626874923706, |
| "sampling/sampling_logp_difference/max": 3.242294769734144e-05, |
| "sampling/sampling_logp_difference/mean": 3.037049282283988e-06, |
| "step": 248, |
| "step_time": 11.992384031000256 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 247.65625, |
| "completions/mean_terminated_length": 247.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.774960132498563e-05, |
| "epoch": 0.00498, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.237016203958774e-05, |
| "kl": 4.647655583918095, |
| "learning_rate": 3.1902691957580834e-05, |
| "loss": 0.0102, |
| "num_tokens": 9324389.0, |
| "reward": 0.6430915594100952, |
| "reward_std": 0.4474893808364868, |
| "rewards/rollout_reward_func/mean": 0.6430915594100952, |
| "rewards/rollout_reward_func/std": 0.49145370721817017, |
| "sampling/importance_sampling_ratio/max": 1.0000030994415283, |
| "sampling/importance_sampling_ratio/mean": 0.9999960660934448, |
| "sampling/importance_sampling_ratio/min": 0.9999740123748779, |
| "sampling/sampling_logp_difference/max": 2.6822204745258205e-05, |
| "sampling/sampling_logp_difference/mean": 2.2784347493143287e-06, |
| "step": 249, |
| "step_time": 10.406291274999603 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 302.9375, |
| "completions/mean_terminated_length": 302.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.460602839159947e-05, |
| "epoch": 0.005, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.1649214836070314e-05, |
| "kl": 5.410085678100586, |
| "learning_rate": 3.1647821379221695e-05, |
| "loss": 0.0123, |
| "num_tokens": 9365019.0, |
| "reward": 0.7090123295783997, |
| "reward_std": 0.47402238845825195, |
| "rewards/rollout_reward_func/mean": 0.7090123295783997, |
| "rewards/rollout_reward_func/std": 0.46759894490242004, |
| "sampling/importance_sampling_ratio/max": 1.0000172853469849, |
| "sampling/importance_sampling_ratio/mean": 0.9999974966049194, |
| "sampling/importance_sampling_ratio/min": 0.9999734163284302, |
| "sampling/sampling_logp_difference/max": 2.479633258190006e-05, |
| "sampling/sampling_logp_difference/mean": 2.1298787942214403e-06, |
| "step": 250, |
| "step_time": 11.301801955999508 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 247.96875, |
| "completions/mean_terminated_length": 247.96875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.343430341994917e-05, |
| "epoch": 0.00502, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.20609363168478e-05, |
| "kl": 5.1174429804086685, |
| "learning_rate": 3.139728679561744e-05, |
| "loss": 0.0003, |
| "num_tokens": 9403754.0, |
| "reward": 0.6780915260314941, |
| "reward_std": 0.5075234174728394, |
| "rewards/rollout_reward_func/mean": 0.6780915260314941, |
| "rewards/rollout_reward_func/std": 0.4863426387310028, |
| "sampling/importance_sampling_ratio/max": 1.0000110864639282, |
| "sampling/importance_sampling_ratio/mean": 0.9999961853027344, |
| "sampling/importance_sampling_ratio/min": 0.9999733567237854, |
| "sampling/sampling_logp_difference/max": 3.040073352167383e-05, |
| "sampling/sampling_logp_difference/mean": 2.2713497855875175e-06, |
| "step": 251, |
| "step_time": 11.164583231999586 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 243.9375, |
| "completions/mean_terminated_length": 243.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.330492835601945e-05, |
| "epoch": 0.00504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.416094452608377e-05, |
| "kl": 5.1367504596710205, |
| "learning_rate": 3.1151123417138556e-05, |
| "loss": 0.0115, |
| "num_tokens": 9442316.0, |
| "reward": 0.5840123295783997, |
| "reward_std": 0.5598544478416443, |
| "rewards/rollout_reward_func/mean": 0.5840123295783997, |
| "rewards/rollout_reward_func/std": 0.5602787733078003, |
| "sampling/importance_sampling_ratio/max": 1.0000160932540894, |
| "sampling/importance_sampling_ratio/mean": 0.9999907612800598, |
| "sampling/importance_sampling_ratio/min": 0.9998561143875122, |
| "sampling/sampling_logp_difference/max": 0.00012445918400771916, |
| "sampling/sampling_logp_difference/mean": 3.5559787647798657e-06, |
| "step": 252, |
| "step_time": 11.626074560000234 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 549.0, |
| "completions/max_terminated_length": 549.0, |
| "completions/mean_length": 198.71875, |
| "completions/mean_terminated_length": 198.71875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.531994623377614e-05, |
| "epoch": 0.00506, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.664622873766348e-05, |
| "kl": 4.52351340779569, |
| "learning_rate": 3.090936583982223e-05, |
| "loss": -0.0237, |
| "num_tokens": 9478355.0, |
| "reward": 0.8955914974212646, |
| "reward_std": 0.23238001763820648, |
| "rewards/rollout_reward_func/mean": 0.8955914974212646, |
| "rewards/rollout_reward_func/std": 0.34063035249710083, |
| "sampling/importance_sampling_ratio/max": 1.0000280141830444, |
| "sampling/importance_sampling_ratio/mean": 0.9999982118606567, |
| "sampling/importance_sampling_ratio/min": 0.999969482421875, |
| "sampling/sampling_logp_difference/max": 3.838152042590082e-05, |
| "sampling/sampling_logp_difference/mean": 2.3958950805536006e-06, |
| "step": 253, |
| "step_time": 9.807439852000243 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 182.40625, |
| "completions/mean_terminated_length": 182.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.432378395653359e-05, |
| "epoch": 0.00508, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9421602701186202e-05, |
| "kl": 4.590497374534607, |
| "learning_rate": 3.067204804051008e-05, |
| "loss": 0.0097, |
| "num_tokens": 9512954.0, |
| "reward": 0.7674999833106995, |
| "reward_std": 0.44722259044647217, |
| "rewards/rollout_reward_func/mean": 0.7674999833106995, |
| "rewards/rollout_reward_func/std": 0.43893563747406006, |
| "sampling/importance_sampling_ratio/max": 1.0000064373016357, |
| "sampling/importance_sampling_ratio/mean": 0.9999945759773254, |
| "sampling/importance_sampling_ratio/min": 0.9999449253082275, |
| "sampling/sampling_logp_difference/max": 3.469131115707569e-05, |
| "sampling/sampling_logp_difference/mean": 2.1483758700924227e-06, |
| "step": 254, |
| "step_time": 11.132221287000448 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 230.28125, |
| "completions/mean_terminated_length": 230.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.0313648162946265e-05, |
| "epoch": 0.0051, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7147687685792334e-05, |
| "kl": 4.427471227943897, |
| "learning_rate": 3.04392033720731e-05, |
| "loss": 0.0068, |
| "num_tokens": 9549669.0, |
| "reward": 0.7380915284156799, |
| "reward_std": 0.47702929377555847, |
| "rewards/rollout_reward_func/mean": 0.7380915284156799, |
| "rewards/rollout_reward_func/std": 0.5209257006645203, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999964833259583, |
| "sampling/importance_sampling_ratio/min": 0.9999605417251587, |
| "sampling/sampling_logp_difference/max": 3.421324800001457e-05, |
| "sampling/sampling_logp_difference/mean": 2.215445874753641e-06, |
| "step": 255, |
| "step_time": 10.346654982000928 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 242.65625, |
| "completions/mean_terminated_length": 242.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.65414812885956e-05, |
| "epoch": 0.00512, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.15218052896671e-05, |
| "kl": 5.355410695075989, |
| "learning_rate": 3.0210864558724166e-05, |
| "loss": -0.0081, |
| "num_tokens": 9588652.0, |
| "reward": 0.5530915260314941, |
| "reward_std": 0.5290231108665466, |
| "rewards/rollout_reward_func/mean": 0.5530915260314941, |
| "rewards/rollout_reward_func/std": 0.513593852519989, |
| "sampling/importance_sampling_ratio/max": 1.0000211000442505, |
| "sampling/importance_sampling_ratio/mean": 0.9999995827674866, |
| "sampling/importance_sampling_ratio/min": 0.9999812841415405, |
| "sampling/sampling_logp_difference/max": 2.8729844416375272e-05, |
| "sampling/sampling_logp_difference/mean": 2.4928499442467e-06, |
| "step": 256, |
| "step_time": 11.143621553000685 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 669.0, |
| "completions/max_terminated_length": 669.0, |
| "completions/mean_length": 209.78125, |
| "completions/mean_terminated_length": 209.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.505985173850149e-05, |
| "epoch": 0.00514, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.374975055223331e-05, |
| "kl": 4.920541919767857, |
| "learning_rate": 2.9987063691418976e-05, |
| "loss": 0.0012, |
| "num_tokens": 9625429.0, |
| "reward": 0.7693415880203247, |
| "reward_std": 0.4513539671897888, |
| "rewards/rollout_reward_func/mean": 0.7693415880203247, |
| "rewards/rollout_reward_func/std": 0.4429239332675934, |
| "sampling/importance_sampling_ratio/max": 1.0000171661376953, |
| "sampling/importance_sampling_ratio/mean": 0.9999936819076538, |
| "sampling/importance_sampling_ratio/min": 0.9998266100883484, |
| "sampling/sampling_logp_difference/max": 0.00015843386063352227, |
| "sampling/sampling_logp_difference/mean": 2.968661647173576e-06, |
| "step": 257, |
| "step_time": 10.483305359999576 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 245.9375, |
| "completions/mean_terminated_length": 245.9375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.710858835707768e-05, |
| "epoch": 0.00516, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.126638537447434e-05, |
| "kl": 6.137430131435394, |
| "learning_rate": 2.9767832223345916e-05, |
| "loss": 0.0035, |
| "num_tokens": 9665099.0, |
| "reward": 0.36309152841567993, |
| "reward_std": 0.5653538107872009, |
| "rewards/rollout_reward_func/mean": 0.36309152841567993, |
| "rewards/rollout_reward_func/std": 0.5455530285835266, |
| "sampling/importance_sampling_ratio/max": 1.000001072883606, |
| "sampling/importance_sampling_ratio/mean": 0.9999943971633911, |
| "sampling/importance_sampling_ratio/min": 0.9999680519104004, |
| "sampling/sampling_logp_difference/max": 3.0160221285768785e-05, |
| "sampling/sampling_logp_difference/mean": 2.584924914117437e-06, |
| "step": 258, |
| "step_time": 11.087042529000428 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 222.40625, |
| "completions/mean_terminated_length": 222.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.469891036748777e-05, |
| "epoch": 0.00518, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8265869584865868e-05, |
| "kl": 4.711505472660065, |
| "learning_rate": 2.9553200965505647e-05, |
| "loss": 0.0056, |
| "num_tokens": 9702437.0, |
| "reward": 0.7709207534790039, |
| "reward_std": 0.4500967860221863, |
| "rewards/rollout_reward_func/mean": 0.7709207534790039, |
| "rewards/rollout_reward_func/std": 0.4417072534561157, |
| "sampling/importance_sampling_ratio/max": 1.0000097751617432, |
| "sampling/importance_sampling_ratio/mean": 0.9999932646751404, |
| "sampling/importance_sampling_ratio/min": 0.9999553561210632, |
| "sampling/sampling_logp_difference/max": 4.124702900298871e-05, |
| "sampling/sampling_logp_difference/mean": 2.601570486149285e-06, |
| "step": 259, |
| "step_time": 10.336212901999716 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 295.53125, |
| "completions/mean_terminated_length": 295.53125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.37967036804821e-05, |
| "epoch": 0.0052, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2233665731619112e-05, |
| "kl": 5.106555700302124, |
| "learning_rate": 2.9343200082380866e-05, |
| "loss": 0.009, |
| "num_tokens": 9742666.0, |
| "reward": 0.7430247068405151, |
| "reward_std": 0.5055485963821411, |
| "rewards/rollout_reward_func/mean": 0.7430247068405151, |
| "rewards/rollout_reward_func/std": 0.5186600685119629, |
| "sampling/importance_sampling_ratio/max": 1.000009298324585, |
| "sampling/importance_sampling_ratio/mean": 0.9999918937683105, |
| "sampling/importance_sampling_ratio/min": 0.9999312162399292, |
| "sampling/sampling_logp_difference/max": 6.461232260335237e-05, |
| "sampling/sampling_logp_difference/mean": 2.8128415578976274e-06, |
| "step": 260, |
| "step_time": 11.534494020001148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 316.65625, |
| "completions/mean_terminated_length": 316.65625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.77153407369724e-05, |
| "epoch": 0.00522, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.157628038432449e-05, |
| "kl": 5.550658762454987, |
| "learning_rate": 2.9137859087696982e-05, |
| "loss": 0.0096, |
| "num_tokens": 9783874.0, |
| "reward": 0.4912499785423279, |
| "reward_std": 0.5208456516265869, |
| "rewards/rollout_reward_func/mean": 0.4912499785423279, |
| "rewards/rollout_reward_func/std": 0.5062942504882812, |
| "sampling/importance_sampling_ratio/max": 1.0000102519989014, |
| "sampling/importance_sampling_ratio/mean": 0.9999921321868896, |
| "sampling/importance_sampling_ratio/min": 0.999884843826294, |
| "sampling/sampling_logp_difference/max": 0.00010276297689415514, |
| "sampling/sampling_logp_difference/mean": 3.043264314328553e-06, |
| "step": 261, |
| "step_time": 11.332538519000991 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 232.4375, |
| "completions/mean_terminated_length": 232.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.221832322239607e-05, |
| "epoch": 0.00524, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.8388696591719054e-05, |
| "kl": 5.240327179431915, |
| "learning_rate": 2.8937206840274185e-05, |
| "loss": -0.0008, |
| "num_tokens": 9822174.0, |
| "reward": 0.6765123605728149, |
| "reward_std": 0.46574604511260986, |
| "rewards/rollout_reward_func/mean": 0.6765123605728149, |
| "rewards/rollout_reward_func/std": 0.4871373772621155, |
| "sampling/importance_sampling_ratio/max": 1.0000027418136597, |
| "sampling/importance_sampling_ratio/mean": 0.9999960660934448, |
| "sampling/importance_sampling_ratio/min": 0.9999709725379944, |
| "sampling/sampling_logp_difference/max": 1.7762320567271672e-05, |
| "sampling/sampling_logp_difference/mean": 2.006978093049838e-06, |
| "step": 262, |
| "step_time": 11.319322815998476 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 162.5, |
| "completions/mean_terminated_length": 162.5, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 3.778687000988157e-05, |
| "epoch": 0.00526, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1966519195993897e-05, |
| "kl": 4.899482652544975, |
| "learning_rate": 2.8741271539971675e-05, |
| "loss": 0.0028, |
| "num_tokens": 9856162.0, |
| "reward": 0.6996707916259766, |
| "reward_std": 0.48187097907066345, |
| "rewards/rollout_reward_func/mean": 0.6996707916259766, |
| "rewards/rollout_reward_func/std": 0.47140640020370483, |
| "sampling/importance_sampling_ratio/max": 1.0000178813934326, |
| "sampling/importance_sampling_ratio/mean": 1.000000238418579, |
| "sampling/importance_sampling_ratio/min": 0.999981164932251, |
| "sampling/sampling_logp_difference/max": 1.8714017642196268e-05, |
| "sampling/sampling_logp_difference/mean": 1.450816625947482e-06, |
| "step": 263, |
| "step_time": 10.3198294609997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 199.28125, |
| "completions/mean_terminated_length": 199.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.570318305103683e-05, |
| "epoch": 0.00528, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.6365134544903412e-05, |
| "kl": 4.995322413742542, |
| "learning_rate": 2.8550080723724342e-05, |
| "loss": 0.0073, |
| "num_tokens": 9891750.0, |
| "reward": 0.7674999833106995, |
| "reward_std": 0.44868531823158264, |
| "rewards/rollout_reward_func/mean": 0.7674999833106995, |
| "rewards/rollout_reward_func/std": 0.43893563747406006, |
| "sampling/importance_sampling_ratio/max": 1.0000247955322266, |
| "sampling/importance_sampling_ratio/mean": 0.9999992251396179, |
| "sampling/importance_sampling_ratio/min": 0.9999715089797974, |
| "sampling/sampling_logp_difference/max": 2.860890526790172e-05, |
| "sampling/sampling_logp_difference/mean": 2.4780854346317938e-06, |
| "step": 264, |
| "step_time": 11.099350021000191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 284.59375, |
| "completions/mean_terminated_length": 284.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.459991841254123e-05, |
| "epoch": 0.0053, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.00013145462435204536, |
| "kl": 4.65108397603035, |
| "learning_rate": 2.8363661261672758e-05, |
| "loss": 0.015, |
| "num_tokens": 9930243.0, |
| "reward": 0.7093415260314941, |
| "reward_std": 0.48638349771499634, |
| "rewards/rollout_reward_func/mean": 0.7093415260314941, |
| "rewards/rollout_reward_func/std": 0.4678354859352112, |
| "sampling/importance_sampling_ratio/max": 1.0000114440917969, |
| "sampling/importance_sampling_ratio/mean": 0.9999954700469971, |
| "sampling/importance_sampling_ratio/min": 0.9999157786369324, |
| "sampling/sampling_logp_difference/max": 6.986263178987429e-05, |
| "sampling/sampling_logp_difference/mean": 2.4661187580932165e-06, |
| "step": 265, |
| "step_time": 10.411334865998924 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 161.78125, |
| "completions/mean_terminated_length": 161.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.54108203374426e-05, |
| "epoch": 0.00532, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4415406440093648e-05, |
| "kl": 4.443069338798523, |
| "learning_rate": 2.8182039353386807e-05, |
| "loss": -0.0007, |
| "num_tokens": 9963996.0, |
| "reward": 0.764012336730957, |
| "reward_std": 0.5061614513397217, |
| "rewards/rollout_reward_func/mean": 0.764012336730957, |
| "rewards/rollout_reward_func/std": 0.506635308265686, |
| "sampling/importance_sampling_ratio/max": 1.00002920627594, |
| "sampling/importance_sampling_ratio/mean": 0.9999999403953552, |
| "sampling/importance_sampling_ratio/min": 0.9999759197235107, |
| "sampling/sampling_logp_difference/max": 3.4687778679654e-05, |
| "sampling/sampling_logp_difference/mean": 1.926028062371188e-06, |
| "step": 266, |
| "step_time": 11.121112325000013 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 233.5625, |
| "completions/mean_terminated_length": 233.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.09017165121395e-05, |
| "epoch": 0.00534, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.388181357877329e-05, |
| "kl": 5.19572713971138, |
| "learning_rate": 2.800524052418356e-05, |
| "loss": -0.0123, |
| "num_tokens": 10001566.0, |
| "reward": 0.7383538484573364, |
| "reward_std": 0.4403288960456848, |
| "rewards/rollout_reward_func/mean": 0.7383538484573364, |
| "rewards/rollout_reward_func/std": 0.5228644013404846, |
| "sampling/importance_sampling_ratio/max": 1.0000412464141846, |
| "sampling/importance_sampling_ratio/mean": 0.9999992251396179, |
| "sampling/importance_sampling_ratio/min": 0.999980628490448, |
| "sampling/sampling_logp_difference/max": 3.74296578229405e-05, |
| "sampling/sampling_logp_difference/mean": 1.971904794118018e-06, |
| "step": 267, |
| "step_time": 10.228456085999824 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 273.5625, |
| "completions/mean_terminated_length": 273.5625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.407312296299096e-05, |
| "epoch": 0.00536, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.846478699008003e-05, |
| "kl": 5.062362797558308, |
| "learning_rate": 2.7833289621539925e-05, |
| "loss": 0.0089, |
| "num_tokens": 10040557.0, |
| "reward": 0.6143415570259094, |
| "reward_std": 0.4798487722873688, |
| "rewards/rollout_reward_func/mean": 0.6143415570259094, |
| "rewards/rollout_reward_func/std": 0.5538265109062195, |
| "sampling/importance_sampling_ratio/max": 1.0000193119049072, |
| "sampling/importance_sampling_ratio/mean": 0.9999940395355225, |
| "sampling/importance_sampling_ratio/min": 0.9999433755874634, |
| "sampling/sampling_logp_difference/max": 5.412838072516024e-05, |
| "sampling/sampling_logp_difference/mean": 2.6531811272434425e-06, |
| "step": 268, |
| "step_time": 10.941937255999164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 231.4375, |
| "completions/mean_terminated_length": 231.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.330135456129483e-05, |
| "epoch": 0.00538, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.1427702197106555e-05, |
| "kl": 4.888749971985817, |
| "learning_rate": 2.766621081160059e-05, |
| "loss": 0.0038, |
| "num_tokens": 10077746.0, |
| "reward": 0.7699331045150757, |
| "reward_std": 0.42729663848876953, |
| "rewards/rollout_reward_func/mean": 0.7699331045150757, |
| "rewards/rollout_reward_func/std": 0.44109678268432617, |
| "sampling/importance_sampling_ratio/max": 1.0000017881393433, |
| "sampling/importance_sampling_ratio/mean": 0.9999940991401672, |
| "sampling/importance_sampling_ratio/min": 0.999950647354126, |
| "sampling/sampling_logp_difference/max": 5.555623647524044e-05, |
| "sampling/sampling_logp_difference/mean": 2.4010546439967584e-06, |
| "step": 269, |
| "step_time": 10.250042127999677 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 315.34375, |
| "completions/mean_terminated_length": 315.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.789425674786798e-05, |
| "epoch": 0.0054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.233926690882072e-05, |
| "kl": 5.45307557284832, |
| "learning_rate": 2.7504027575781634e-05, |
| "loss": -0.0033, |
| "num_tokens": 10119423.0, |
| "reward": 0.6190123558044434, |
| "reward_std": 0.5169612169265747, |
| "rewards/rollout_reward_func/mean": 0.6190123558044434, |
| "rewards/rollout_reward_func/std": 0.5024634003639221, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999967813491821, |
| "sampling/importance_sampling_ratio/min": 0.9999703764915466, |
| "sampling/sampling_logp_difference/max": 3.6239842302165926e-05, |
| "sampling/sampling_logp_difference/mean": 2.813545734170475e-06, |
| "step": 270, |
| "step_time": 11.126338212000519 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 233.75, |
| "completions/mean_terminated_length": 233.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.70626255470097e-05, |
| "epoch": 0.00542, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.3942422558320686e-05, |
| "kl": 5.092776149511337, |
| "learning_rate": 2.734676270747047e-05, |
| "loss": 0.0042, |
| "num_tokens": 10156839.0, |
| "reward": 0.6436830759048462, |
| "reward_std": 0.49129801988601685, |
| "rewards/rollout_reward_func/mean": 0.6436830759048462, |
| "rewards/rollout_reward_func/std": 0.4945172667503357, |
| "sampling/importance_sampling_ratio/max": 1.000022530555725, |
| "sampling/importance_sampling_ratio/mean": 0.9999971389770508, |
| "sampling/importance_sampling_ratio/min": 0.9999656677246094, |
| "sampling/sampling_logp_difference/max": 3.0277311452664435e-05, |
| "sampling/sampling_logp_difference/mean": 2.1845353330718353e-06, |
| "step": 271, |
| "step_time": 10.331348605999665 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 304.46875, |
| "completions/mean_terminated_length": 304.46875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.222742306112195e-05, |
| "epoch": 0.00544, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3053702787146904e-05, |
| "kl": 5.665085136890411, |
| "learning_rate": 2.7194438308822428e-05, |
| "loss": 0.0068, |
| "num_tokens": 10197931.0, |
| "reward": 0.5868415832519531, |
| "reward_std": 0.5270721912384033, |
| "rewards/rollout_reward_func/mean": 0.5868415832519531, |
| "rewards/rollout_reward_func/std": 0.5654386878013611, |
| "sampling/importance_sampling_ratio/max": 1.0000213384628296, |
| "sampling/importance_sampling_ratio/mean": 0.9999969005584717, |
| "sampling/importance_sampling_ratio/min": 0.9999679923057556, |
| "sampling/sampling_logp_difference/max": 3.0994589906185865e-05, |
| "sampling/sampling_logp_difference/mean": 2.5059268864424666e-06, |
| "step": 272, |
| "step_time": 11.521847475000413 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 224.84375, |
| "completions/mean_terminated_length": 224.84375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.230691576547542e-05, |
| "epoch": 0.00546, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.916292553185485e-05, |
| "kl": 5.193328641355038, |
| "learning_rate": 2.7047075787654503e-05, |
| "loss": 0.0114, |
| "num_tokens": 10235147.0, |
| "reward": 0.6105915307998657, |
| "reward_std": 0.4785401225090027, |
| "rewards/rollout_reward_func/mean": 0.6105915307998657, |
| "rewards/rollout_reward_func/std": 0.4986937940120697, |
| "sampling/importance_sampling_ratio/max": 1.0000423192977905, |
| "sampling/importance_sampling_ratio/mean": 0.999998152256012, |
| "sampling/importance_sampling_ratio/min": 0.9999819993972778, |
| "sampling/sampling_logp_difference/max": 3.814511001110077e-05, |
| "sampling/sampling_logp_difference/mean": 1.9021531443286221e-06, |
| "step": 273, |
| "step_time": 11.075365086000602 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 269.34375, |
| "completions/mean_terminated_length": 269.34375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.955465923714655e-05, |
| "epoch": 0.00548, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.947480217902921e-05, |
| "kl": 4.885876089334488, |
| "learning_rate": 2.6904695854436662e-05, |
| "loss": 0.0102, |
| "num_tokens": 10273871.0, |
| "reward": 0.6780915260314941, |
| "reward_std": 0.501719057559967, |
| "rewards/rollout_reward_func/mean": 0.6780915260314941, |
| "rewards/rollout_reward_func/std": 0.4803001284599304, |
| "sampling/importance_sampling_ratio/max": 1.0000174045562744, |
| "sampling/importance_sampling_ratio/mean": 0.9999932050704956, |
| "sampling/importance_sampling_ratio/min": 0.9998982548713684, |
| "sampling/sampling_logp_difference/max": 0.00011146671022288501, |
| "sampling/sampling_logp_difference/mean": 2.8891031433886383e-06, |
| "step": 274, |
| "step_time": 10.701226339999266 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 619.0, |
| "completions/max_terminated_length": 619.0, |
| "completions/mean_length": 241.25, |
| "completions/mean_terminated_length": 241.25, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.047394075560987e-05, |
| "epoch": 0.0055, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.9621622161357664e-05, |
| "kl": 4.531084284186363, |
| "learning_rate": 2.676731851938118e-05, |
| "loss": -0.0066, |
| "num_tokens": 10311647.0, |
| "reward": 0.8674999475479126, |
| "reward_std": 0.31253060698509216, |
| "rewards/rollout_reward_func/mean": 0.8674999475479126, |
| "rewards/rollout_reward_func/std": 0.3693324327468872, |
| "sampling/importance_sampling_ratio/max": 1.0000293254852295, |
| "sampling/importance_sampling_ratio/mean": 0.9999931454658508, |
| "sampling/importance_sampling_ratio/min": 0.9999464154243469, |
| "sampling/sampling_logp_difference/max": 3.874358662869781e-05, |
| "sampling/sampling_logp_difference/mean": 3.144346919725649e-06, |
| "step": 275, |
| "step_time": 10.410628370998893 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 284.75, |
| "completions/mean_terminated_length": 284.75, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.49887765607582e-05, |
| "epoch": 0.00552, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.5303739676019177e-05, |
| "kl": 4.912029400467873, |
| "learning_rate": 2.663496308963041e-05, |
| "loss": -0.0006, |
| "num_tokens": 10350910.0, |
| "reward": 0.7405915260314941, |
| "reward_std": 0.479642391204834, |
| "rewards/rollout_reward_func/mean": 0.7405915260314941, |
| "rewards/rollout_reward_func/std": 0.4598024785518646, |
| "sampling/importance_sampling_ratio/max": 1.0000290870666504, |
| "sampling/importance_sampling_ratio/mean": 0.999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999715089797974, |
| "sampling/sampling_logp_difference/max": 3.612082946347073e-05, |
| "sampling/sampling_logp_difference/mean": 2.5878532596834702e-06, |
| "step": 276, |
| "step_time": 11.039417921998393 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 181.0625, |
| "completions/mean_terminated_length": 181.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.9210345821393275e-05, |
| "epoch": 0.00554, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9761475414270535e-05, |
| "kl": 5.184775277972221, |
| "learning_rate": 2.6507648166543308e-05, |
| "loss": 0.0078, |
| "num_tokens": 10386430.0, |
| "reward": 0.7021707892417908, |
| "reward_std": 0.48910099267959595, |
| "rewards/rollout_reward_func/mean": 0.7021707892417908, |
| "rewards/rollout_reward_func/std": 0.470414936542511, |
| "sampling/importance_sampling_ratio/max": 1.0000289678573608, |
| "sampling/importance_sampling_ratio/mean": 1.0000001192092896, |
| "sampling/importance_sampling_ratio/min": 0.9999823570251465, |
| "sampling/sampling_logp_difference/max": 2.503280120436102e-05, |
| "sampling/sampling_logp_difference/mean": 1.6454481510663754e-06, |
| "step": 277, |
| "step_time": 10.672157200998754 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 242.15625, |
| "completions/mean_terminated_length": 242.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.979084449199036e-05, |
| "epoch": 0.00556, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7508402126841247e-05, |
| "kl": 4.862967908382416, |
| "learning_rate": 2.63853916430812e-05, |
| "loss": -0.0024, |
| "num_tokens": 10424522.0, |
| "reward": 0.6446039080619812, |
| "reward_std": 0.5659518241882324, |
| "rewards/rollout_reward_func/mean": 0.6446039080619812, |
| "rewards/rollout_reward_func/std": 0.554857611656189, |
| "sampling/importance_sampling_ratio/max": 1.000006914138794, |
| "sampling/importance_sampling_ratio/mean": 0.9999958276748657, |
| "sampling/importance_sampling_ratio/min": 0.9999752640724182, |
| "sampling/sampling_logp_difference/max": 2.1934793039690703e-05, |
| "sampling/sampling_logp_difference/mean": 1.790413080016151e-06, |
| "step": 278, |
| "step_time": 10.83243991900008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 211.4375, |
| "completions/mean_terminated_length": 211.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.138452050663545e-05, |
| "epoch": 0.00558, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7026304451283067e-05, |
| "kl": 4.282514490187168, |
| "learning_rate": 2.6268210701293073e-05, |
| "loss": 0.0016, |
| "num_tokens": 10459611.0, |
| "reward": 0.7649331092834473, |
| "reward_std": 0.45170778036117554, |
| "rewards/rollout_reward_func/mean": 0.7649331092834473, |
| "rewards/rollout_reward_func/std": 0.44183656573295593, |
| "sampling/importance_sampling_ratio/max": 1.00001060962677, |
| "sampling/importance_sampling_ratio/mean": 0.9999970197677612, |
| "sampling/importance_sampling_ratio/min": 0.9999755620956421, |
| "sampling/sampling_logp_difference/max": 2.6465353585081175e-05, |
| "sampling/sampling_logp_difference/mean": 2.6112129489774816e-06, |
| "step": 279, |
| "step_time": 10.760200042999259 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 678.0, |
| "completions/max_terminated_length": 678.0, |
| "completions/mean_length": 202.6875, |
| "completions/mean_terminated_length": 202.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.559260192773309e-05, |
| "epoch": 0.0056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.933691525366157e-05, |
| "kl": 5.390433177351952, |
| "learning_rate": 2.615612180990079e-05, |
| "loss": -0.0067, |
| "num_tokens": 10497249.0, |
| "reward": 0.6449999809265137, |
| "reward_std": 0.4967680275440216, |
| "rewards/rollout_reward_func/mean": 0.6449999809265137, |
| "rewards/rollout_reward_func/std": 0.4975034296512604, |
| "sampling/importance_sampling_ratio/max": 1.0000171661376953, |
| "sampling/importance_sampling_ratio/mean": 0.9999953508377075, |
| "sampling/importance_sampling_ratio/min": 0.9999397993087769, |
| "sampling/sampling_logp_difference/max": 6.330482574412599e-05, |
| "sampling/sampling_logp_difference/mean": 2.610198407637654e-06, |
| "step": 280, |
| "step_time": 10.605298713000593 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 276.4375, |
| "completions/mean_terminated_length": 276.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.060739913024918e-05, |
| "epoch": 0.00562, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.849232547916472e-05, |
| "kl": 5.0677084401249886, |
| "learning_rate": 2.60491407219846e-05, |
| "loss": -0.005, |
| "num_tokens": 10537347.0, |
| "reward": 0.7115123271942139, |
| "reward_std": 0.4817769229412079, |
| "rewards/rollout_reward_func/mean": 0.7115123271942139, |
| "rewards/rollout_reward_func/std": 0.4767700135707855, |
| "sampling/importance_sampling_ratio/max": 1.0000137090682983, |
| "sampling/importance_sampling_ratio/mean": 0.9999938607215881, |
| "sampling/importance_sampling_ratio/min": 0.9999478459358215, |
| "sampling/sampling_logp_difference/max": 4.4942185922991484e-05, |
| "sampling/sampling_logp_difference/mean": 2.9220987016742583e-06, |
| "step": 281, |
| "step_time": 10.972126118999768 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 232.40625, |
| "completions/mean_terminated_length": 232.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.892862940333089e-05, |
| "epoch": 0.00564, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8199409169028513e-05, |
| "kl": 5.05782288312912, |
| "learning_rate": 2.594728247276913e-05, |
| "loss": -0.0011, |
| "num_tokens": 10575586.0, |
| "reward": 0.6449331045150757, |
| "reward_std": 0.5124734044075012, |
| "rewards/rollout_reward_func/mean": 0.6449331045150757, |
| "rewards/rollout_reward_func/std": 0.4961819350719452, |
| "sampling/importance_sampling_ratio/max": 1.000025987625122, |
| "sampling/importance_sampling_ratio/mean": 0.9999963045120239, |
| "sampling/importance_sampling_ratio/min": 0.9999744296073914, |
| "sampling/sampling_logp_difference/max": 4.422439087647945e-05, |
| "sampling/sampling_logp_difference/mean": 2.5519543669361155e-06, |
| "step": 282, |
| "step_time": 10.717279948999021 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 220.8125, |
| "completions/mean_terminated_length": 220.8125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.212359551227564e-05, |
| "epoch": 0.00566, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.04080297407927e-05, |
| "kl": 5.217775493860245, |
| "learning_rate": 2.5850561377510356e-05, |
| "loss": 0.0129, |
| "num_tokens": 10613250.0, |
| "reward": 0.6746707558631897, |
| "reward_std": 0.4775552451610565, |
| "rewards/rollout_reward_func/mean": 0.6746707558631897, |
| "rewards/rollout_reward_func/std": 0.4804707467556, |
| "sampling/importance_sampling_ratio/max": 1.0000135898590088, |
| "sampling/importance_sampling_ratio/mean": 0.9999940395355225, |
| "sampling/importance_sampling_ratio/min": 0.9999485015869141, |
| "sampling/sampling_logp_difference/max": 5.3048734116600826e-05, |
| "sampling/sampling_logp_difference/mean": 2.8808067327190656e-06, |
| "step": 283, |
| "step_time": 10.839048934998573 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 291.15625, |
| "completions/mean_terminated_length": 291.15625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.180986020216551e-05, |
| "epoch": 0.00568, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.043676249741111e-05, |
| "kl": 5.018721252679825, |
| "learning_rate": 2.5758991029483713e-05, |
| "loss": 0.0081, |
| "num_tokens": 10653230.0, |
| "reward": 0.6168415546417236, |
| "reward_std": 0.5035527944564819, |
| "rewards/rollout_reward_func/mean": 0.6168415546417236, |
| "rewards/rollout_reward_func/std": 0.49877506494522095, |
| "sampling/importance_sampling_ratio/max": 1.0000146627426147, |
| "sampling/importance_sampling_ratio/mean": 0.9999990463256836, |
| "sampling/importance_sampling_ratio/min": 0.9999721050262451, |
| "sampling/sampling_logp_difference/max": 2.6345524020143785e-05, |
| "sampling/sampling_logp_difference/mean": 2.221646809630329e-06, |
| "step": 284, |
| "step_time": 11.054412653003055 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 275.78125, |
| "completions/mean_terminated_length": 275.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.33028123928625e-05, |
| "epoch": 0.0057, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.267479329835624e-05, |
| "kl": 4.437944404780865, |
| "learning_rate": 2.5672584298073688e-05, |
| "loss": 0.024, |
| "num_tokens": 10690936.0, |
| "reward": 0.6755915284156799, |
| "reward_std": 0.4725848436355591, |
| "rewards/rollout_reward_func/mean": 0.6755915284156799, |
| "rewards/rollout_reward_func/std": 0.47646206617355347, |
| "sampling/importance_sampling_ratio/max": 1.000028133392334, |
| "sampling/importance_sampling_ratio/mean": 0.9999963641166687, |
| "sampling/importance_sampling_ratio/min": 0.9999643564224243, |
| "sampling/sampling_logp_difference/max": 2.8133623345638625e-05, |
| "sampling/sampling_logp_difference/mean": 2.420891178189777e-06, |
| "step": 285, |
| "step_time": 10.956865436998669 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 305.09375, |
| "completions/mean_terminated_length": 305.09375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.367031156491066e-05, |
| "epoch": 0.00572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.3896298444014974e-05, |
| "kl": 5.410068087279797, |
| "learning_rate": 2.5591353326965118e-05, |
| "loss": 0.0064, |
| "num_tokens": 10731982.0, |
| "reward": 0.5849331617355347, |
| "reward_std": 0.5261651873588562, |
| "rewards/rollout_reward_func/mean": 0.5849331617355347, |
| "rewards/rollout_reward_func/std": 0.5046738386154175, |
| "sampling/importance_sampling_ratio/max": 1.0000083446502686, |
| "sampling/importance_sampling_ratio/mean": 0.9999963045120239, |
| "sampling/importance_sampling_ratio/min": 0.9999761581420898, |
| "sampling/sampling_logp_difference/max": 2.6822675863513723e-05, |
| "sampling/sampling_logp_difference/mean": 2.169199888157891e-06, |
| "step": 286, |
| "step_time": 11.055283683998823 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 609.0, |
| "completions/max_terminated_length": 609.0, |
| "completions/mean_length": 173.90625, |
| "completions/mean_terminated_length": 173.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.815319975965849e-05, |
| "epoch": 0.00574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.01816469093319e-05, |
| "kl": 4.101704500615597, |
| "learning_rate": 2.551530953243656e-05, |
| "loss": 0.0116, |
| "num_tokens": 10765955.0, |
| "reward": 0.7652623653411865, |
| "reward_std": 0.4456794261932373, |
| "rewards/rollout_reward_func/mean": 0.7652623653411865, |
| "rewards/rollout_reward_func/std": 0.4375646412372589, |
| "sampling/importance_sampling_ratio/max": 1.0000256299972534, |
| "sampling/importance_sampling_ratio/mean": 0.999998927116394, |
| "sampling/importance_sampling_ratio/min": 0.9999776482582092, |
| "sampling/sampling_logp_difference/max": 3.719073720276356e-05, |
| "sampling/sampling_logp_difference/mean": 2.711852630454814e-06, |
| "step": 287, |
| "step_time": 10.189362570999037 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 260.90625, |
| "completions/mean_terminated_length": 260.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.0160799051800495e-05, |
| "epoch": 0.00576, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.7769527150667273e-05, |
| "kl": 4.67512171715498, |
| "learning_rate": 2.5444463601755776e-05, |
| "loss": 0.0217, |
| "num_tokens": 10803410.0, |
| "reward": 0.7071707248687744, |
| "reward_std": 0.45949870347976685, |
| "rewards/rollout_reward_func/mean": 0.7071707248687744, |
| "rewards/rollout_reward_func/std": 0.46562138199806213, |
| "sampling/importance_sampling_ratio/max": 1.0000154972076416, |
| "sampling/importance_sampling_ratio/mean": 0.999997615814209, |
| "sampling/importance_sampling_ratio/min": 0.9999765753746033, |
| "sampling/sampling_logp_difference/max": 4.637298115994781e-05, |
| "sampling/sampling_logp_difference/mean": 2.2777503545512445e-06, |
| "step": 288, |
| "step_time": 10.624308804000066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 284.28125, |
| "completions/mean_terminated_length": 284.28125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.007407710408643e-05, |
| "epoch": 0.00578, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.265037129400298e-05, |
| "kl": 5.380328759551048, |
| "learning_rate": 2.537882549167776e-05, |
| "loss": -0.0069, |
| "num_tokens": 10843642.0, |
| "reward": 0.5521038770675659, |
| "reward_std": 0.5371196866035461, |
| "rewards/rollout_reward_func/mean": 0.5521038770675659, |
| "rewards/rollout_reward_func/std": 0.5132983922958374, |
| "sampling/importance_sampling_ratio/max": 1.0000168085098267, |
| "sampling/importance_sampling_ratio/mean": 0.999995231628418, |
| "sampling/importance_sampling_ratio/min": 0.9999474883079529, |
| "sampling/sampling_logp_difference/max": 5.6267705076606944e-05, |
| "sampling/sampling_logp_difference/mean": 3.2138741516973823e-06, |
| "step": 289, |
| "step_time": 11.017298460002166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 242.78125, |
| "completions/mean_terminated_length": 242.78125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.603282352874885e-05, |
| "epoch": 0.0058, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.560553159331903e-05, |
| "kl": 5.179336726665497, |
| "learning_rate": 2.531840442704543e-05, |
| "loss": 0.0113, |
| "num_tokens": 10881481.0, |
| "reward": 0.674012303352356, |
| "reward_std": 0.47830620408058167, |
| "rewards/rollout_reward_func/mean": 0.674012303352356, |
| "rewards/rollout_reward_func/std": 0.48066821694374084, |
| "sampling/importance_sampling_ratio/max": 1.0000087022781372, |
| "sampling/importance_sampling_ratio/mean": 0.9999942779541016, |
| "sampling/importance_sampling_ratio/min": 0.9999580383300781, |
| "sampling/sampling_logp_difference/max": 4.435054142959416e-05, |
| "sampling/sampling_logp_difference/mean": 2.450002966725151e-06, |
| "step": 290, |
| "step_time": 10.734974283997872 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 248.59375, |
| "completions/mean_terminated_length": 248.59375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 5.6176058649271e-05, |
| "epoch": 0.00582, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.4972263418021612e-05, |
| "kl": 4.305257134139538, |
| "learning_rate": 2.5263208899493117e-05, |
| "loss": 0.0144, |
| "num_tokens": 10917982.0, |
| "reward": 0.8005915284156799, |
| "reward_std": 0.41739755868911743, |
| "rewards/rollout_reward_func/mean": 0.8005915284156799, |
| "rewards/rollout_reward_func/std": 0.41858813166618347, |
| "sampling/importance_sampling_ratio/max": 1.0000085830688477, |
| "sampling/importance_sampling_ratio/mean": 0.9999980926513672, |
| "sampling/importance_sampling_ratio/min": 0.999978244304657, |
| "sampling/sampling_logp_difference/max": 1.966987292689737e-05, |
| "sampling/sampling_logp_difference/mean": 1.6015185337892035e-06, |
| "step": 291, |
| "step_time": 10.732529017999695 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 705.0, |
| "completions/max_terminated_length": 705.0, |
| "completions/mean_length": 241.875, |
| "completions/mean_terminated_length": 241.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.879591774440087e-05, |
| "epoch": 0.00584, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.2613104849588126e-05, |
| "kl": 5.317075669765472, |
| "learning_rate": 2.5213246666253165e-05, |
| "loss": 0.0006, |
| "num_tokens": 10956087.0, |
| "reward": 0.6459207534790039, |
| "reward_std": 0.4879300892353058, |
| "rewards/rollout_reward_func/mean": 0.6459207534790039, |
| "rewards/rollout_reward_func/std": 0.49300137162208557, |
| "sampling/importance_sampling_ratio/max": 1.0000144243240356, |
| "sampling/importance_sampling_ratio/mean": 0.9999915361404419, |
| "sampling/importance_sampling_ratio/min": 0.9999055862426758, |
| "sampling/sampling_logp_difference/max": 9.120126196648926e-05, |
| "sampling/sampling_logp_difference/mean": 2.860357653844403e-06, |
| "step": 292, |
| "step_time": 10.913091708997854 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 270.90625, |
| "completions/mean_terminated_length": 270.90625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 8.648679309430918e-05, |
| "epoch": 0.00586, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.2432013540528715e-05, |
| "kl": 5.279637806117535, |
| "learning_rate": 2.5168524749065723e-05, |
| "loss": 0.012, |
| "num_tokens": 10995272.0, |
| "reward": 0.7109207510948181, |
| "reward_std": 0.4653802514076233, |
| "rewards/rollout_reward_func/mean": 0.7109207510948181, |
| "rewards/rollout_reward_func/std": 0.4689246416091919, |
| "sampling/importance_sampling_ratio/max": 1.0000152587890625, |
| "sampling/importance_sampling_ratio/mean": 0.999992847442627, |
| "sampling/importance_sampling_ratio/min": 0.9998891353607178, |
| "sampling/sampling_logp_difference/max": 9.370438783662394e-05, |
| "sampling/sampling_logp_difference/mean": 2.75289994533523e-06, |
| "step": 293, |
| "step_time": 10.773192945001028 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 714.0, |
| "completions/max_terminated_length": 714.0, |
| "completions/mean_length": 250.625, |
| "completions/mean_terminated_length": 250.625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 7.307361252628652e-05, |
| "epoch": 0.00588, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2651251381612383e-05, |
| "kl": 4.669016398489475, |
| "learning_rate": 2.5129049433191904e-05, |
| "loss": 0.0081, |
| "num_tokens": 11034020.0, |
| "reward": 0.7430915832519531, |
| "reward_std": 0.461855411529541, |
| "rewards/rollout_reward_func/mean": 0.7430915832519531, |
| "rewards/rollout_reward_func/std": 0.45573315024375916, |
| "sampling/importance_sampling_ratio/max": 1.000016212463379, |
| "sampling/importance_sampling_ratio/mean": 0.9999939203262329, |
| "sampling/importance_sampling_ratio/min": 0.9999682903289795, |
| "sampling/sampling_logp_difference/max": 3.1233001209329814e-05, |
| "sampling/sampling_logp_difference/mean": 2.4066905552899698e-06, |
| "step": 294, |
| "step_time": 11.02961987100025 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 732.0, |
| "completions/max_terminated_length": 732.0, |
| "completions/mean_length": 209.40625, |
| "completions/mean_terminated_length": 209.40625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.992972093020853e-05, |
| "epoch": 0.0059, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 9.169368240691256e-06, |
| "kl": 5.3457541689276695, |
| "learning_rate": 2.509482626653043e-05, |
| "loss": 0.0057, |
| "num_tokens": 11070773.0, |
| "reward": 0.5475000143051147, |
| "reward_std": 0.4948720335960388, |
| "rewards/rollout_reward_func/mean": 0.5475000143051147, |
| "rewards/rollout_reward_func/std": 0.5076383948326111, |
| "sampling/importance_sampling_ratio/max": 1.000032663345337, |
| "sampling/importance_sampling_ratio/mean": 0.9999977946281433, |
| "sampling/importance_sampling_ratio/min": 0.9999710321426392, |
| "sampling/sampling_logp_difference/max": 4.434179572854191e-05, |
| "sampling/sampling_logp_difference/mean": 2.625229171826504e-06, |
| "step": 295, |
| "step_time": 11.026958206001837 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 696.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 166.6875, |
| "completions/mean_terminated_length": 166.6875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 3.824376666727858e-05, |
| "epoch": 0.00592, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7649032088229433e-05, |
| "kl": 4.179639674723148, |
| "learning_rate": 2.506586005883795e-05, |
| "loss": -0.0004, |
| "num_tokens": 11103634.0, |
| "reward": 0.7930915355682373, |
| "reward_std": 0.42395704984664917, |
| "rewards/rollout_reward_func/mean": 0.7930915355682373, |
| "rewards/rollout_reward_func/std": 0.4206209182739258, |
| "sampling/importance_sampling_ratio/max": 1.0000280141830444, |
| "sampling/importance_sampling_ratio/mean": 0.9999994039535522, |
| "sampling/importance_sampling_ratio/min": 0.999958336353302, |
| "sampling/sampling_logp_difference/max": 3.063714029849507e-05, |
| "sampling/sampling_logp_difference/mean": 1.612040250620339e-06, |
| "step": 296, |
| "step_time": 10.490348910001558 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 687.0, |
| "completions/max_terminated_length": 687.0, |
| "completions/mean_length": 258.3125, |
| "completions/mean_terminated_length": 258.3125, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 9.468211831631379e-05, |
| "epoch": 0.00594, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.705732812406495e-05, |
| "kl": 4.647277727723122, |
| "learning_rate": 2.5042154881053053e-05, |
| "loss": -0.0104, |
| "num_tokens": 11142661.0, |
| "reward": 0.8065123558044434, |
| "reward_std": 0.3456747829914093, |
| "rewards/rollout_reward_func/mean": 0.8065123558044434, |
| "rewards/rollout_reward_func/std": 0.42172476649284363, |
| "sampling/importance_sampling_ratio/max": 1.0000272989273071, |
| "sampling/importance_sampling_ratio/mean": 0.9999910593032837, |
| "sampling/importance_sampling_ratio/min": 0.9999492168426514, |
| "sampling/sampling_logp_difference/max": 4.67304780613631e-05, |
| "sampling/sampling_logp_difference/mean": 3.2617106171528576e-06, |
| "step": 297, |
| "step_time": 10.858166022999285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 251.875, |
| "completions/mean_terminated_length": 251.875, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.953562817102465e-05, |
| "epoch": 0.00596, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 6.815532105974853e-05, |
| "kl": 5.288907490670681, |
| "learning_rate": 2.5023714064724153e-05, |
| "loss": -0.0039, |
| "num_tokens": 11181771.0, |
| "reward": 0.5518415570259094, |
| "reward_std": 0.5265346169471741, |
| "rewards/rollout_reward_func/mean": 0.5518415570259094, |
| "rewards/rollout_reward_func/std": 0.5117542147636414, |
| "sampling/importance_sampling_ratio/max": 1.0000152587890625, |
| "sampling/importance_sampling_ratio/mean": 0.9999966621398926, |
| "sampling/importance_sampling_ratio/min": 0.9999724626541138, |
| "sampling/sampling_logp_difference/max": 2.8370497602736577e-05, |
| "sampling/sampling_logp_difference/mean": 2.1110099623911083e-06, |
| "step": 298, |
| "step_time": 10.510601238000163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 741.0, |
| "completions/max_terminated_length": 741.0, |
| "completions/mean_length": 206.4375, |
| "completions/mean_terminated_length": 206.4375, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 6.404310158814042e-05, |
| "epoch": 0.00598, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 4.147389699937776e-05, |
| "kl": 4.263456001877785, |
| "learning_rate": 2.5010540201541244e-05, |
| "loss": -0.0012, |
| "num_tokens": 11217402.0, |
| "reward": 0.798420786857605, |
| "reward_std": 0.3639390766620636, |
| "rewards/rollout_reward_func/mean": 0.798420786857605, |
| "rewards/rollout_reward_func/std": 0.41741809248924255, |
| "sampling/importance_sampling_ratio/max": 1.0000015497207642, |
| "sampling/importance_sampling_ratio/mean": 0.9999944567680359, |
| "sampling/importance_sampling_ratio/min": 0.9999539852142334, |
| "sampling/sampling_logp_difference/max": 5.006824358133599e-05, |
| "sampling/sampling_logp_difference/mean": 2.4876530915207695e-06, |
| "step": 299, |
| "step_time": 11.449250560001019 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 723.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 128.0625, |
| "completions/mean_terminated_length": 128.0625, |
| "completions/min_length": 3.0, |
| "completions/min_terminated_length": 3.0, |
| "entropy": 4.052700117540553e-05, |
| "epoch": 0.006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.4804732422344387e-05, |
| "kl": 4.577869638800621, |
| "learning_rate": 2.5002635142971693e-05, |
| "loss": -0.0049, |
| "num_tokens": 11249250.0, |
| "reward": 0.8246707916259766, |
| "reward_std": 0.33838966488838196, |
| "rewards/rollout_reward_func/mean": 0.8246707916259766, |
| "rewards/rollout_reward_func/std": 0.3964185118675232, |
| "sampling/importance_sampling_ratio/max": 1.0000133514404297, |
| "sampling/importance_sampling_ratio/mean": 0.999998152256012, |
| "sampling/importance_sampling_ratio/min": 0.9999639987945557, |
| "sampling/sampling_logp_difference/max": 3.254435432609171e-05, |
| "sampling/sampling_logp_difference/mean": 1.622052195671131e-06, |
| "step": 300, |
| "step_time": 10.629190601000118 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 300, |
| "num_input_tokens_seen": 11249250, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|