liarsdice_dancik / trainer_state.json
Jordansky's picture
Upload folder using huggingface_hub
9e68c40 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.006,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1124.0,
"completions/max_terminated_length": 1124.0,
"completions/mean_length": 220.03125,
"completions/mean_terminated_length": 220.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10999894605993177,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0684715509414673,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0088,
"num_tokens": 35769.0,
"reward": -0.7897067070007324,
"reward_std": 0.7282192707061768,
"rewards/rollout_reward_func/mean": -0.7897067070007324,
"rewards/rollout_reward_func/std": 0.6921246647834778,
"sampling/importance_sampling_ratio/max": 1.680492877960205,
"sampling/importance_sampling_ratio/mean": 0.8940014839172363,
"sampling/importance_sampling_ratio/min": 0.04773883521556854,
"sampling/sampling_logp_difference/max": 2.8054585456848145,
"sampling/sampling_logp_difference/mean": 0.07180452346801758,
"step": 1,
"step_time": 13.410615189000055
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1152.0,
"completions/max_terminated_length": 1152.0,
"completions/mean_length": 302.0,
"completions/mean_terminated_length": 302.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1047596417774912,
"epoch": 4e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6198148727416992,
"kl": 0.0,
"learning_rate": 2.8571428571428573e-06,
"loss": -0.0135,
"num_tokens": 77064.0,
"reward": -0.7216953039169312,
"reward_std": 0.6422248482704163,
"rewards/rollout_reward_func/mean": -0.7216953039169312,
"rewards/rollout_reward_func/std": 0.7596297860145569,
"sampling/importance_sampling_ratio/max": 2.3478667736053467,
"sampling/importance_sampling_ratio/mean": 0.9927622675895691,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.459986925125122,
"sampling/sampling_logp_difference/mean": 0.0796830952167511,
"step": 2,
"step_time": 13.996805407999773
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.016406250186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037239584140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 854.0,
"completions/max_terminated_length": 854.0,
"completions/mean_length": 280.40625,
"completions/mean_terminated_length": 280.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.10994739399757236,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.830590546131134,
"kl": 0.010026682673075604,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0022,
"num_tokens": 114789.0,
"reward": -0.6636782884597778,
"reward_std": 0.843880832195282,
"rewards/rollout_reward_func/mean": -0.6636782884597778,
"rewards/rollout_reward_func/std": 0.8200815320014954,
"sampling/importance_sampling_ratio/max": 1.9734946489334106,
"sampling/importance_sampling_ratio/mean": 0.9548332095146179,
"sampling/importance_sampling_ratio/min": 0.19213292002677917,
"sampling/sampling_logp_difference/max": 1.5306037664413452,
"sampling/sampling_logp_difference/mean": 0.04678232967853546,
"step": 3,
"step_time": 11.012894956999844
},
{
"clip_ratio/high_max": 0.02142857201397419,
"clip_ratio/high_mean": 0.010714286006987095,
"clip_ratio/low_mean": 0.05390625121071935,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06462053721770644,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1307.0,
"completions/max_terminated_length": 1307.0,
"completions/mean_length": 368.0,
"completions/mean_terminated_length": 368.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11609030631370842,
"epoch": 8e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2315211296081543,
"kl": 0.04723566836028681,
"learning_rate": 8.571428571428573e-06,
"loss": -0.0016,
"num_tokens": 156927.0,
"reward": -0.5773141384124756,
"reward_std": 0.8677605390548706,
"rewards/rollout_reward_func/mean": -0.5773141384124756,
"rewards/rollout_reward_func/std": 0.8631168603897095,
"sampling/importance_sampling_ratio/max": 2.472893238067627,
"sampling/importance_sampling_ratio/mean": 0.9910582900047302,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2368862628936768,
"sampling/sampling_logp_difference/mean": 0.0653323084115982,
"step": 4,
"step_time": 14.133309550000035
},
{
"clip_ratio/high_max": 0.023685516323894262,
"clip_ratio/high_mean": 0.011842758161947131,
"clip_ratio/low_mean": 0.03437500027939677,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046217758441343904,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1653.0,
"completions/max_terminated_length": 1653.0,
"completions/mean_length": 358.71875,
"completions/mean_terminated_length": 358.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11123159667477012,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8210487365722656,
"kl": 0.06365584026783466,
"learning_rate": 1.1428571428571429e-05,
"loss": -0.0616,
"num_tokens": 198606.0,
"reward": -0.51972895860672,
"reward_std": 0.8936069011688232,
"rewards/rollout_reward_func/mean": -0.51972895860672,
"rewards/rollout_reward_func/std": 0.9223779439926147,
"sampling/importance_sampling_ratio/max": 2.422663688659668,
"sampling/importance_sampling_ratio/mean": 1.0740761756896973,
"sampling/importance_sampling_ratio/min": 0.3291013836860657,
"sampling/sampling_logp_difference/max": 1.1113799810409546,
"sampling/sampling_logp_difference/mean": 0.0602106899023056,
"step": 5,
"step_time": 14.35855693699989
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.016666667070239782,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020572917070239782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1054.0,
"completions/max_terminated_length": 1054.0,
"completions/mean_length": 259.46875,
"completions/mean_terminated_length": 259.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09252851491328329,
"epoch": 0.00012,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.394017219543457,
"kl": 0.05397044947139307,
"learning_rate": 1.4285714285714285e-05,
"loss": 0.0026,
"num_tokens": 237513.0,
"reward": -0.9176378846168518,
"reward_std": 0.393246054649353,
"rewards/rollout_reward_func/mean": -0.9176378846168518,
"rewards/rollout_reward_func/std": 0.5164620280265808,
"sampling/importance_sampling_ratio/max": 1.197955846786499,
"sampling/importance_sampling_ratio/mean": 0.9096383452415466,
"sampling/importance_sampling_ratio/min": 0.07589028030633926,
"sampling/sampling_logp_difference/max": 2.950902223587036,
"sampling/sampling_logp_difference/mean": 0.06425637006759644,
"step": 6,
"step_time": 13.00664691299994
},
{
"clip_ratio/high_max": 0.033333334140479565,
"clip_ratio/high_mean": 0.016666667070239782,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029166667256504297,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1530.0,
"completions/max_terminated_length": 1530.0,
"completions/mean_length": 242.15625,
"completions/mean_terminated_length": 242.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08201168113737367,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0027066469192505,
"kl": 0.08505436949724299,
"learning_rate": 1.7142857142857145e-05,
"loss": -0.0065,
"num_tokens": 275059.0,
"reward": -0.7800816893577576,
"reward_std": 0.610554039478302,
"rewards/rollout_reward_func/mean": -0.7800816893577576,
"rewards/rollout_reward_func/std": 0.6927689909934998,
"sampling/importance_sampling_ratio/max": 1.4916099309921265,
"sampling/importance_sampling_ratio/mean": 0.9357227087020874,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6500071287155151,
"sampling/sampling_logp_difference/mean": 0.05492936447262764,
"step": 7,
"step_time": 13.66465330799997
},
{
"clip_ratio/high_max": 0.031250000931322575,
"clip_ratio/high_mean": 0.015625000465661287,
"clip_ratio/low_mean": 0.010156250093132257,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025781250558793545,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1273.0,
"completions/max_terminated_length": 1273.0,
"completions/mean_length": 331.40625,
"completions/mean_terminated_length": 331.40625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10761701926821843,
"epoch": 0.00016,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1414554119110107,
"kl": 0.31695732939988375,
"learning_rate": 2e-05,
"loss": -0.0063,
"num_tokens": 316632.0,
"reward": -0.1924755573272705,
"reward_std": 1.0470077991485596,
"rewards/rollout_reward_func/mean": -0.1924755573272705,
"rewards/rollout_reward_func/std": 1.0184544324874878,
"sampling/importance_sampling_ratio/max": 2.0652823448181152,
"sampling/importance_sampling_ratio/mean": 0.9144597053527832,
"sampling/importance_sampling_ratio/min": 0.11971249431371689,
"sampling/sampling_logp_difference/max": 2.123110055923462,
"sampling/sampling_logp_difference/mean": 0.06788742542266846,
"step": 8,
"step_time": 13.650407897999571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02473958395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02473958395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1164.0,
"completions/max_terminated_length": 1164.0,
"completions/mean_length": 223.5625,
"completions/mean_terminated_length": 223.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.11182238413312007,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2173668146133423,
"kl": 0.5084238715935498,
"learning_rate": 2.2857142857142858e-05,
"loss": -0.0034,
"num_tokens": 351791.0,
"reward": -0.020846828818321228,
"reward_std": 1.0506192445755005,
"rewards/rollout_reward_func/mean": -0.020846828818321228,
"rewards/rollout_reward_func/std": 1.0528043508529663,
"sampling/importance_sampling_ratio/max": 2.8375391960144043,
"sampling/importance_sampling_ratio/mean": 0.9840835332870483,
"sampling/importance_sampling_ratio/min": 0.08557987958192825,
"sampling/sampling_logp_difference/max": 2.453369379043579,
"sampling/sampling_logp_difference/mean": 0.05880027264356613,
"step": 9,
"step_time": 12.694531075999976
},
{
"clip_ratio/high_max": 0.020312500186264515,
"clip_ratio/high_mean": 0.010156250093132257,
"clip_ratio/low_mean": 0.028125000651925802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03828125121071935,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1143.0,
"completions/max_terminated_length": 1143.0,
"completions/mean_length": 246.71875,
"completions/mean_terminated_length": 246.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06009864609222859,
"epoch": 0.0002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3861747086048126,
"kl": 0.5755460254719464,
"learning_rate": 2.5714285714285714e-05,
"loss": 0.0033,
"num_tokens": 389005.0,
"reward": -0.13610242307186127,
"reward_std": 1.0442836284637451,
"rewards/rollout_reward_func/mean": -0.13610242307186127,
"rewards/rollout_reward_func/std": 1.038696050643921,
"sampling/importance_sampling_ratio/max": 2.78231143951416,
"sampling/importance_sampling_ratio/mean": 0.9999732375144958,
"sampling/importance_sampling_ratio/min": 0.27365317940711975,
"sampling/sampling_logp_difference/max": 1.3099734783172607,
"sampling/sampling_logp_difference/mean": 0.04202880337834358,
"step": 10,
"step_time": 12.608456127999943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.016666667070239782,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016666667070239782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1094.0,
"completions/max_terminated_length": 1094.0,
"completions/mean_length": 334.1875,
"completions/mean_terminated_length": 334.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06632963023184857,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2863199710845947,
"kl": 0.7946649645148227,
"learning_rate": 2.857142857142857e-05,
"loss": -0.0114,
"num_tokens": 430612.0,
"reward": 0.002641141414642334,
"reward_std": 1.030980110168457,
"rewards/rollout_reward_func/mean": 0.002641141414642334,
"rewards/rollout_reward_func/std": 1.0341851711273193,
"sampling/importance_sampling_ratio/max": 1.5416392087936401,
"sampling/importance_sampling_ratio/mean": 0.9006525278091431,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.674771785736084,
"sampling/sampling_logp_difference/mean": 0.052167896181344986,
"step": 11,
"step_time": 12.571019431999957
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.027678572107106447,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03392857266589999,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1281.0,
"completions/max_terminated_length": 1281.0,
"completions/mean_length": 327.84375,
"completions/mean_terminated_length": 327.84375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.06426836037917383,
"epoch": 0.00024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9448660612106323,
"kl": 1.1216627955436707,
"learning_rate": 3.142857142857143e-05,
"loss": -0.0031,
"num_tokens": 469849.0,
"reward": 0.44050925970077515,
"reward_std": 0.9405170679092407,
"rewards/rollout_reward_func/mean": 0.44050925970077515,
"rewards/rollout_reward_func/std": 0.9331760406494141,
"sampling/importance_sampling_ratio/max": 1.6956889629364014,
"sampling/importance_sampling_ratio/mean": 0.966093122959137,
"sampling/importance_sampling_ratio/min": 0.10954803228378296,
"sampling/sampling_logp_difference/max": 2.2184529304504395,
"sampling/sampling_logp_difference/mean": 0.06628098338842392,
"step": 12,
"step_time": 13.392358952999984
},
{
"clip_ratio/high_max": 0.02291666716337204,
"clip_ratio/high_mean": 0.01145833358168602,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03437500074505806,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1111.0,
"completions/max_terminated_length": 1111.0,
"completions/mean_length": 263.53125,
"completions/mean_terminated_length": 263.53125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05535798534674541,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7448770999908447,
"kl": 4.385535407811403,
"learning_rate": 3.428571428571429e-05,
"loss": -0.0296,
"num_tokens": 507656.0,
"reward": 0.6070110201835632,
"reward_std": 0.8325015902519226,
"rewards/rollout_reward_func/mean": 0.6070110201835632,
"rewards/rollout_reward_func/std": 0.8162023425102234,
"sampling/importance_sampling_ratio/max": 2.217287302017212,
"sampling/importance_sampling_ratio/mean": 0.8639674782752991,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 5.976809501647949,
"sampling/sampling_logp_difference/mean": 0.1777144819498062,
"step": 13,
"step_time": 12.483832168999925
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.016666667070239782,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1204.0,
"completions/max_terminated_length": 1204.0,
"completions/mean_length": 175.375,
"completions/mean_terminated_length": 175.375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.037813675041661554,
"epoch": 0.00028,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9732633829116821,
"kl": 3.082265551201999,
"learning_rate": 3.7142857142857143e-05,
"loss": -0.0294,
"num_tokens": 541743.0,
"reward": 0.44121330976486206,
"reward_std": 0.9663182497024536,
"rewards/rollout_reward_func/mean": 0.44121330976486206,
"rewards/rollout_reward_func/std": 0.9269620180130005,
"sampling/importance_sampling_ratio/max": 2.2618067264556885,
"sampling/importance_sampling_ratio/mean": 0.9065130352973938,
"sampling/importance_sampling_ratio/min": 0.00045742199290543795,
"sampling/sampling_logp_difference/max": 7.674319744110107,
"sampling/sampling_logp_difference/mean": 0.14372462034225464,
"step": 14,
"step_time": 12.786647417999802
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 608.0,
"completions/max_terminated_length": 608.0,
"completions/mean_length": 137.71875,
"completions/mean_terminated_length": 137.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04335687311379388,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.684906482696533,
"kl": 9.895050249993801,
"learning_rate": 4e-05,
"loss": -0.0221,
"num_tokens": 574790.0,
"reward": 0.4522267282009125,
"reward_std": 0.7068161964416504,
"rewards/rollout_reward_func/mean": 0.4522267282009125,
"rewards/rollout_reward_func/std": 0.8777400851249695,
"sampling/importance_sampling_ratio/max": 2.254448890686035,
"sampling/importance_sampling_ratio/mean": 0.9891846179962158,
"sampling/importance_sampling_ratio/min": 0.0026889105793088675,
"sampling/sampling_logp_difference/max": 5.908565044403076,
"sampling/sampling_logp_difference/mean": 0.12002458423376083,
"step": 15,
"step_time": 10.161046653999847
},
{
"clip_ratio/high_max": 0.020312500186264515,
"clip_ratio/high_mean": 0.010156250093132257,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010156250093132257,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1153.0,
"completions/max_terminated_length": 1153.0,
"completions/mean_length": 235.75,
"completions/mean_terminated_length": 235.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.07614646388719848,
"epoch": 0.00032,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6830127239227295,
"kl": 2.908422634471208,
"learning_rate": 4.2857142857142856e-05,
"loss": -0.0083,
"num_tokens": 612382.0,
"reward": 0.7016310095787048,
"reward_std": 0.6054160594940186,
"rewards/rollout_reward_func/mean": 0.7016310095787048,
"rewards/rollout_reward_func/std": 0.7000784277915955,
"sampling/importance_sampling_ratio/max": 1.1297085285186768,
"sampling/importance_sampling_ratio/mean": 0.8337504863739014,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 6.672256946563721,
"sampling/sampling_logp_difference/mean": 0.1192205548286438,
"step": 16,
"step_time": 12.473349069999585
},
{
"clip_ratio/high_max": 0.028125000186264515,
"clip_ratio/high_mean": 0.014062500093132257,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020312499720603228,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1184.0,
"completions/max_terminated_length": 1184.0,
"completions/mean_length": 272.1875,
"completions/mean_terminated_length": 272.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07187356776557863,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0832687616348267,
"kl": 2.5055846490431577,
"learning_rate": 4.5714285714285716e-05,
"loss": -0.0186,
"num_tokens": 652165.0,
"reward": 0.42282533645629883,
"reward_std": 0.9263465404510498,
"rewards/rollout_reward_func/mean": 0.42282533645629883,
"rewards/rollout_reward_func/std": 0.9135696291923523,
"sampling/importance_sampling_ratio/max": 1.9455113410949707,
"sampling/importance_sampling_ratio/mean": 0.8885751962661743,
"sampling/importance_sampling_ratio/min": 0.020396223291754723,
"sampling/sampling_logp_difference/max": 3.8924026489257812,
"sampling/sampling_logp_difference/mean": 0.10638131201267242,
"step": 17,
"step_time": 12.625183045000085
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.04062500037252903,
"clip_ratio/low_min": 0.012500000186264515,
"clip_ratio/region_mean": 0.04062500037252903,
"completions/clipped_ratio": 0.0,
"completions/max_length": 638.0,
"completions/max_terminated_length": 638.0,
"completions/mean_length": 209.0,
"completions/mean_terminated_length": 209.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08415665684276519,
"epoch": 0.00036,
"frac_reward_zero_std": 0.0,
"grad_norm": 177.4296875,
"kl": 69.13030137866735,
"learning_rate": 4.8571428571428576e-05,
"loss": 0.0815,
"num_tokens": 688568.0,
"reward": 0.1535860300064087,
"reward_std": 1.023172378540039,
"rewards/rollout_reward_func/mean": 0.1535860300064087,
"rewards/rollout_reward_func/std": 1.0038524866104126,
"sampling/importance_sampling_ratio/max": 1.140038251876831,
"sampling/importance_sampling_ratio/mean": 0.7489842176437378,
"sampling/importance_sampling_ratio/min": 0.0006028485368005931,
"sampling/sampling_logp_difference/max": 6.032342433929443,
"sampling/sampling_logp_difference/mean": 0.2542610764503479,
"step": 18,
"step_time": 10.28650643099968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.014322916977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014322916977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 914.0,
"completions/max_terminated_length": 914.0,
"completions/mean_length": 284.375,
"completions/mean_terminated_length": 284.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.08210387951271514,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.229077935218811,
"kl": 3.3837516829371452,
"learning_rate": 5.142857142857143e-05,
"loss": -0.0122,
"num_tokens": 727121.0,
"reward": 0.41616448760032654,
"reward_std": 0.7233636975288391,
"rewards/rollout_reward_func/mean": 0.41616448760032654,
"rewards/rollout_reward_func/std": 0.9268736243247986,
"sampling/importance_sampling_ratio/max": 2.5834200382232666,
"sampling/importance_sampling_ratio/mean": 0.9522545337677002,
"sampling/importance_sampling_ratio/min": 0.038529153913259506,
"sampling/sampling_logp_difference/max": 2.787929058074951,
"sampling/sampling_logp_difference/mean": 0.10299322754144669,
"step": 19,
"step_time": 11.579511369000329
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029166667256504297,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1124.0,
"completions/max_terminated_length": 1124.0,
"completions/mean_length": 230.0625,
"completions/mean_terminated_length": 230.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.11127446611544656,
"epoch": 0.0004,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9922760725021362,
"kl": 3.5128636844456196,
"learning_rate": 5.428571428571428e-05,
"loss": -0.0049,
"num_tokens": 764269.0,
"reward": 0.21909433603286743,
"reward_std": 0.9744149446487427,
"rewards/rollout_reward_func/mean": 0.21909433603286743,
"rewards/rollout_reward_func/std": 1.0001939535140991,
"sampling/importance_sampling_ratio/max": 1.926369547843933,
"sampling/importance_sampling_ratio/mean": 0.9654070734977722,
"sampling/importance_sampling_ratio/min": 0.12186437845230103,
"sampling/sampling_logp_difference/max": 3.6394667625427246,
"sampling/sampling_logp_difference/mean": 0.0875457376241684,
"step": 20,
"step_time": 12.297075339999765
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 198.5625,
"completions/mean_terminated_length": 198.5625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.045957784245274524,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33694905042648315,
"kl": 2.610730357468128,
"learning_rate": 5.714285714285714e-05,
"loss": -0.0197,
"num_tokens": 801033.0,
"reward": 0.48597443103790283,
"reward_std": 0.9212241172790527,
"rewards/rollout_reward_func/mean": 0.48597443103790283,
"rewards/rollout_reward_func/std": 0.8926963210105896,
"sampling/importance_sampling_ratio/max": 1.204205870628357,
"sampling/importance_sampling_ratio/mean": 0.9019367694854736,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5509400367736816,
"sampling/sampling_logp_difference/mean": 0.03802880644798279,
"step": 21,
"step_time": 12.204463460999932
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 578.0,
"completions/max_terminated_length": 578.0,
"completions/mean_length": 168.46875,
"completions/mean_terminated_length": 168.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07223565122012587,
"epoch": 0.00044,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6332337856292725,
"kl": 3.1288448721170425,
"learning_rate": 6e-05,
"loss": -0.0054,
"num_tokens": 836429.0,
"reward": 0.31113743782043457,
"reward_std": 0.9805623292922974,
"rewards/rollout_reward_func/mean": 0.31113743782043457,
"rewards/rollout_reward_func/std": 0.9535738229751587,
"sampling/importance_sampling_ratio/max": 1.7136268615722656,
"sampling/importance_sampling_ratio/mean": 0.9599097967147827,
"sampling/importance_sampling_ratio/min": 0.3510478734970093,
"sampling/sampling_logp_difference/max": 1.039665699005127,
"sampling/sampling_logp_difference/mean": 0.0384407714009285,
"step": 22,
"step_time": 10.721943561000444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.016666667070239782,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016666667070239782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1273.0,
"completions/max_terminated_length": 1273.0,
"completions/mean_length": 274.90625,
"completions/mean_terminated_length": 274.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07087159495586093,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6439398527145386,
"kl": 3.9721709862351418,
"learning_rate": 6.285714285714286e-05,
"loss": -0.0091,
"num_tokens": 875727.0,
"reward": 0.23191100358963013,
"reward_std": 0.9855490922927856,
"rewards/rollout_reward_func/mean": 0.23191100358963013,
"rewards/rollout_reward_func/std": 0.9528393745422363,
"sampling/importance_sampling_ratio/max": 1.7258304357528687,
"sampling/importance_sampling_ratio/mean": 0.8456529378890991,
"sampling/importance_sampling_ratio/min": 0.08970843255519867,
"sampling/sampling_logp_difference/max": 1.6682171821594238,
"sampling/sampling_logp_difference/mean": 0.08408902585506439,
"step": 23,
"step_time": 12.50699520400076
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1014.0,
"completions/max_terminated_length": 1014.0,
"completions/mean_length": 233.0,
"completions/mean_terminated_length": 233.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.057429388416494476,
"epoch": 0.00048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2117566019296646,
"kl": 2.7027904614806175,
"learning_rate": 6.571428571428571e-05,
"loss": -0.0072,
"num_tokens": 912356.0,
"reward": 0.7661426067352295,
"reward_std": 0.4510638415813446,
"rewards/rollout_reward_func/mean": 0.7661426067352295,
"rewards/rollout_reward_func/std": 0.6789449453353882,
"sampling/importance_sampling_ratio/max": 1.2537685632705688,
"sampling/importance_sampling_ratio/mean": 0.9261639714241028,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.0963659286499023,
"sampling/sampling_logp_difference/mean": 0.0501352995634079,
"step": 24,
"step_time": 12.994097786999873
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 864.0,
"completions/max_terminated_length": 864.0,
"completions/mean_length": 284.15625,
"completions/mean_terminated_length": 284.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.07915750662141363,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2489689290523529,
"kl": 3.51483316719532,
"learning_rate": 6.857142857142858e-05,
"loss": -0.0141,
"num_tokens": 951303.0,
"reward": 0.6401481628417969,
"reward_std": 0.7306121587753296,
"rewards/rollout_reward_func/mean": 0.6401481628417969,
"rewards/rollout_reward_func/std": 0.7182337045669556,
"sampling/importance_sampling_ratio/max": 1.7077192068099976,
"sampling/importance_sampling_ratio/mean": 0.9699938893318176,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2909077405929565,
"sampling/sampling_logp_difference/mean": 0.06180023029446602,
"step": 25,
"step_time": 11.09379838199925
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016666667070239782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1052.0,
"completions/max_terminated_length": 1052.0,
"completions/mean_length": 248.09375,
"completions/mean_terminated_length": 248.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06904788634801662,
"epoch": 0.00052,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.426376074552536,
"kl": 3.752748064696789,
"learning_rate": 7.142857142857143e-05,
"loss": -0.0171,
"num_tokens": 989455.0,
"reward": 0.5763981938362122,
"reward_std": 0.8214547634124756,
"rewards/rollout_reward_func/mean": 0.5763981938362122,
"rewards/rollout_reward_func/std": 0.8107219934463501,
"sampling/importance_sampling_ratio/max": 1.2095879316329956,
"sampling/importance_sampling_ratio/mean": 0.9170844554901123,
"sampling/importance_sampling_ratio/min": 0.17924457788467407,
"sampling/sampling_logp_difference/max": 1.90674889087677,
"sampling/sampling_logp_difference/mean": 0.06156245991587639,
"step": 26,
"step_time": 12.128414304999978
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 913.0,
"completions/max_terminated_length": 913.0,
"completions/mean_length": 238.5,
"completions/mean_terminated_length": 238.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.047321546943749127,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13649658858776093,
"kl": 3.7867856696248055,
"learning_rate": 7.428571428571429e-05,
"loss": 0.0024,
"num_tokens": 1026966.0,
"reward": 0.5796800851821899,
"reward_std": 0.5545504093170166,
"rewards/rollout_reward_func/mean": 0.5796800851821899,
"rewards/rollout_reward_func/std": 0.7230707406997681,
"sampling/importance_sampling_ratio/max": 1.8340015411376953,
"sampling/importance_sampling_ratio/mean": 0.9975396990776062,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.570065975189209,
"sampling/sampling_logp_difference/mean": 0.04671240225434303,
"step": 27,
"step_time": 11.592925249000245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 695.0,
"completions/max_terminated_length": 695.0,
"completions/mean_length": 213.03125,
"completions/mean_terminated_length": 213.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.02343887006645673,
"epoch": 0.00056,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.22743558883667,
"kl": 15.115263469517231,
"learning_rate": 7.714285714285715e-05,
"loss": 0.019,
"num_tokens": 1062974.0,
"reward": 0.7336158752441406,
"reward_std": 0.5854325294494629,
"rewards/rollout_reward_func/mean": 0.7336158752441406,
"rewards/rollout_reward_func/std": 0.5881174206733704,
"sampling/importance_sampling_ratio/max": 1.5286439657211304,
"sampling/importance_sampling_ratio/mean": 1.003950834274292,
"sampling/importance_sampling_ratio/min": 0.12551912665367126,
"sampling/sampling_logp_difference/max": 1.9951891899108887,
"sampling/sampling_logp_difference/mean": 0.025340761989355087,
"step": 28,
"step_time": 10.574544273999663
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 677.0,
"completions/max_terminated_length": 677.0,
"completions/mean_length": 202.3125,
"completions/mean_terminated_length": 202.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.021139492744623567,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06938721239566803,
"kl": 3.8235014770179987,
"learning_rate": 8e-05,
"loss": -0.0084,
"num_tokens": 1099842.0,
"reward": 0.7044858932495117,
"reward_std": 0.3917056620121002,
"rewards/rollout_reward_func/mean": 0.7044858932495117,
"rewards/rollout_reward_func/std": 0.4747315049171448,
"sampling/importance_sampling_ratio/max": 1.07483971118927,
"sampling/importance_sampling_ratio/mean": 1.0014500617980957,
"sampling/importance_sampling_ratio/min": 0.9500851035118103,
"sampling/sampling_logp_difference/max": 0.12000381201505661,
"sampling/sampling_logp_difference/mean": 0.004348148591816425,
"step": 29,
"step_time": 10.580135002999441
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 914.0,
"completions/max_terminated_length": 914.0,
"completions/mean_length": 231.90625,
"completions/mean_terminated_length": 231.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05673399774605059,
"epoch": 0.0006,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9880616664886475,
"kl": 22.03186371177435,
"learning_rate": 8.285714285714287e-05,
"loss": 0.0294,
"num_tokens": 1136891.0,
"reward": 0.7692453861236572,
"reward_std": 0.40567266941070557,
"rewards/rollout_reward_func/mean": 0.7692453861236572,
"rewards/rollout_reward_func/std": 0.5713350176811218,
"sampling/importance_sampling_ratio/max": 1.5409276485443115,
"sampling/importance_sampling_ratio/mean": 0.9707353115081787,
"sampling/importance_sampling_ratio/min": 0.3624197244644165,
"sampling/sampling_logp_difference/max": 1.0131800174713135,
"sampling/sampling_logp_difference/mean": 0.02174563892185688,
"step": 30,
"step_time": 11.996624365999878
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010156250093132257,
"completions/clipped_ratio": 0.0,
"completions/max_length": 930.0,
"completions/max_terminated_length": 930.0,
"completions/mean_length": 248.90625,
"completions/mean_terminated_length": 248.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.16393944306037156,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5357746481895447,
"kl": 3.937878467142582,
"learning_rate": 8.571428571428571e-05,
"loss": 0.014,
"num_tokens": 1175169.0,
"reward": 0.357496440410614,
"reward_std": 0.7932678461074829,
"rewards/rollout_reward_func/mean": 0.357496440410614,
"rewards/rollout_reward_func/std": 0.7959622144699097,
"sampling/importance_sampling_ratio/max": 2.3141262531280518,
"sampling/importance_sampling_ratio/mean": 0.9407311677932739,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5570695400238037,
"sampling/sampling_logp_difference/mean": 0.04863358661532402,
"step": 31,
"step_time": 11.595104513999786
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1023.0,
"completions/max_terminated_length": 1023.0,
"completions/mean_length": 243.15625,
"completions/mean_terminated_length": 243.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.18836299496615538,
"epoch": 0.00064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19593629240989685,
"kl": 3.839852336794138,
"learning_rate": 8.857142857142857e-05,
"loss": -0.0085,
"num_tokens": 1213031.0,
"reward": 0.7341962456703186,
"reward_std": 0.5838453769683838,
"rewards/rollout_reward_func/mean": 0.7341962456703186,
"rewards/rollout_reward_func/std": 0.5884881019592285,
"sampling/importance_sampling_ratio/max": 1.9368575811386108,
"sampling/importance_sampling_ratio/mean": 0.998816728591919,
"sampling/importance_sampling_ratio/min": 0.486605167388916,
"sampling/sampling_logp_difference/max": 0.6482534408569336,
"sampling/sampling_logp_difference/mean": 0.05510255694389343,
"step": 32,
"step_time": 13.026593125999852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1079.0,
"completions/max_terminated_length": 1079.0,
"completions/mean_length": 180.9375,
"completions/mean_terminated_length": 180.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.19241300441353815,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21624083817005157,
"kl": 3.650024376809597,
"learning_rate": 9.142857142857143e-05,
"loss": -0.0102,
"num_tokens": 1246957.0,
"reward": 0.6980147361755371,
"reward_std": 0.6458718776702881,
"rewards/rollout_reward_func/mean": 0.6980147361755371,
"rewards/rollout_reward_func/std": 0.6438339352607727,
"sampling/importance_sampling_ratio/max": 1.5693196058273315,
"sampling/importance_sampling_ratio/mean": 1.0197675228118896,
"sampling/importance_sampling_ratio/min": 0.33508750796318054,
"sampling/sampling_logp_difference/max": 0.8358498811721802,
"sampling/sampling_logp_difference/mean": 0.046664539724588394,
"step": 33,
"step_time": 11.550154542999962
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 668.0,
"completions/max_terminated_length": 668.0,
"completions/mean_length": 252.46875,
"completions/mean_terminated_length": 252.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3369806137343403,
"epoch": 0.00068,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24248099327087402,
"kl": 4.222835153341293,
"learning_rate": 9.428571428571429e-05,
"loss": -0.0043,
"num_tokens": 1286249.0,
"reward": 0.5828893184661865,
"reward_std": 0.6884247064590454,
"rewards/rollout_reward_func/mean": 0.5828893184661865,
"rewards/rollout_reward_func/std": 0.7157434821128845,
"sampling/importance_sampling_ratio/max": 2.1673381328582764,
"sampling/importance_sampling_ratio/mean": 0.952195405960083,
"sampling/importance_sampling_ratio/min": 0.5653239488601685,
"sampling/sampling_logp_difference/max": 0.8169012069702148,
"sampling/sampling_logp_difference/mean": 0.047646619379520416,
"step": 34,
"step_time": 11.025269679999838
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 548.0,
"completions/max_terminated_length": 548.0,
"completions/mean_length": 174.75,
"completions/mean_terminated_length": 174.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.26513355743372813,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12303785234689713,
"kl": 4.241589665412903,
"learning_rate": 9.714285714285715e-05,
"loss": -0.0109,
"num_tokens": 1322454.0,
"reward": 0.5774887204170227,
"reward_std": 0.5838077068328857,
"rewards/rollout_reward_func/mean": 0.5774887204170227,
"rewards/rollout_reward_func/std": 0.7217473387718201,
"sampling/importance_sampling_ratio/max": 2.0283899307250977,
"sampling/importance_sampling_ratio/mean": 1.0524414777755737,
"sampling/importance_sampling_ratio/min": 0.8173112273216248,
"sampling/sampling_logp_difference/max": 0.7269496917724609,
"sampling/sampling_logp_difference/mean": 0.027478456497192383,
"step": 35,
"step_time": 9.609598415999699
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1031.0,
"completions/max_terminated_length": 1031.0,
"completions/mean_length": 243.75,
"completions/mean_terminated_length": 243.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.2274461947963573,
"epoch": 0.00072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4030201733112335,
"kl": 4.836214572191238,
"learning_rate": 0.0001,
"loss": -0.0031,
"num_tokens": 1360418.0,
"reward": 0.45134004950523376,
"reward_std": 0.6674649119377136,
"rewards/rollout_reward_func/mean": 0.45134004950523376,
"rewards/rollout_reward_func/std": 0.6721746325492859,
"sampling/importance_sampling_ratio/max": 1.166141390800476,
"sampling/importance_sampling_ratio/mean": 0.9575443863868713,
"sampling/importance_sampling_ratio/min": 0.4807126522064209,
"sampling/sampling_logp_difference/max": 0.5672388076782227,
"sampling/sampling_logp_difference/mean": 0.02744535729289055,
"step": 36,
"step_time": 12.980629156000077
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 558.0,
"completions/max_terminated_length": 558.0,
"completions/mean_length": 195.125,
"completions/mean_terminated_length": 195.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.17616549076046795,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1173086166381836,
"kl": 3.182346645742655,
"learning_rate": 9.999736485702831e-05,
"loss": -0.0185,
"num_tokens": 1396929.0,
"reward": 0.8315432071685791,
"reward_std": 0.3220025300979614,
"rewards/rollout_reward_func/mean": 0.8315432071685791,
"rewards/rollout_reward_func/std": 0.40297287702560425,
"sampling/importance_sampling_ratio/max": 1.438734769821167,
"sampling/importance_sampling_ratio/mean": 1.0194497108459473,
"sampling/importance_sampling_ratio/min": 0.805536150932312,
"sampling/sampling_logp_difference/max": 0.3535919189453125,
"sampling/sampling_logp_difference/mean": 0.011818947270512581,
"step": 37,
"step_time": 9.724833724999371
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1022.0,
"completions/max_terminated_length": 1022.0,
"completions/mean_length": 309.0,
"completions/mean_terminated_length": 309.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.22344195400364697,
"epoch": 0.00076,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27476319670677185,
"kl": 4.308917306363583,
"learning_rate": 9.998945979845876e-05,
"loss": 0.0127,
"num_tokens": 1437079.0,
"reward": 0.6767849922180176,
"reward_std": 0.5794224143028259,
"rewards/rollout_reward_func/mean": 0.6767849922180176,
"rewards/rollout_reward_func/std": 0.6077510714530945,
"sampling/importance_sampling_ratio/max": 1.5739073753356934,
"sampling/importance_sampling_ratio/mean": 1.0125946998596191,
"sampling/importance_sampling_ratio/min": 0.7139722108840942,
"sampling/sampling_logp_difference/max": 0.46108484268188477,
"sampling/sampling_logp_difference/mean": 0.02042299136519432,
"step": 38,
"step_time": 12.233769962999759
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 608.0,
"completions/max_terminated_length": 608.0,
"completions/mean_length": 232.28125,
"completions/mean_terminated_length": 232.28125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.21799559774808586,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.292609840631485,
"kl": 3.502250585705042,
"learning_rate": 9.997628593527586e-05,
"loss": -0.0016,
"num_tokens": 1475190.0,
"reward": 0.6436625123023987,
"reward_std": 0.7471475005149841,
"rewards/rollout_reward_func/mean": 0.6436625123023987,
"rewards/rollout_reward_func/std": 0.7162776589393616,
"sampling/importance_sampling_ratio/max": 1.1646549701690674,
"sampling/importance_sampling_ratio/mean": 0.9416862726211548,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2828545570373535,
"sampling/sampling_logp_difference/mean": 0.02604352869093418,
"step": 39,
"step_time": 10.37775418499973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 658.0,
"completions/max_terminated_length": 658.0,
"completions/mean_length": 186.8125,
"completions/mean_terminated_length": 186.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.1057002441957593,
"epoch": 0.0008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6740612387657166,
"kl": 4.765574503690004,
"learning_rate": 9.995784511894694e-05,
"loss": -0.0092,
"num_tokens": 1510583.0,
"reward": 0.5121347904205322,
"reward_std": 0.5755149126052856,
"rewards/rollout_reward_func/mean": 0.5121347904205322,
"rewards/rollout_reward_func/std": 0.5685732364654541,
"sampling/importance_sampling_ratio/max": 2.6313016414642334,
"sampling/importance_sampling_ratio/mean": 1.0894813537597656,
"sampling/importance_sampling_ratio/min": 0.952610433101654,
"sampling/sampling_logp_difference/max": 0.9827308654785156,
"sampling/sampling_logp_difference/mean": 0.023295916616916656,
"step": 40,
"step_time": 10.425912250000465
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1387.0,
"completions/max_terminated_length": 1387.0,
"completions/mean_length": 186.34375,
"completions/mean_terminated_length": 186.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.15000496682478115,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2697708010673523,
"kl": 4.247643496841192,
"learning_rate": 9.993413994116206e-05,
"loss": -0.0199,
"num_tokens": 1545590.0,
"reward": 0.6390933990478516,
"reward_std": 0.6065553426742554,
"rewards/rollout_reward_func/mean": 0.6390933990478516,
"rewards/rollout_reward_func/std": 0.6180797815322876,
"sampling/importance_sampling_ratio/max": 2.1225719451904297,
"sampling/importance_sampling_ratio/mean": 0.9938382506370544,
"sampling/importance_sampling_ratio/min": 0.4029119610786438,
"sampling/sampling_logp_difference/max": 0.9079653024673462,
"sampling/sampling_logp_difference/mean": 0.04717833548784256,
"step": 41,
"step_time": 13.721549190000815
},
{
"clip_ratio/high_max": 0.0017857142956927419,
"clip_ratio/high_mean": 0.0008928571478463709,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0008928571478463709,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 178.8125,
"completions/mean_terminated_length": 178.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.09959755872841924,
"epoch": 0.00084,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18356552720069885,
"kl": 4.275203719735146,
"learning_rate": 9.990517373346957e-05,
"loss": 0.0207,
"num_tokens": 1580072.0,
"reward": 0.670920729637146,
"reward_std": 0.49966514110565186,
"rewards/rollout_reward_func/mean": 0.670920729637146,
"rewards/rollout_reward_func/std": 0.47837620973587036,
"sampling/importance_sampling_ratio/max": 1.096825122833252,
"sampling/importance_sampling_ratio/mean": 0.9490351676940918,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.6992363929748535,
"sampling/sampling_logp_difference/mean": 0.018723629415035248,
"step": 42,
"step_time": 13.593521437000163
},
{
"clip_ratio/high_max": 0.012202381272800267,
"clip_ratio/high_mean": 0.006101190636400133,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006101190636400133,
"completions/clipped_ratio": 0.0,
"completions/max_length": 579.0,
"completions/max_terminated_length": 579.0,
"completions/mean_length": 125.875,
"completions/mean_terminated_length": 125.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.13518582028336823,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5788092613220215,
"kl": 4.227362431585789,
"learning_rate": 9.98709505668081e-05,
"loss": -0.016,
"num_tokens": 1612885.0,
"reward": 0.7634040117263794,
"reward_std": 0.4348216652870178,
"rewards/rollout_reward_func/mean": 0.7634040117263794,
"rewards/rollout_reward_func/std": 0.4481413960456848,
"sampling/importance_sampling_ratio/max": 1.1939719915390015,
"sampling/importance_sampling_ratio/mean": 0.9179114699363708,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.5640859603881836,
"sampling/sampling_logp_difference/mean": 0.029528968036174774,
"step": 43,
"step_time": 12.544377819000147
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 236.71875,
"completions/mean_terminated_length": 236.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.21137235511559993,
"epoch": 0.00088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37624236941337585,
"kl": 5.903023138642311,
"learning_rate": 9.983147525093428e-05,
"loss": -0.0268,
"num_tokens": 1651371.0,
"reward": 0.6768415570259094,
"reward_std": 0.49680012464523315,
"rewards/rollout_reward_func/mean": 0.6768415570259094,
"rewards/rollout_reward_func/std": 0.484732449054718,
"sampling/importance_sampling_ratio/max": 1.658105731010437,
"sampling/importance_sampling_ratio/mean": 1.0146586894989014,
"sampling/importance_sampling_ratio/min": 0.6395002603530884,
"sampling/sampling_logp_difference/max": 0.44228410720825195,
"sampling/sampling_logp_difference/mean": 0.032690562307834625,
"step": 44,
"step_time": 13.895338261000461
},
{
"clip_ratio/high_max": 0.010714286123402417,
"clip_ratio/high_mean": 0.005357143061701208,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005357143061701208,
"completions/clipped_ratio": 0.0,
"completions/max_length": 629.0,
"completions/max_terminated_length": 629.0,
"completions/mean_length": 265.65625,
"completions/mean_terminated_length": 265.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.34391735191456974,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2791600823402405,
"kl": 4.59170226752758,
"learning_rate": 9.978675333374685e-05,
"loss": -0.0415,
"num_tokens": 1690386.0,
"reward": 0.8033539056777954,
"reward_std": 0.4299496114253998,
"rewards/rollout_reward_func/mean": 0.8033539056777954,
"rewards/rollout_reward_func/std": 0.42693132162094116,
"sampling/importance_sampling_ratio/max": 1.4508812427520752,
"sampling/importance_sampling_ratio/mean": 0.9076820015907288,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.5162277221679688,
"sampling/sampling_logp_difference/mean": 0.043557487428188324,
"step": 45,
"step_time": 12.850337384999875
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 231.5625,
"completions/mean_terminated_length": 238.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.20788781438022852,
"epoch": 0.00092,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.45684507489204407,
"kl": 4.819624692201614,
"learning_rate": 9.973679110050689e-05,
"loss": 0.0593,
"num_tokens": 1727749.0,
"reward": 0.5148816704750061,
"reward_std": 0.5777592658996582,
"rewards/rollout_reward_func/mean": 0.5148816704750061,
"rewards/rollout_reward_func/std": 0.5782047510147095,
"sampling/importance_sampling_ratio/max": 1.0983736515045166,
"sampling/importance_sampling_ratio/mean": 0.945776104927063,
"sampling/importance_sampling_ratio/min": 0.5659478306770325,
"sampling/sampling_logp_difference/max": 0.5768914222717285,
"sampling/sampling_logp_difference/mean": 0.0224138256162405,
"step": 46,
"step_time": 14.142768939000007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 192.5625,
"completions/mean_terminated_length": 197.74192810058594,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.20947814546525478,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9501240849494934,
"kl": 4.3865058943629265,
"learning_rate": 9.968159557295458e-05,
"loss": 0.1084,
"num_tokens": 1762294.0,
"reward": 0.7052456140518188,
"reward_std": 0.5941587686538696,
"rewards/rollout_reward_func/mean": 0.7052456140518188,
"rewards/rollout_reward_func/std": 0.5893100500106812,
"sampling/importance_sampling_ratio/max": 1.3676362037658691,
"sampling/importance_sampling_ratio/mean": 0.9286255836486816,
"sampling/importance_sampling_ratio/min": 0.6919541954994202,
"sampling/sampling_logp_difference/max": 0.6358003616333008,
"sampling/sampling_logp_difference/mean": 0.03289835900068283,
"step": 47,
"step_time": 12.296653302000095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 997.0,
"completions/max_terminated_length": 997.0,
"completions/mean_length": 231.25,
"completions/mean_terminated_length": 231.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.10634760372340679,
"epoch": 0.00096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5390028953552246,
"kl": 4.756238490343094,
"learning_rate": 9.962117450832225e-05,
"loss": -0.001,
"num_tokens": 1798402.0,
"reward": 0.7353510856628418,
"reward_std": 0.542205810546875,
"rewards/rollout_reward_func/mean": 0.7353510856628418,
"rewards/rollout_reward_func/std": 0.6517592072486877,
"sampling/importance_sampling_ratio/max": 2.089435577392578,
"sampling/importance_sampling_ratio/mean": 0.9923274517059326,
"sampling/importance_sampling_ratio/min": 0.5558032989501953,
"sampling/sampling_logp_difference/max": 0.7373199462890625,
"sampling/sampling_logp_difference/mean": 0.030953995883464813,
"step": 48,
"step_time": 14.2758760239999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 237.375,
"completions/mean_terminated_length": 237.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0647038493771106,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09670515358448029,
"kl": 4.974646285176277,
"learning_rate": 9.955553639824423e-05,
"loss": -0.0057,
"num_tokens": 1836399.0,
"reward": 0.6488252282142639,
"reward_std": 0.6594193577766418,
"rewards/rollout_reward_func/mean": 0.6488252282142639,
"rewards/rollout_reward_func/std": 0.6682618856430054,
"sampling/importance_sampling_ratio/max": 1.2080090045928955,
"sampling/importance_sampling_ratio/mean": 1.002594232559204,
"sampling/importance_sampling_ratio/min": 0.8824653625488281,
"sampling/sampling_logp_difference/max": 0.18907243013381958,
"sampling/sampling_logp_difference/mean": 0.007402781397104263,
"step": 49,
"step_time": 12.638315505999799
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 285.59375,
"completions/mean_terminated_length": 285.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.050345817173365504,
"epoch": 0.001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07216052711009979,
"kl": 5.428681015968323,
"learning_rate": 9.948469046756344e-05,
"loss": 0.0006,
"num_tokens": 1875978.0,
"reward": 0.6179745197296143,
"reward_std": 0.5279039740562439,
"rewards/rollout_reward_func/mean": 0.6179745197296143,
"rewards/rollout_reward_func/std": 0.5638126730918884,
"sampling/importance_sampling_ratio/max": 1.2487702369689941,
"sampling/importance_sampling_ratio/mean": 0.974506139755249,
"sampling/importance_sampling_ratio/min": 0.6416374444961548,
"sampling/sampling_logp_difference/max": 0.4437246322631836,
"sampling/sampling_logp_difference/mean": 0.01099600363522768,
"step": 50,
"step_time": 13.79704612400019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 189.34375,
"completions/mean_terminated_length": 189.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.044075825016989256,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16175970435142517,
"kl": 4.5743416622281075,
"learning_rate": 9.940864667303489e-05,
"loss": 0.001,
"num_tokens": 1910574.0,
"reward": 0.67287677526474,
"reward_std": 0.5520733594894409,
"rewards/rollout_reward_func/mean": 0.67287677526474,
"rewards/rollout_reward_func/std": 0.5455199480056763,
"sampling/importance_sampling_ratio/max": 1.0763746500015259,
"sampling/importance_sampling_ratio/mean": 0.9990264773368835,
"sampling/importance_sampling_ratio/min": 0.9405080676078796,
"sampling/sampling_logp_difference/max": 0.07361352443695068,
"sampling/sampling_logp_difference/mean": 0.0017119484255090356,
"step": 51,
"step_time": 12.270658274999732
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 298.96875,
"completions/mean_terminated_length": 298.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.05881836503976956,
"epoch": 0.00104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.454397052526474,
"kl": 5.88704876601696,
"learning_rate": 9.932741570192633e-05,
"loss": -0.003,
"num_tokens": 1951224.0,
"reward": 0.5557973384857178,
"reward_std": 0.7397861480712891,
"rewards/rollout_reward_func/mean": 0.5557973384857178,
"rewards/rollout_reward_func/std": 0.7254597544670105,
"sampling/importance_sampling_ratio/max": 1.504248023033142,
"sampling/importance_sampling_ratio/mean": 1.0347909927368164,
"sampling/importance_sampling_ratio/min": 0.7870653867721558,
"sampling/sampling_logp_difference/max": 0.4082956314086914,
"sampling/sampling_logp_difference/mean": 0.00977290328592062,
"step": 52,
"step_time": 13.339979744000175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 245.40625,
"completions/mean_terminated_length": 245.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.04109222920851607,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10122750699520111,
"kl": 5.1483363807201385,
"learning_rate": 9.924100897051629e-05,
"loss": 0.0049,
"num_tokens": 1988931.0,
"reward": 0.6121540665626526,
"reward_std": 0.5229529142379761,
"rewards/rollout_reward_func/mean": 0.6121540665626526,
"rewards/rollout_reward_func/std": 0.5000297427177429,
"sampling/importance_sampling_ratio/max": 1.0420476198196411,
"sampling/importance_sampling_ratio/mean": 0.9965465068817139,
"sampling/importance_sampling_ratio/min": 0.8893992900848389,
"sampling/sampling_logp_difference/max": 0.11720812320709229,
"sampling/sampling_logp_difference/mean": 0.0017938524251803756,
"step": 53,
"step_time": 12.912300477000372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 193.15625,
"completions/mean_terminated_length": 193.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.023224864701660408,
"epoch": 0.00108,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04124576970934868,
"kl": 4.823809951543808,
"learning_rate": 9.914943862248966e-05,
"loss": 0.0063,
"num_tokens": 2024848.0,
"reward": 0.7362499833106995,
"reward_std": 0.4534528851509094,
"rewards/rollout_reward_func/mean": 0.7362499833106995,
"rewards/rollout_reward_func/std": 0.4570716321468353,
"sampling/importance_sampling_ratio/max": 1.0256677865982056,
"sampling/importance_sampling_ratio/mean": 0.9978145360946655,
"sampling/importance_sampling_ratio/min": 0.8835611939430237,
"sampling/sampling_logp_difference/max": 0.12363004684448242,
"sampling/sampling_logp_difference/mean": 0.0014609881909564137,
"step": 54,
"step_time": 12.811207481999872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 199.0,
"completions/mean_terminated_length": 199.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.02716700900498381,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01915285922586918,
"kl": 4.195980899035931,
"learning_rate": 9.905271752723088e-05,
"loss": 0.0038,
"num_tokens": 2060220.0,
"reward": 0.7684207558631897,
"reward_std": 0.4268929362297058,
"rewards/rollout_reward_func/mean": 0.7684207558631897,
"rewards/rollout_reward_func/std": 0.44239601492881775,
"sampling/importance_sampling_ratio/max": 1.024228811264038,
"sampling/importance_sampling_ratio/mean": 1.0001416206359863,
"sampling/importance_sampling_ratio/min": 0.9879773259162903,
"sampling/sampling_logp_difference/max": 0.023940302431583405,
"sampling/sampling_logp_difference/mean": 0.00064814742654562,
"step": 55,
"step_time": 13.081034389999559
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 213.0,
"completions/mean_terminated_length": 213.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.03577808013187678,
"epoch": 0.00112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14266906678676605,
"kl": 5.262552082538605,
"learning_rate": 9.895085927801542e-05,
"loss": -0.0074,
"num_tokens": 2097723.0,
"reward": 0.5815123319625854,
"reward_std": 0.5697444081306458,
"rewards/rollout_reward_func/mean": 0.5815123319625854,
"rewards/rollout_reward_func/std": 0.5674311518669128,
"sampling/importance_sampling_ratio/max": 1.1195484399795532,
"sampling/importance_sampling_ratio/mean": 1.004412055015564,
"sampling/importance_sampling_ratio/min": 0.9843540191650391,
"sampling/sampling_logp_difference/max": 0.11651325225830078,
"sampling/sampling_logp_difference/mean": 0.0017534032231196761,
"step": 56,
"step_time": 11.058078823999722
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 290.34375,
"completions/mean_terminated_length": 290.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.027698776671741143,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0155794033780694,
"kl": 5.111991301178932,
"learning_rate": 9.884387819009922e-05,
"loss": -0.007,
"num_tokens": 2137901.0,
"reward": 0.7130915522575378,
"reward_std": 0.3993915915489197,
"rewards/rollout_reward_func/mean": 0.7130915522575378,
"rewards/rollout_reward_func/std": 0.4723998010158539,
"sampling/importance_sampling_ratio/max": 1.0058835744857788,
"sampling/importance_sampling_ratio/mean": 0.9984175562858582,
"sampling/importance_sampling_ratio/min": 0.9888657331466675,
"sampling/sampling_logp_difference/max": 0.01078471913933754,
"sampling/sampling_logp_difference/mean": 0.0008065314614214003,
"step": 57,
"step_time": 11.11290242799987
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.0052083334885537624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 223.625,
"completions/mean_terminated_length": 223.625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.02802778462881861,
"epoch": 0.00116,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09351195394992828,
"kl": 4.502256289124489,
"learning_rate": 9.873178929870695e-05,
"loss": 0.0007,
"num_tokens": 2175660.0,
"reward": 0.6445872187614441,
"reward_std": 0.5539962649345398,
"rewards/rollout_reward_func/mean": 0.6445872187614441,
"rewards/rollout_reward_func/std": 0.5525522828102112,
"sampling/importance_sampling_ratio/max": 1.131099820137024,
"sampling/importance_sampling_ratio/mean": 0.9855086803436279,
"sampling/importance_sampling_ratio/min": 0.6534705758094788,
"sampling/sampling_logp_difference/max": 0.4254617691040039,
"sampling/sampling_logp_difference/mean": 0.007256423123180866,
"step": 58,
"step_time": 11.697415718000002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 286.8125,
"completions/mean_terminated_length": 286.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.014620267960026467,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.020578376948833466,
"kl": 5.063114173710346,
"learning_rate": 9.86146083569188e-05,
"loss": 0.007,
"num_tokens": 2215676.0,
"reward": 0.6477622985839844,
"reward_std": 0.4987693727016449,
"rewards/rollout_reward_func/mean": 0.6477622985839844,
"rewards/rollout_reward_func/std": 0.4924916923046112,
"sampling/importance_sampling_ratio/max": 1.0271024703979492,
"sampling/importance_sampling_ratio/mean": 0.9996525049209595,
"sampling/importance_sampling_ratio/min": 0.9902810454368591,
"sampling/sampling_logp_difference/max": 0.026741325855255127,
"sampling/sampling_logp_difference/mean": 0.0005729912081733346,
"step": 59,
"step_time": 10.581422125000472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 232.03125,
"completions/mean_terminated_length": 232.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.006479981271610313,
"epoch": 0.0012,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007562472950667143,
"kl": 5.7793563306331635,
"learning_rate": 9.84923518334567e-05,
"loss": 0.0099,
"num_tokens": 2253158.0,
"reward": 0.579670786857605,
"reward_std": 0.4654841423034668,
"rewards/rollout_reward_func/mean": 0.579670786857605,
"rewards/rollout_reward_func/std": 0.501867949962616,
"sampling/importance_sampling_ratio/max": 1.016674280166626,
"sampling/importance_sampling_ratio/mean": 1.0001671314239502,
"sampling/importance_sampling_ratio/min": 0.9948341846466064,
"sampling/sampling_logp_difference/max": 0.0165358018130064,
"sampling/sampling_logp_difference/mean": 0.0005334613961167634,
"step": 60,
"step_time": 11.516399574999923
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 304.8125,
"completions/mean_terminated_length": 304.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.01002441996001835,
"epoch": 0.00122,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2283029556274414,
"kl": 5.093721926212311,
"learning_rate": 9.83650369103696e-05,
"loss": 0.0015,
"num_tokens": 2293842.0,
"reward": 0.7455748319625854,
"reward_std": 0.5360188484191895,
"rewards/rollout_reward_func/mean": 0.7455748319625854,
"rewards/rollout_reward_func/std": 0.5213343501091003,
"sampling/importance_sampling_ratio/max": 1.000532865524292,
"sampling/importance_sampling_ratio/mean": 0.9848321676254272,
"sampling/importance_sampling_ratio/min": 0.5504863858222961,
"sampling/sampling_logp_difference/max": 0.5969529151916504,
"sampling/sampling_logp_difference/mean": 0.004100325983017683,
"step": 61,
"step_time": 10.772246324000207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 169.75,
"completions/mean_terminated_length": 169.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.005074412345948076,
"epoch": 0.00124,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011714676395058632,
"kl": 4.614134136587381,
"learning_rate": 9.823268148061883e-05,
"loss": -0.0063,
"num_tokens": 2328593.0,
"reward": 0.7350000143051147,
"reward_std": 0.468832790851593,
"rewards/rollout_reward_func/mean": 0.7350000143051147,
"rewards/rollout_reward_func/std": 0.46188950538635254,
"sampling/importance_sampling_ratio/max": 1.0447978973388672,
"sampling/importance_sampling_ratio/mean": 1.0014455318450928,
"sampling/importance_sampling_ratio/min": 0.9953779578208923,
"sampling/sampling_logp_difference/max": 0.04382299259305,
"sampling/sampling_logp_difference/mean": 0.0006599511252716184,
"step": 62,
"step_time": 10.030213335000326
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 239.4375,
"completions/mean_terminated_length": 239.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0035096063779747055,
"epoch": 0.00126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002542179776355624,
"kl": 5.135300308465958,
"learning_rate": 9.809530414556335e-05,
"loss": 0.0115,
"num_tokens": 2366527.0,
"reward": 0.7065123319625854,
"reward_std": 0.47867923974990845,
"rewards/rollout_reward_func/mean": 0.7065123319625854,
"rewards/rollout_reward_func/std": 0.4686340391635895,
"sampling/importance_sampling_ratio/max": 1.0115028619766235,
"sampling/importance_sampling_ratio/mean": 1.0001249313354492,
"sampling/importance_sampling_ratio/min": 0.9947195053100586,
"sampling/sampling_logp_difference/max": 0.011437196284532547,
"sampling/sampling_logp_difference/mean": 0.00027378558297641575,
"step": 63,
"step_time": 11.122873709000032
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 232.09375,
"completions/mean_terminated_length": 232.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0033450635638700987,
"epoch": 0.00128,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005013711750507355,
"kl": 4.000352640869096,
"learning_rate": 9.79529242123455e-05,
"loss": -0.0043,
"num_tokens": 2403642.0,
"reward": 0.7711831331253052,
"reward_std": 0.41131600737571716,
"rewards/rollout_reward_func/mean": 0.7711831331253052,
"rewards/rollout_reward_func/std": 0.5058895945549011,
"sampling/importance_sampling_ratio/max": 1.0016292333602905,
"sampling/importance_sampling_ratio/mean": 0.9997192025184631,
"sampling/importance_sampling_ratio/min": 0.9981485605239868,
"sampling/sampling_logp_difference/max": 0.0019368196371942759,
"sampling/sampling_logp_difference/mean": 0.00012640230124816298,
"step": 64,
"step_time": 10.301682713000446
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 217.21875,
"completions/mean_terminated_length": 217.21875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0023427301867116057,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0033939932473003864,
"kl": 5.74190029501915,
"learning_rate": 9.780556169117757e-05,
"loss": -0.0099,
"num_tokens": 2442962.0,
"reward": 0.6796708106994629,
"reward_std": 0.5027361512184143,
"rewards/rollout_reward_func/mean": 0.6796708106994629,
"rewards/rollout_reward_func/std": 0.4920700490474701,
"sampling/importance_sampling_ratio/max": 1.0012661218643188,
"sampling/importance_sampling_ratio/mean": 0.9997900724411011,
"sampling/importance_sampling_ratio/min": 0.9958721399307251,
"sampling/sampling_logp_difference/max": 0.004124820698052645,
"sampling/sampling_logp_difference/mean": 0.00011377107148291543,
"step": 65,
"step_time": 11.305019645000584
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 274.75,
"completions/mean_terminated_length": 274.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0013343448747491493,
"epoch": 0.00132,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0006489777588285506,
"kl": 4.528398025780916,
"learning_rate": 9.765323729252955e-05,
"loss": 0.0078,
"num_tokens": 2482153.0,
"reward": 0.7402623295783997,
"reward_std": 0.3541877269744873,
"rewards/rollout_reward_func/mean": 0.7402623295783997,
"rewards/rollout_reward_func/std": 0.45108354091644287,
"sampling/importance_sampling_ratio/max": 1.0007672309875488,
"sampling/importance_sampling_ratio/mean": 0.9998499155044556,
"sampling/importance_sampling_ratio/min": 0.9979432821273804,
"sampling/sampling_logp_difference/max": 0.0020514577627182007,
"sampling/sampling_logp_difference/mean": 5.5366330343531445e-05,
"step": 66,
"step_time": 12.143032751000646
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 199.90625,
"completions/mean_terminated_length": 199.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0006031800883192773,
"epoch": 0.00134,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00118810695130378,
"kl": 4.914457447826862,
"learning_rate": 9.749597242421838e-05,
"loss": 0.0153,
"num_tokens": 2518141.0,
"reward": 0.6096707582473755,
"reward_std": 0.5082674622535706,
"rewards/rollout_reward_func/mean": 0.6096707582473755,
"rewards/rollout_reward_func/std": 0.4953238368034363,
"sampling/importance_sampling_ratio/max": 1.000567078590393,
"sampling/importance_sampling_ratio/mean": 1.000020980834961,
"sampling/importance_sampling_ratio/min": 0.9995952248573303,
"sampling/sampling_logp_difference/max": 0.0005287290550768375,
"sampling/sampling_logp_difference/mean": 2.2719808839610778e-05,
"step": 67,
"step_time": 13.364112795999972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 196.71875,
"completions/mean_terminated_length": 196.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0006413085729377599,
"epoch": 0.00136,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0004711895599029958,
"kl": 4.558197975158691,
"learning_rate": 9.733378918839942e-05,
"loss": 0.0142,
"num_tokens": 2553662.0,
"reward": 0.7346707582473755,
"reward_std": 0.4728625416755676,
"rewards/rollout_reward_func/mean": 0.7346707582473755,
"rewards/rollout_reward_func/std": 0.453955739736557,
"sampling/importance_sampling_ratio/max": 1.0002586841583252,
"sampling/importance_sampling_ratio/mean": 0.9999353289604187,
"sampling/importance_sampling_ratio/min": 0.9994540214538574,
"sampling/sampling_logp_difference/max": 0.0005491760093718767,
"sampling/sampling_logp_difference/mean": 2.3225420591188595e-05,
"step": 68,
"step_time": 12.225320441999429
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 276.5625,
"completions/mean_terminated_length": 276.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0007257235612314616,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00036485324380919337,
"kl": 4.824105702340603,
"learning_rate": 9.716671037846007e-05,
"loss": 0.0176,
"num_tokens": 2592569.0,
"reward": 0.7393414974212646,
"reward_std": 0.4584631323814392,
"rewards/rollout_reward_func/mean": 0.7393414974212646,
"rewards/rollout_reward_func/std": 0.45201003551483154,
"sampling/importance_sampling_ratio/max": 1.0006366968154907,
"sampling/importance_sampling_ratio/mean": 0.9999854564666748,
"sampling/importance_sampling_ratio/min": 0.9991105198860168,
"sampling/sampling_logp_difference/max": 0.0008880097884684801,
"sampling/sampling_logp_difference/mean": 3.5278186260256916e-05,
"step": 69,
"step_time": 13.64757463699948
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 212.9375,
"completions/mean_terminated_length": 212.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0004797347238962857,
"epoch": 0.0014,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0005533385556191206,
"kl": 5.058364823460579,
"learning_rate": 9.699475947581644e-05,
"loss": -0.0099,
"num_tokens": 2629923.0,
"reward": 0.7068415284156799,
"reward_std": 0.3828880488872528,
"rewards/rollout_reward_func/mean": 0.7068415284156799,
"rewards/rollout_reward_func/std": 0.4743438959121704,
"sampling/importance_sampling_ratio/max": 1.000449538230896,
"sampling/importance_sampling_ratio/mean": 1.0000323057174683,
"sampling/importance_sampling_ratio/min": 0.9997754096984863,
"sampling/sampling_logp_difference/max": 0.00045936531387269497,
"sampling/sampling_logp_difference/mean": 1.7823947928263806e-05,
"step": 70,
"step_time": 12.950889879000442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 178.4375,
"completions/mean_terminated_length": 178.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00032274034408885655,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00046846744953654706,
"kl": 4.175679575651884,
"learning_rate": 9.681796064661319e-05,
"loss": -0.0097,
"num_tokens": 2664803.0,
"reward": 0.9249999523162842,
"reward_std": 0.21163278818130493,
"rewards/rollout_reward_func/mean": 0.9249999523162842,
"rewards/rollout_reward_func/std": 0.2942458689212799,
"sampling/importance_sampling_ratio/max": 1.0001081228256226,
"sampling/importance_sampling_ratio/mean": 0.9999797344207764,
"sampling/importance_sampling_ratio/min": 0.9997093081474304,
"sampling/sampling_logp_difference/max": 0.0002909002359956503,
"sampling/sampling_logp_difference/mean": 1.2190073903184384e-05,
"step": 71,
"step_time": 12.77516862999937
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 176.96875,
"completions/mean_terminated_length": 176.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00026604766155458037,
"epoch": 0.00144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00023387807596009225,
"kl": 5.009636074304581,
"learning_rate": 9.663633873832725e-05,
"loss": -0.0061,
"num_tokens": 2700946.0,
"reward": 0.7043415307998657,
"reward_std": 0.4547346234321594,
"rewards/rollout_reward_func/mean": 0.7043415307998657,
"rewards/rollout_reward_func/std": 0.47734788060188293,
"sampling/importance_sampling_ratio/max": 1.000250220298767,
"sampling/importance_sampling_ratio/mean": 1.0000195503234863,
"sampling/importance_sampling_ratio/min": 0.9998472929000854,
"sampling/sampling_logp_difference/max": 0.000253763806540519,
"sampling/sampling_logp_difference/mean": 9.226798283634707e-06,
"step": 72,
"step_time": 12.709148162000474
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 249.34375,
"completions/mean_terminated_length": 249.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00020393275298147273,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0001513528113719076,
"kl": 4.2001279490068555,
"learning_rate": 9.644991927627566e-05,
"loss": 0.0157,
"num_tokens": 2738134.0,
"reward": 0.7374999523162842,
"reward_std": 0.4485630393028259,
"rewards/rollout_reward_func/mean": 0.7374999523162842,
"rewards/rollout_reward_func/std": 0.45219889283180237,
"sampling/importance_sampling_ratio/max": 1.0001038312911987,
"sampling/importance_sampling_ratio/mean": 1.000006914138794,
"sampling/importance_sampling_ratio/min": 0.9998868107795715,
"sampling/sampling_logp_difference/max": 0.0001100691151805222,
"sampling/sampling_logp_difference/mean": 4.743244971905369e-06,
"step": 73,
"step_time": 13.106646253999997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 263.4375,
"completions/mean_terminated_length": 263.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00018786232286061022,
"epoch": 0.00148,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0001859942130977288,
"kl": 5.355363771319389,
"learning_rate": 9.625872846002834e-05,
"loss": 0.0084,
"num_tokens": 2777654.0,
"reward": 0.6146707534790039,
"reward_std": 0.5113980770111084,
"rewards/rollout_reward_func/mean": 0.6146707534790039,
"rewards/rollout_reward_func/std": 0.49887460470199585,
"sampling/importance_sampling_ratio/max": 1.0002917051315308,
"sampling/importance_sampling_ratio/mean": 1.0000147819519043,
"sampling/importance_sampling_ratio/min": 0.9998480081558228,
"sampling/sampling_logp_difference/max": 0.0002930494665633887,
"sampling/sampling_logp_difference/mean": 8.865463314577937e-06,
"step": 74,
"step_time": 13.656730500999174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 273.375,
"completions/mean_terminated_length": 273.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00020519902130899936,
"epoch": 0.0015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0001783866318874061,
"kl": 4.964824467897415,
"learning_rate": 9.606279315972582e-05,
"loss": 0.021,
"num_tokens": 2816342.0,
"reward": 0.5827623605728149,
"reward_std": 0.4826240539550781,
"rewards/rollout_reward_func/mean": 0.5827623605728149,
"rewards/rollout_reward_func/std": 0.49690043926239014,
"sampling/importance_sampling_ratio/max": 1.000037670135498,
"sampling/importance_sampling_ratio/mean": 0.9999814033508301,
"sampling/importance_sampling_ratio/min": 0.9998151063919067,
"sampling/sampling_logp_difference/max": 0.00018129183445125818,
"sampling/sampling_logp_difference/mean": 6.284242772380821e-06,
"step": 75,
"step_time": 12.500332886000024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 276.46875,
"completions/mean_terminated_length": 276.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.000170203812402292,
"epoch": 0.00152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00013501929061021656,
"kl": 5.58905765414238,
"learning_rate": 9.586214091230304e-05,
"loss": 0.0046,
"num_tokens": 2856279.0,
"reward": 0.6765123605728149,
"reward_std": 0.4957999885082245,
"rewards/rollout_reward_func/mean": 0.6765123605728149,
"rewards/rollout_reward_func/std": 0.4851822853088379,
"sampling/importance_sampling_ratio/max": 1.0000401735305786,
"sampling/importance_sampling_ratio/mean": 0.99998939037323,
"sampling/importance_sampling_ratio/min": 0.9998947978019714,
"sampling/sampling_logp_difference/max": 8.464550046483055e-05,
"sampling/sampling_logp_difference/mean": 4.386237378639635e-06,
"step": 76,
"step_time": 12.885455153999828
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 227.0,
"completions/mean_terminated_length": 227.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00018892053580543688,
"epoch": 0.00154,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.705218013143167e-05,
"kl": 5.047797460108995,
"learning_rate": 9.565679991761914e-05,
"loss": 0.0011,
"num_tokens": 2894598.0,
"reward": 0.5208538770675659,
"reward_std": 0.6895849704742432,
"rewards/rollout_reward_func/mean": 0.5208538770675659,
"rewards/rollout_reward_func/std": 0.668798565864563,
"sampling/importance_sampling_ratio/max": 1.0001370906829834,
"sampling/importance_sampling_ratio/mean": 1.0000134706497192,
"sampling/importance_sampling_ratio/min": 0.9999558925628662,
"sampling/sampling_logp_difference/max": 0.0001458361221011728,
"sampling/sampling_logp_difference/mean": 6.068152288207784e-06,
"step": 77,
"step_time": 13.598546733000148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 236.6875,
"completions/mean_terminated_length": 236.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00014567816769783803,
"epoch": 0.00156,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.185791375581175e-05,
"kl": 4.4403251856565475,
"learning_rate": 9.544679903449437e-05,
"loss": -0.0022,
"num_tokens": 2932073.0,
"reward": 0.7705915570259094,
"reward_std": 0.45292043685913086,
"rewards/rollout_reward_func/mean": 0.7705915570259094,
"rewards/rollout_reward_func/std": 0.4436517357826233,
"sampling/importance_sampling_ratio/max": 1.0000391006469727,
"sampling/importance_sampling_ratio/mean": 0.999990701675415,
"sampling/importance_sampling_ratio/min": 0.9998592734336853,
"sampling/sampling_logp_difference/max": 0.00012469961075112224,
"sampling/sampling_logp_difference/mean": 3.843690137728117e-06,
"step": 78,
"step_time": 12.254022738000003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 267.46875,
"completions/mean_terminated_length": 267.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00017591889431400887,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.5074844567570835e-05,
"kl": 5.288512364029884,
"learning_rate": 9.523216777665409e-05,
"loss": -0.0031,
"num_tokens": 2971826.0,
"reward": 0.6496707201004028,
"reward_std": 0.5136716365814209,
"rewards/rollout_reward_func/mean": 0.6496707201004028,
"rewards/rollout_reward_func/std": 0.49660006165504456,
"sampling/importance_sampling_ratio/max": 1.0001001358032227,
"sampling/importance_sampling_ratio/mean": 0.9999943375587463,
"sampling/importance_sampling_ratio/min": 0.9999160766601562,
"sampling/sampling_logp_difference/max": 0.0001301814045291394,
"sampling/sampling_logp_difference/mean": 6.416719315893715e-06,
"step": 79,
"step_time": 13.015393361999713
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 229.65625,
"completions/mean_terminated_length": 229.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.0001409291044751626,
"epoch": 0.0016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00011761792848119512,
"kl": 4.374786123633385,
"learning_rate": 9.501293630858103e-05,
"loss": -0.0023,
"num_tokens": 3009012.0,
"reward": 0.7709207534790039,
"reward_std": 0.4299285411834717,
"rewards/rollout_reward_func/mean": 0.7709207534790039,
"rewards/rollout_reward_func/std": 0.44675153493881226,
"sampling/importance_sampling_ratio/max": 1.0000734329223633,
"sampling/importance_sampling_ratio/mean": 0.9999964237213135,
"sampling/importance_sampling_ratio/min": 0.9997733235359192,
"sampling/sampling_logp_difference/max": 0.0001550959568703547,
"sampling/sampling_logp_difference/mean": 5.282415258989204e-06,
"step": 80,
"step_time": 12.882759786000634
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 231.40625,
"completions/mean_terminated_length": 231.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.748665153352931e-05,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.834749077213928e-05,
"kl": 4.857630208134651,
"learning_rate": 9.478913544127583e-05,
"loss": 0.0111,
"num_tokens": 3045676.0,
"reward": 0.5168415307998657,
"reward_std": 0.5236372947692871,
"rewards/rollout_reward_func/mean": 0.5168415307998657,
"rewards/rollout_reward_func/std": 0.5058324337005615,
"sampling/importance_sampling_ratio/max": 1.000028133392334,
"sampling/importance_sampling_ratio/mean": 0.9999990463256836,
"sampling/importance_sampling_ratio/min": 0.9999288320541382,
"sampling/sampling_logp_difference/max": 6.878793647047132e-05,
"sampling/sampling_logp_difference/mean": 3.0579262784158345e-06,
"step": 81,
"step_time": 12.754008866000504
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 213.03125,
"completions/mean_terminated_length": 213.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.73558894056714e-05,
"epoch": 0.00164,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.405829481082037e-05,
"kl": 5.079872742295265,
"learning_rate": 9.45607966279269e-05,
"loss": -0.0035,
"num_tokens": 3082215.0,
"reward": 0.6434207558631897,
"reward_std": 0.4949180483818054,
"rewards/rollout_reward_func/mean": 0.6434207558631897,
"rewards/rollout_reward_func/std": 0.49626046419143677,
"sampling/importance_sampling_ratio/max": 1.0000373125076294,
"sampling/importance_sampling_ratio/mean": 0.9999943971633911,
"sampling/importance_sampling_ratio/min": 0.9998199939727783,
"sampling/sampling_logp_difference/max": 0.00016332250379491597,
"sampling/sampling_logp_difference/mean": 3.4263287034264067e-06,
"step": 82,
"step_time": 13.59404965099975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 179.40625,
"completions/mean_terminated_length": 179.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.360899558113033e-05,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.702681952854618e-05,
"kl": 4.518397552892566,
"learning_rate": 9.432795195948994e-05,
"loss": 0.0149,
"num_tokens": 3116089.0,
"reward": 0.608420729637146,
"reward_std": 0.5539922714233398,
"rewards/rollout_reward_func/mean": 0.608420729637146,
"rewards/rollout_reward_func/std": 0.5539858341217041,
"sampling/importance_sampling_ratio/max": 1.00007164478302,
"sampling/importance_sampling_ratio/mean": 0.999993085861206,
"sampling/importance_sampling_ratio/min": 0.9998717308044434,
"sampling/sampling_logp_difference/max": 0.000115759510663338,
"sampling/sampling_logp_difference/mean": 3.67949360224884e-06,
"step": 83,
"step_time": 12.737816991999352
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 225.125,
"completions/mean_terminated_length": 225.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011713192122897453,
"epoch": 0.00168,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1548224797006696e-05,
"kl": 5.284573458135128,
"learning_rate": 9.409063416017778e-05,
"loss": 0.0012,
"num_tokens": 3154571.0,
"reward": 0.7405915260314941,
"reward_std": 0.46528252959251404,
"rewards/rollout_reward_func/mean": 0.7405915260314941,
"rewards/rollout_reward_func/std": 0.4598024785518646,
"sampling/importance_sampling_ratio/max": 1.0000370740890503,
"sampling/importance_sampling_ratio/mean": 0.9999953508377075,
"sampling/importance_sampling_ratio/min": 0.9999337792396545,
"sampling/sampling_logp_difference/max": 4.374391573946923e-05,
"sampling/sampling_logp_difference/mean": 3.3529299798829015e-06,
"step": 84,
"step_time": 12.951694035000628
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 265.6875,
"completions/mean_terminated_length": 265.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.799402398869006e-05,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.943003321182914e-05,
"kl": 5.07642325758934,
"learning_rate": 9.384887658286146e-05,
"loss": 0.0153,
"num_tokens": 3193105.0,
"reward": 0.7087500095367432,
"reward_std": 0.486086905002594,
"rewards/rollout_reward_func/mean": 0.7087500095367432,
"rewards/rollout_reward_func/std": 0.4667232632637024,
"sampling/importance_sampling_ratio/max": 1.0000450611114502,
"sampling/importance_sampling_ratio/mean": 1.000002145767212,
"sampling/importance_sampling_ratio/min": 0.9999704360961914,
"sampling/sampling_logp_difference/max": 6.615445454372093e-05,
"sampling/sampling_logp_difference/mean": 3.7237202832329785e-06,
"step": 85,
"step_time": 12.888392118999946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 198.28125,
"completions/mean_terminated_length": 198.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010015473532121177,
"epoch": 0.00172,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.718404569663107e-05,
"kl": 5.136753097176552,
"learning_rate": 9.360271320438257e-05,
"loss": 0.0072,
"num_tokens": 3228951.0,
"reward": 0.5455915331840515,
"reward_std": 0.5534976124763489,
"rewards/rollout_reward_func/mean": 0.5455915331840515,
"rewards/rollout_reward_func/std": 0.5654960870742798,
"sampling/importance_sampling_ratio/max": 1.000040888786316,
"sampling/importance_sampling_ratio/mean": 0.9999889135360718,
"sampling/importance_sampling_ratio/min": 0.9997588992118835,
"sampling/sampling_logp_difference/max": 0.00016045953088905662,
"sampling/sampling_logp_difference/mean": 4.547407570498763e-06,
"step": 86,
"step_time": 12.209375244000285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 168.5625,
"completions/mean_terminated_length": 168.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.546316439606017e-05,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.010289169149473e-05,
"kl": 4.654582601040602,
"learning_rate": 9.33521786207783e-05,
"loss": -0.0081,
"num_tokens": 3264282.0,
"reward": 0.8309207558631897,
"reward_std": 0.3539258539676666,
"rewards/rollout_reward_func/mean": 0.8309207558631897,
"rewards/rollout_reward_func/std": 0.3995343744754791,
"sampling/importance_sampling_ratio/max": 1.0000208616256714,
"sampling/importance_sampling_ratio/mean": 0.9999948143959045,
"sampling/importance_sampling_ratio/min": 0.9999562501907349,
"sampling/sampling_logp_difference/max": 2.860301174223423e-05,
"sampling/sampling_logp_difference/mean": 2.7131702609040076e-06,
"step": 87,
"step_time": 13.400294012999666
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 199.28125,
"completions/mean_terminated_length": 199.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.928002892754193e-05,
"epoch": 0.00176,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.472797470749356e-05,
"kl": 5.448563948273659,
"learning_rate": 9.309730804241916e-05,
"loss": -0.0036,
"num_tokens": 3301625.0,
"reward": 0.767762303352356,
"reward_std": 0.43076157569885254,
"rewards/rollout_reward_func/mean": 0.767762303352356,
"rewards/rollout_reward_func/std": 0.44489601254463196,
"sampling/importance_sampling_ratio/max": 1.000015377998352,
"sampling/importance_sampling_ratio/mean": 0.9999972581863403,
"sampling/importance_sampling_ratio/min": 0.9999665021896362,
"sampling/sampling_logp_difference/max": 2.5505985831841826e-05,
"sampling/sampling_logp_difference/mean": 2.6097468435182236e-06,
"step": 88,
"step_time": 12.85785657000065
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 182.53125,
"completions/mean_terminated_length": 182.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.7089729438075665e-05,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0934026906616054e-05,
"kl": 5.0993258729577065,
"learning_rate": 9.283813728906054e-05,
"loss": 0.0125,
"num_tokens": 3337039.0,
"reward": 0.6074999570846558,
"reward_std": 0.49121958017349243,
"rewards/rollout_reward_func/mean": 0.6074999570846558,
"rewards/rollout_reward_func/std": 0.4953916370868683,
"sampling/importance_sampling_ratio/max": 1.0000306367874146,
"sampling/importance_sampling_ratio/mean": 0.9999954700469971,
"sampling/importance_sampling_ratio/min": 0.9999397993087769,
"sampling/sampling_logp_difference/max": 4.13682937505655e-05,
"sampling/sampling_logp_difference/mean": 2.6327832074457547e-06,
"step": 89,
"step_time": 12.809418414000447
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 270.0,
"completions/mean_terminated_length": 270.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011780626084600954,
"epoch": 0.0018,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4906221117125824e-05,
"kl": 4.517446521669626,
"learning_rate": 9.257470278480848e-05,
"loss": 0.0146,
"num_tokens": 3374681.0,
"reward": 0.7090123891830444,
"reward_std": 0.4307096004486084,
"rewards/rollout_reward_func/mean": 0.7090123891830444,
"rewards/rollout_reward_func/std": 0.46687158942222595,
"sampling/importance_sampling_ratio/max": 1.0000447034835815,
"sampling/importance_sampling_ratio/mean": 0.9999927282333374,
"sampling/importance_sampling_ratio/min": 0.9999160170555115,
"sampling/sampling_logp_difference/max": 5.054080247646198e-05,
"sampling/sampling_logp_difference/mean": 3.6263454603613354e-06,
"step": 90,
"step_time": 11.027926876000038
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 212.03125,
"completions/mean_terminated_length": 212.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.198726852981508e-05,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.5594642617506906e-05,
"kl": 4.633478656411171,
"learning_rate": 9.230704155300075e-05,
"loss": 0.002,
"num_tokens": 3411569.0,
"reward": 0.7068415880203247,
"reward_std": 0.4687871038913727,
"rewards/rollout_reward_func/mean": 0.7068415880203247,
"rewards/rollout_reward_func/std": 0.4736269414424896,
"sampling/importance_sampling_ratio/max": 1.0000184774398804,
"sampling/importance_sampling_ratio/mean": 0.9999944567680359,
"sampling/importance_sampling_ratio/min": 0.9999277591705322,
"sampling/sampling_logp_difference/max": 3.6478690162766725e-05,
"sampling/sampling_logp_difference/mean": 3.097064563917229e-06,
"step": 91,
"step_time": 10.657987567999498
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 309.34375,
"completions/mean_terminated_length": 309.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011667752028188261,
"epoch": 0.00184,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.534836575156078e-05,
"kl": 5.510342627763748,
"learning_rate": 9.20351912110034e-05,
"loss": 0.0023,
"num_tokens": 3452832.0,
"reward": 0.7443415522575378,
"reward_std": 0.4789533019065857,
"rewards/rollout_reward_func/mean": 0.7443415522575378,
"rewards/rollout_reward_func/std": 0.45933249592781067,
"sampling/importance_sampling_ratio/max": 1.0000141859054565,
"sampling/importance_sampling_ratio/mean": 0.9999915361404419,
"sampling/importance_sampling_ratio/min": 0.9998993873596191,
"sampling/sampling_logp_difference/max": 0.00012767789303325117,
"sampling/sampling_logp_difference/mean": 3.875496531691169e-06,
"step": 92,
"step_time": 11.005892613000015
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 241.125,
"completions/mean_terminated_length": 241.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.778749490862992e-05,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.869422653224319e-05,
"kl": 4.655470006167889,
"learning_rate": 9.175918996492408e-05,
"loss": 0.0075,
"num_tokens": 3490161.0,
"reward": 0.8015123009681702,
"reward_std": 0.4226231575012207,
"rewards/rollout_reward_func/mean": 0.8015123009681702,
"rewards/rollout_reward_func/std": 0.4206917881965637,
"sampling/importance_sampling_ratio/max": 1.0000184774398804,
"sampling/importance_sampling_ratio/mean": 0.9999892711639404,
"sampling/importance_sampling_ratio/min": 0.9999106526374817,
"sampling/sampling_logp_difference/max": 5.662531839334406e-05,
"sampling/sampling_logp_difference/mean": 4.012344561488135e-06,
"step": 93,
"step_time": 10.772258454000166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 142.40625,
"completions/mean_terminated_length": 142.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.749645379182766e-05,
"epoch": 0.00188,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7582062355359085e-05,
"kl": 4.700259050820023,
"learning_rate": 9.147907660424242e-05,
"loss": 0.007,
"num_tokens": 3523365.0,
"reward": 0.7009207606315613,
"reward_std": 0.4892513155937195,
"rewards/rollout_reward_func/mean": 0.7009207606315613,
"rewards/rollout_reward_func/std": 0.4695405960083008,
"sampling/importance_sampling_ratio/max": 1.0000180006027222,
"sampling/importance_sampling_ratio/mean": 0.9999983906745911,
"sampling/importance_sampling_ratio/min": 0.9999682307243347,
"sampling/sampling_logp_difference/max": 2.3959782993188128e-05,
"sampling/sampling_logp_difference/mean": 1.8251080291520339e-06,
"step": 94,
"step_time": 10.420490644999745
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 629.0,
"completions/max_terminated_length": 629.0,
"completions/mean_length": 247.65625,
"completions/mean_terminated_length": 247.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010689802249430613,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.2199332054005936e-05,
"kl": 4.485873177647591,
"learning_rate": 9.119489049635865e-05,
"loss": 0.0058,
"num_tokens": 3561189.0,
"reward": 0.8030915260314941,
"reward_std": 0.4217682480812073,
"rewards/rollout_reward_func/mean": 0.8030915260314941,
"rewards/rollout_reward_func/std": 0.41993096470832825,
"sampling/importance_sampling_ratio/max": 1.000044345855713,
"sampling/importance_sampling_ratio/mean": 0.9999996423721313,
"sampling/importance_sampling_ratio/min": 0.9999737739562988,
"sampling/sampling_logp_difference/max": 6.555848085554317e-05,
"sampling/sampling_logp_difference/mean": 3.4570114166854182e-06,
"step": 95,
"step_time": 10.547222818000137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 229.625,
"completions/mean_terminated_length": 229.625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.808376419404794e-05,
"epoch": 0.00192,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.205911707482301e-05,
"kl": 5.648331835865974,
"learning_rate": 9.090667158106077e-05,
"loss": -0.002,
"num_tokens": 3599155.0,
"reward": 0.579670786857605,
"reward_std": 0.5128026008605957,
"rewards/rollout_reward_func/mean": 0.579670786857605,
"rewards/rollout_reward_func/std": 0.5076538324356079,
"sampling/importance_sampling_ratio/max": 1.0000417232513428,
"sampling/importance_sampling_ratio/mean": 0.9999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999645948410034,
"sampling/sampling_logp_difference/max": 3.5042918170802295e-05,
"sampling/sampling_logp_difference/mean": 2.5084548269660445e-06,
"step": 96,
"step_time": 10.931292486000302
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 271.1875,
"completions/mean_terminated_length": 271.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010380294239098475,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.427110878983513e-05,
"kl": 4.673744417726994,
"learning_rate": 9.061446036491135e-05,
"loss": 0.0141,
"num_tokens": 3637695.0,
"reward": 0.6771038770675659,
"reward_std": 0.54509037733078,
"rewards/rollout_reward_func/mean": 0.6771038770675659,
"rewards/rollout_reward_func/std": 0.5397720336914062,
"sampling/importance_sampling_ratio/max": 1.0000097751617432,
"sampling/importance_sampling_ratio/mean": 0.9999973773956299,
"sampling/importance_sampling_ratio/min": 0.9999740123748779,
"sampling/sampling_logp_difference/max": 3.063724943785928e-05,
"sampling/sampling_logp_difference/mean": 2.753982471404015e-06,
"step": 97,
"step_time": 10.892550134999283
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 215.09375,
"completions/mean_terminated_length": 215.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.612215438257408e-05,
"epoch": 0.00196,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8458416383946314e-05,
"kl": 4.893975809216499,
"learning_rate": 9.03182979155548e-05,
"loss": 0.0011,
"num_tokens": 3675048.0,
"reward": 0.6752623319625854,
"reward_std": 0.47441476583480835,
"rewards/rollout_reward_func/mean": 0.6752623319625854,
"rewards/rollout_reward_func/std": 0.48496752977371216,
"sampling/importance_sampling_ratio/max": 1.0000221729278564,
"sampling/importance_sampling_ratio/mean": 0.9999983310699463,
"sampling/importance_sampling_ratio/min": 0.999973475933075,
"sampling/sampling_logp_difference/max": 1.9550898286979645e-05,
"sampling/sampling_logp_difference/mean": 1.8493761899662786e-06,
"step": 98,
"step_time": 10.74181418300077
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 291.96875,
"completions/mean_terminated_length": 291.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011381667358989489,
"epoch": 0.00198,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.915198562433943e-05,
"kl": 5.3638990968465805,
"learning_rate": 9.001822585594566e-05,
"loss": 0.0212,
"num_tokens": 3714784.0,
"reward": 0.5840122699737549,
"reward_std": 0.5640621185302734,
"rewards/rollout_reward_func/mean": 0.5840122699737549,
"rewards/rollout_reward_func/std": 0.5568755269050598,
"sampling/importance_sampling_ratio/max": 1.0000113248825073,
"sampling/importance_sampling_ratio/mean": 0.9999927878379822,
"sampling/importance_sampling_ratio/min": 0.9999212026596069,
"sampling/sampling_logp_difference/max": 5.3409283282235265e-05,
"sampling/sampling_logp_difference/mean": 3.0009853162482614e-06,
"step": 99,
"step_time": 10.928057961001741
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 302.25,
"completions/mean_terminated_length": 302.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010083374922942312,
"epoch": 0.002,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.809072536067106e-05,
"kl": 5.571109913289547,
"learning_rate": 8.971428635849876e-05,
"loss": -0.0019,
"num_tokens": 3755324.0,
"reward": 0.5843415260314941,
"reward_std": 0.4848282039165497,
"rewards/rollout_reward_func/mean": 0.5843415260314941,
"rewards/rollout_reward_func/std": 0.5085917711257935,
"sampling/importance_sampling_ratio/max": 1.0000355243682861,
"sampling/importance_sampling_ratio/mean": 0.9999938011169434,
"sampling/importance_sampling_ratio/min": 0.999915599822998,
"sampling/sampling_logp_difference/max": 6.294687045738101e-05,
"sampling/sampling_logp_difference/mean": 3.964934421674116e-06,
"step": 100,
"step_time": 11.060910541000794
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 150.3125,
"completions/mean_terminated_length": 150.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.915687395623536e-05,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.978915290441364e-05,
"kl": 5.539727114140987,
"learning_rate": 8.940652213916242e-05,
"loss": 0.0078,
"num_tokens": 3789997.0,
"reward": 0.5434207916259766,
"reward_std": 0.49296581745147705,
"rewards/rollout_reward_func/mean": 0.5434207916259766,
"rewards/rollout_reward_func/std": 0.5062321424484253,
"sampling/importance_sampling_ratio/max": 1.0000144243240356,
"sampling/importance_sampling_ratio/mean": 0.999999463558197,
"sampling/importance_sampling_ratio/min": 0.9999815225601196,
"sampling/sampling_logp_difference/max": 2.300502819707617e-05,
"sampling/sampling_logp_difference/mean": 1.5629962035745848e-06,
"step": 101,
"step_time": 10.688564576999852
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 659.0,
"completions/max_terminated_length": 659.0,
"completions/mean_length": 208.1875,
"completions/mean_terminated_length": 208.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.830299895772441e-05,
"epoch": 0.00204,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.874409958370961e-05,
"kl": 4.853120140731335,
"learning_rate": 8.9094976451415e-05,
"loss": 0.0016,
"num_tokens": 3826670.0,
"reward": 0.7693415284156799,
"reward_std": 0.4373759627342224,
"rewards/rollout_reward_func/mean": 0.7693415284156799,
"rewards/rollout_reward_func/std": 0.44000110030174255,
"sampling/importance_sampling_ratio/max": 1.0000156164169312,
"sampling/importance_sampling_ratio/mean": 0.9999966621398926,
"sampling/importance_sampling_ratio/min": 0.9999758005142212,
"sampling/sampling_logp_difference/max": 2.5749490305315703e-05,
"sampling/sampling_logp_difference/mean": 1.909265620270162e-06,
"step": 102,
"step_time": 10.570236231998933
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 190.90625,
"completions/mean_terminated_length": 190.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010004358325943485,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00010144019324798137,
"kl": 4.018396964296699,
"learning_rate": 8.877969308018608e-05,
"loss": -0.0118,
"num_tokens": 3862077.0,
"reward": 0.8630915880203247,
"reward_std": 0.3141759932041168,
"rewards/rollout_reward_func/mean": 0.8630915880203247,
"rewards/rollout_reward_func/std": 0.3708895444869995,
"sampling/importance_sampling_ratio/max": 1.0000416040420532,
"sampling/importance_sampling_ratio/mean": 0.9999942779541016,
"sampling/importance_sampling_ratio/min": 0.9999680519104004,
"sampling/sampling_logp_difference/max": 8.009921293705702e-05,
"sampling/sampling_logp_difference/mean": 3.6002747947350144e-06,
"step": 103,
"step_time": 10.594433905999722
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 589.0,
"completions/max_terminated_length": 589.0,
"completions/mean_length": 147.15625,
"completions/mean_terminated_length": 147.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.807541271314221e-05,
"epoch": 0.00208,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.578004856943153e-05,
"kl": 4.207340374588966,
"learning_rate": 8.846071633570285e-05,
"loss": -0.0112,
"num_tokens": 3895407.0,
"reward": 0.890591561794281,
"reward_std": 0.30010443925857544,
"rewards/rollout_reward_func/mean": 0.890591561794281,
"rewards/rollout_reward_func/std": 0.3387019634246826,
"sampling/importance_sampling_ratio/max": 1.0000135898590088,
"sampling/importance_sampling_ratio/mean": 0.9999985098838806,
"sampling/importance_sampling_ratio/min": 0.9999701380729675,
"sampling/sampling_logp_difference/max": 2.5631481548771262e-05,
"sampling/sampling_logp_difference/mean": 2.3388356567011215e-06,
"step": 104,
"step_time": 10.663827376000881
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 243.15625,
"completions/mean_terminated_length": 243.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.019029707246773e-05,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.9671542506548576e-05,
"kl": 5.539078265428543,
"learning_rate": 8.81380910472627e-05,
"loss": 0.0103,
"num_tokens": 3933754.0,
"reward": 0.5509207844734192,
"reward_std": 0.5520662665367126,
"rewards/rollout_reward_func/mean": 0.5509207844734192,
"rewards/rollout_reward_func/std": 0.5649293065071106,
"sampling/importance_sampling_ratio/max": 1.0000245571136475,
"sampling/importance_sampling_ratio/mean": 0.999995768070221,
"sampling/importance_sampling_ratio/min": 0.9999408721923828,
"sampling/sampling_logp_difference/max": 4.3273397750454023e-05,
"sampling/sampling_logp_difference/mean": 3.680256440929952e-06,
"step": 105,
"step_time": 10.16665372999978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 243.53125,
"completions/mean_terminated_length": 243.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.68805940715356e-05,
"epoch": 0.00212,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.179349939979147e-05,
"kl": 4.557143405079842,
"learning_rate": 8.78118625569329e-05,
"loss": 0.0137,
"num_tokens": 3971886.0,
"reward": 0.8346039056777954,
"reward_std": 0.33073580265045166,
"rewards/rollout_reward_func/mean": 0.8346039056777954,
"rewards/rollout_reward_func/std": 0.3947785496711731,
"sampling/importance_sampling_ratio/max": 1.0000149011611938,
"sampling/importance_sampling_ratio/mean": 0.9999919533729553,
"sampling/importance_sampling_ratio/min": 0.9999665021896362,
"sampling/sampling_logp_difference/max": 3.433300298638642e-05,
"sampling/sampling_logp_difference/mean": 2.8295610263739945e-06,
"step": 106,
"step_time": 11.465477359998204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 244.5,
"completions/mean_terminated_length": 244.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.225094723319671e-05,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.9931542889680713e-05,
"kl": 5.053082346916199,
"learning_rate": 8.748207671317818e-05,
"loss": -0.001,
"num_tokens": 4010493.0,
"reward": 0.8046707510948181,
"reward_std": 0.43914103507995605,
"rewards/rollout_reward_func/mean": 0.8046707510948181,
"rewards/rollout_reward_func/std": 0.4238363802433014,
"sampling/importance_sampling_ratio/max": 1.0000298023223877,
"sampling/importance_sampling_ratio/mean": 0.9999991655349731,
"sampling/importance_sampling_ratio/min": 0.9999284744262695,
"sampling/sampling_logp_difference/max": 4.7092456952668726e-05,
"sampling/sampling_logp_difference/mean": 3.722727342392318e-06,
"step": 107,
"step_time": 10.213472404999266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 233.71875,
"completions/mean_terminated_length": 233.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.666757216564292e-05,
"epoch": 0.00216,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.740045839454979e-05,
"kl": 4.794837590306997,
"learning_rate": 8.714877986441713e-05,
"loss": -0.009,
"num_tokens": 4047992.0,
"reward": 0.8311830759048462,
"reward_std": 0.3405177891254425,
"rewards/rollout_reward_func/mean": 0.8311830759048462,
"rewards/rollout_reward_func/std": 0.3996248245239258,
"sampling/importance_sampling_ratio/max": 1.00002121925354,
"sampling/importance_sampling_ratio/mean": 0.9999939203262329,
"sampling/importance_sampling_ratio/min": 0.9999217987060547,
"sampling/sampling_logp_difference/max": 8.25024617370218e-05,
"sampling/sampling_logp_difference/mean": 3.614798288253951e-06,
"step": 108,
"step_time": 11.064788341000622
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 299.90625,
"completions/mean_terminated_length": 299.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011450917008914985,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.716579420957714e-05,
"kl": 5.604592651128769,
"learning_rate": 8.681201885250831e-05,
"loss": 0.0054,
"num_tokens": 4089355.0,
"reward": 0.7436830997467041,
"reward_std": 0.4545275866985321,
"rewards/rollout_reward_func/mean": 0.7436830997467041,
"rewards/rollout_reward_func/std": 0.45964205265045166,
"sampling/importance_sampling_ratio/max": 1.0000195503234863,
"sampling/importance_sampling_ratio/mean": 0.9999915361404419,
"sampling/importance_sampling_ratio/min": 0.9999645948410034,
"sampling/sampling_logp_difference/max": 3.063714029849507e-05,
"sampling/sampling_logp_difference/mean": 3.1274298635253217e-06,
"step": 109,
"step_time": 10.834878306000064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 248.375,
"completions/mean_terminated_length": 248.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010261963420532538,
"epoch": 0.0022,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.549662480712868e-05,
"kl": 4.847600609064102,
"learning_rate": 8.647184100616712e-05,
"loss": 0.0029,
"num_tokens": 4127050.0,
"reward": 0.7702623605728149,
"reward_std": 0.45045793056488037,
"rewards/rollout_reward_func/mean": 0.7702623605728149,
"rewards/rollout_reward_func/std": 0.44052988290786743,
"sampling/importance_sampling_ratio/max": 1.000023603439331,
"sampling/importance_sampling_ratio/mean": 0.9999868869781494,
"sampling/importance_sampling_ratio/min": 0.999665379524231,
"sampling/sampling_logp_difference/max": 0.0003045859048143029,
"sampling/sampling_logp_difference/mean": 4.818009529117262e-06,
"step": 110,
"step_time": 12.846799683999507
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 242.1875,
"completions/mean_terminated_length": 242.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.59633990586417e-05,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0465465897577815e-05,
"kl": 4.926107108592987,
"learning_rate": 8.612829413431418e-05,
"loss": 0.0183,
"num_tokens": 4165291.0,
"reward": 0.6452623605728149,
"reward_std": 0.48441076278686523,
"rewards/rollout_reward_func/mean": 0.6452623605728149,
"rewards/rollout_reward_func/std": 0.48789316415786743,
"sampling/importance_sampling_ratio/max": 1.0000079870224,
"sampling/importance_sampling_ratio/mean": 0.9999897480010986,
"sampling/importance_sampling_ratio/min": 0.9999393224716187,
"sampling/sampling_logp_difference/max": 5.018767842557281e-05,
"sampling/sampling_logp_difference/mean": 3.0540300031134393e-06,
"step": 111,
"step_time": 12.97736248299907
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 163.46875,
"completions/mean_terminated_length": 163.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.3820546966676375e-05,
"epoch": 0.00224,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5189647456281818e-05,
"kl": 5.1571872271597385,
"learning_rate": 8.578142651935609e-05,
"loss": 0.0077,
"num_tokens": 4199763.0,
"reward": 0.7949999570846558,
"reward_std": 0.43465879559516907,
"rewards/rollout_reward_func/mean": 0.7949999570846558,
"rewards/rollout_reward_func/std": 0.4186306297779083,
"sampling/importance_sampling_ratio/max": 1.000018835067749,
"sampling/importance_sampling_ratio/mean": 0.9999989867210388,
"sampling/importance_sampling_ratio/min": 0.9999740123748779,
"sampling/sampling_logp_difference/max": 2.9564285796368495e-05,
"sampling/sampling_logp_difference/mean": 2.1823295810463605e-06,
"step": 112,
"step_time": 12.87057027900073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 639.0,
"completions/max_terminated_length": 639.0,
"completions/mean_length": 123.46875,
"completions/mean_terminated_length": 123.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.431075787498685e-05,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2450723918154836e-05,
"kl": 4.872902058064938,
"learning_rate": 8.543128691039995e-05,
"loss": -0.0007,
"num_tokens": 4232839.0,
"reward": 0.6693415641784668,
"reward_std": 0.46633830666542053,
"rewards/rollout_reward_func/mean": 0.6693415641784668,
"rewards/rollout_reward_func/std": 0.4878684878349304,
"sampling/importance_sampling_ratio/max": 1.0000355243682861,
"sampling/importance_sampling_ratio/mean": 0.9999990463256836,
"sampling/importance_sampling_ratio/min": 0.9999709725379944,
"sampling/sampling_logp_difference/max": 4.2314342863392085e-05,
"sampling/sampling_logp_difference/mean": 2.0633228814403992e-06,
"step": 113,
"step_time": 10.490328707000117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 269.5625,
"completions/mean_terminated_length": 269.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.317064824188037e-05,
"epoch": 0.00228,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6714060368249193e-05,
"kl": 4.687108241021633,
"learning_rate": 8.507792451640202e-05,
"loss": 0.0053,
"num_tokens": 4271575.0,
"reward": 0.6162499785423279,
"reward_std": 0.5130974054336548,
"rewards/rollout_reward_func/mean": 0.6162499785423279,
"rewards/rollout_reward_func/std": 0.5002048015594482,
"sampling/importance_sampling_ratio/max": 1.0000026226043701,
"sampling/importance_sampling_ratio/mean": 0.9999919533729553,
"sampling/importance_sampling_ratio/min": 0.9999685287475586,
"sampling/sampling_logp_difference/max": 3.0636969313491136e-05,
"sampling/sampling_logp_difference/mean": 3.275076778663788e-06,
"step": 114,
"step_time": 11.029661345999557
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 199.90625,
"completions/mean_terminated_length": 199.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.060472408682017e-05,
"epoch": 0.0023,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.878362960880622e-05,
"kl": 4.994293176879637,
"learning_rate": 8.472138899925184e-05,
"loss": -0.0077,
"num_tokens": 4308105.0,
"reward": 0.7680915594100952,
"reward_std": 0.43255943059921265,
"rewards/rollout_reward_func/mean": 0.7680915594100952,
"rewards/rollout_reward_func/std": 0.44509974122047424,
"sampling/importance_sampling_ratio/max": 1.0000261068344116,
"sampling/importance_sampling_ratio/mean": 0.9999980926513672,
"sampling/importance_sampling_ratio/min": 0.9999677538871765,
"sampling/sampling_logp_difference/max": 2.5987903427449055e-05,
"sampling/sampling_logp_difference/mean": 2.0921900159009965e-06,
"step": 115,
"step_time": 12.655563860999791
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 168.40625,
"completions/mean_terminated_length": 168.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.414693930561043e-05,
"epoch": 0.00232,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.48902283096686e-05,
"kl": 4.02911851555109,
"learning_rate": 8.43617304667927e-05,
"loss": 0.0119,
"num_tokens": 4341444.0,
"reward": 0.6993415355682373,
"reward_std": 0.4643377363681793,
"rewards/rollout_reward_func/mean": 0.6993415355682373,
"rewards/rollout_reward_func/std": 0.4691430330276489,
"sampling/importance_sampling_ratio/max": 1.0000131130218506,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.9999437928199768,
"sampling/sampling_logp_difference/max": 5.137961852597073e-05,
"sampling/sampling_logp_difference/mean": 2.4830555958033074e-06,
"step": 116,
"step_time": 12.967430730000615
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 169.0625,
"completions/mean_terminated_length": 169.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.054653645534927e-05,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.826282333349809e-05,
"kl": 5.1541957119479775,
"learning_rate": 8.399899946577953e-05,
"loss": -0.0054,
"num_tokens": 4376644.0,
"reward": 0.7355915307998657,
"reward_std": 0.48186829686164856,
"rewards/rollout_reward_func/mean": 0.7355915307998657,
"rewards/rollout_reward_func/std": 0.46224409341812134,
"sampling/importance_sampling_ratio/max": 1.000015377998352,
"sampling/importance_sampling_ratio/mean": 0.9999917149543762,
"sampling/importance_sampling_ratio/min": 0.9998599290847778,
"sampling/sampling_logp_difference/max": 0.00014258069859351963,
"sampling/sampling_logp_difference/mean": 3.6177268611936597e-06,
"step": 117,
"step_time": 12.858041905000846
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 221.46875,
"completions/mean_terminated_length": 221.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.304556062592837e-05,
"epoch": 0.00236,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4655668312334456e-05,
"kl": 4.768407866358757,
"learning_rate": 8.363324697477484e-05,
"loss": 0.0086,
"num_tokens": 4413547.0,
"reward": 0.7377623319625854,
"reward_std": 0.4202122390270233,
"rewards/rollout_reward_func/mean": 0.7377623319625854,
"rewards/rollout_reward_func/std": 0.455178439617157,
"sampling/importance_sampling_ratio/max": 1.000011682510376,
"sampling/importance_sampling_ratio/mean": 0.9999964833259583,
"sampling/importance_sampling_ratio/min": 0.9999688267707825,
"sampling/sampling_logp_difference/max": 4.255828389432281e-05,
"sampling/sampling_logp_difference/mean": 2.690314659048454e-06,
"step": 118,
"step_time": 10.272596567999699
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 196.96875,
"completions/mean_terminated_length": 196.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.680693799727578e-05,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0536966985673644e-05,
"kl": 4.59810471534729,
"learning_rate": 8.326452439698433e-05,
"loss": 0.0101,
"num_tokens": 4448788.0,
"reward": 0.642170786857605,
"reward_std": 0.5499499440193176,
"rewards/rollout_reward_func/mean": 0.642170786857605,
"rewards/rollout_reward_func/std": 0.5485014915466309,
"sampling/importance_sampling_ratio/max": 1.0000146627426147,
"sampling/importance_sampling_ratio/mean": 0.9999949932098389,
"sampling/importance_sampling_ratio/min": 0.9999575018882751,
"sampling/sampling_logp_difference/max": 3.0994589906185865e-05,
"sampling/sampling_logp_difference/mean": 2.7388678063289262e-06,
"step": 119,
"step_time": 11.08815586799983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 242.3125,
"completions/mean_terminated_length": 242.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.888474092178967e-05,
"epoch": 0.0024,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.178436499089003e-05,
"kl": 5.197234347462654,
"learning_rate": 8.289288355303245e-05,
"loss": 0.0035,
"num_tokens": 4487700.0,
"reward": 0.7096707820892334,
"reward_std": 0.4924471080303192,
"rewards/rollout_reward_func/mean": 0.7096707820892334,
"rewards/rollout_reward_func/std": 0.473552942276001,
"sampling/importance_sampling_ratio/max": 1.0000423192977905,
"sampling/importance_sampling_ratio/mean": 0.9999993443489075,
"sampling/importance_sampling_ratio/min": 0.9999532699584961,
"sampling/sampling_logp_difference/max": 5.496320954989642e-05,
"sampling/sampling_logp_difference/mean": 3.605607162171509e-06,
"step": 120,
"step_time": 10.934329372000775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 277.25,
"completions/mean_terminated_length": 277.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.522849164194213e-05,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.63542005995987e-05,
"kl": 4.920810654759407,
"learning_rate": 8.251837667367966e-05,
"loss": 0.0075,
"num_tokens": 4526577.0,
"reward": 0.7402623891830444,
"reward_std": 0.4214465022087097,
"rewards/rollout_reward_func/mean": 0.7402623891830444,
"rewards/rollout_reward_func/std": 0.4546830654144287,
"sampling/importance_sampling_ratio/max": 1.0000230073928833,
"sampling/importance_sampling_ratio/mean": 0.9999945759773254,
"sampling/importance_sampling_ratio/min": 0.9999340772628784,
"sampling/sampling_logp_difference/max": 5.650547973345965e-05,
"sampling/sampling_logp_difference/mean": 3.201313347744872e-06,
"step": 121,
"step_time": 11.145089898000151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 253.9375,
"completions/mean_terminated_length": 253.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011383450387825178,
"epoch": 0.00244,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0001262296427739784,
"kl": 4.25481122918427,
"learning_rate": 8.214105639248173e-05,
"loss": -0.008,
"num_tokens": 4564592.0,
"reward": 0.7399331331253052,
"reward_std": 0.3940573036670685,
"rewards/rollout_reward_func/mean": 0.7399331331253052,
"rewards/rollout_reward_func/std": 0.45654943585395813,
"sampling/importance_sampling_ratio/max": 1.0000048875808716,
"sampling/importance_sampling_ratio/mean": 0.999985933303833,
"sampling/importance_sampling_ratio/min": 0.9998538494110107,
"sampling/sampling_logp_difference/max": 0.0001305415207752958,
"sampling/sampling_logp_difference/mean": 4.2265378397132736e-06,
"step": 122,
"step_time": 11.035577829000886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 219.96875,
"completions/mean_terminated_length": 219.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.981711987132712e-05,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0030358175281435e-05,
"kl": 4.89252844452858,
"learning_rate": 8.176097573839265e-05,
"loss": -0.0091,
"num_tokens": 4601658.0,
"reward": 0.5161831378936768,
"reward_std": 0.4359714388847351,
"rewards/rollout_reward_func/mean": 0.5161831378936768,
"rewards/rollout_reward_func/std": 0.5666016936302185,
"sampling/importance_sampling_ratio/max": 1.0000238418579102,
"sampling/importance_sampling_ratio/mean": 0.999997079372406,
"sampling/importance_sampling_ratio/min": 0.9999577403068542,
"sampling/sampling_logp_difference/max": 4.756472844746895e-05,
"sampling/sampling_logp_difference/mean": 3.610257408581674e-06,
"step": 123,
"step_time": 10.925135986000441
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 212.90625,
"completions/mean_terminated_length": 212.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.236761294710959e-05,
"epoch": 0.00248,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.092013452667743e-05,
"kl": 5.190757237374783,
"learning_rate": 8.137818812831182e-05,
"loss": 0.001,
"num_tokens": 4638502.0,
"reward": 0.6746707558631897,
"reward_std": 0.46534547209739685,
"rewards/rollout_reward_func/mean": 0.6746707558631897,
"rewards/rollout_reward_func/std": 0.48581212759017944,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999908208847046,
"sampling/importance_sampling_ratio/min": 0.9998648762702942,
"sampling/sampling_logp_difference/max": 0.0001305415207752958,
"sampling/sampling_logp_difference/mean": 3.2529164855077397e-06,
"step": 124,
"step_time": 10.93318752300047
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 230.78125,
"completions/mean_terminated_length": 230.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.626912174885092e-05,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5847192116780207e-05,
"kl": 5.291407283395529,
"learning_rate": 8.09927473595769e-05,
"loss": 0.0025,
"num_tokens": 4675749.0,
"reward": 0.642170786857605,
"reward_std": 0.5106069445610046,
"rewards/rollout_reward_func/mean": 0.642170786857605,
"rewards/rollout_reward_func/std": 0.4933558702468872,
"sampling/importance_sampling_ratio/max": 1.0000139474868774,
"sampling/importance_sampling_ratio/mean": 0.9999961256980896,
"sampling/importance_sampling_ratio/min": 0.9999502301216125,
"sampling/sampling_logp_difference/max": 2.9326114599825814e-05,
"sampling/sampling_logp_difference/mean": 2.8105505407438613e-06,
"step": 125,
"step_time": 11.311041927999213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 265.96875,
"completions/mean_terminated_length": 265.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00011288215048921302,
"epoch": 0.00252,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.9042354071862064e-05,
"kl": 4.39733462408185,
"learning_rate": 8.060470760240294e-05,
"loss": 0.0024,
"num_tokens": 4714650.0,
"reward": 0.7424331307411194,
"reward_std": 0.5192468166351318,
"rewards/rollout_reward_func/mean": 0.7424331307411194,
"rewards/rollout_reward_func/std": 0.5208361744880676,
"sampling/importance_sampling_ratio/max": 1.0000265836715698,
"sampling/importance_sampling_ratio/mean": 0.9999986886978149,
"sampling/importance_sampling_ratio/min": 0.9999682307243347,
"sampling/sampling_logp_difference/max": 2.979917189804837e-05,
"sampling/sampling_logp_difference/mean": 2.8954166282346705e-06,
"step": 126,
"step_time": 11.478165510000053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 230.59375,
"completions/mean_terminated_length": 230.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.604452432336984e-05,
"epoch": 0.00254,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.739314292441122e-05,
"kl": 5.027340762317181,
"learning_rate": 8.021412339226936e-05,
"loss": -0.003,
"num_tokens": 4752416.0,
"reward": 0.8005915880203247,
"reward_std": 0.4260421693325043,
"rewards/rollout_reward_func/mean": 0.8005915880203247,
"rewards/rollout_reward_func/std": 0.4247084856033325,
"sampling/importance_sampling_ratio/max": 1.0000354051589966,
"sampling/importance_sampling_ratio/mean": 0.9999985694885254,
"sampling/importance_sampling_ratio/min": 0.9999394416809082,
"sampling/sampling_logp_difference/max": 4.720142896985635e-05,
"sampling/sampling_logp_difference/mean": 3.4468034755263943e-06,
"step": 127,
"step_time": 10.348821588999726
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 259.0625,
"completions/mean_terminated_length": 259.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010858213161668573,
"epoch": 0.00256,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.580048673436977e-05,
"kl": 5.223805829882622,
"learning_rate": 7.982104962225541e-05,
"loss": -0.0116,
"num_tokens": 4791731.0,
"reward": 0.7111831307411194,
"reward_std": 0.43767261505126953,
"rewards/rollout_reward_func/mean": 0.7111831307411194,
"rewards/rollout_reward_func/std": 0.5364737510681152,
"sampling/importance_sampling_ratio/max": 1.0000152587890625,
"sampling/importance_sampling_ratio/mean": 0.9999892711639404,
"sampling/importance_sampling_ratio/min": 0.9999455213546753,
"sampling/sampling_logp_difference/max": 4.4942185922991484e-05,
"sampling/sampling_logp_difference/mean": 3.922757969121449e-06,
"step": 128,
"step_time": 11.726808570999765
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 237.65625,
"completions/mean_terminated_length": 237.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.304604658666449e-05,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.209320468362421e-05,
"kl": 4.889099486870691,
"learning_rate": 7.94255415353255e-05,
"loss": -0.0002,
"num_tokens": 4829324.0,
"reward": 0.7699999809265137,
"reward_std": 0.4524250626564026,
"rewards/rollout_reward_func/mean": 0.7699999809265137,
"rewards/rollout_reward_func/std": 0.4433304965496063,
"sampling/importance_sampling_ratio/max": 1.0000271797180176,
"sampling/importance_sampling_ratio/mean": 0.9999895691871643,
"sampling/importance_sampling_ratio/min": 0.9998538494110107,
"sampling/sampling_logp_difference/max": 0.0001435376179870218,
"sampling/sampling_logp_difference/mean": 4.068881935381796e-06,
"step": 129,
"step_time": 10.755176805000701
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 208.875,
"completions/mean_terminated_length": 208.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.984883568847636e-05,
"epoch": 0.0026,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.964017691439949e-05,
"kl": 4.550747729837894,
"learning_rate": 7.902765471656524e-05,
"loss": -0.0008,
"num_tokens": 4865040.0,
"reward": 0.7049331665039062,
"reward_std": 0.48293593525886536,
"rewards/rollout_reward_func/mean": 0.7049331665039062,
"rewards/rollout_reward_func/std": 0.4722973704338074,
"sampling/importance_sampling_ratio/max": 1.0000436305999756,
"sampling/importance_sampling_ratio/mean": 0.9999958276748657,
"sampling/importance_sampling_ratio/min": 0.999826967716217,
"sampling/sampling_logp_difference/max": 0.00013281148858368397,
"sampling/sampling_logp_difference/mean": 3.5681759982253425e-06,
"step": 130,
"step_time": 10.911314888001016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 257.78125,
"completions/mean_terminated_length": 257.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010051704717284338,
"epoch": 0.00262,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.389891445403919e-05,
"kl": 5.61663281917572,
"learning_rate": 7.862744508536953e-05,
"loss": -0.0145,
"num_tokens": 4905023.0,
"reward": 0.8059207797050476,
"reward_std": 0.3718266785144806,
"rewards/rollout_reward_func/mean": 0.8059207797050476,
"rewards/rollout_reward_func/std": 0.42751839756965637,
"sampling/importance_sampling_ratio/max": 1.0000563859939575,
"sampling/importance_sampling_ratio/mean": 0.9999959468841553,
"sampling/importance_sampling_ratio/min": 0.9999661445617676,
"sampling/sampling_logp_difference/max": 6.972985283937305e-05,
"sampling/sampling_logp_difference/mean": 3.5875398225471145e-06,
"step": 131,
"step_time": 11.427704178000113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 251.46875,
"completions/mean_terminated_length": 251.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010139288470156771,
"epoch": 0.00264,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.9901922213612124e-05,
"kl": 5.053040578961372,
"learning_rate": 7.822496888758351e-05,
"loss": -0.0125,
"num_tokens": 4943228.0,
"reward": 0.7086831331253052,
"reward_std": 0.45560967922210693,
"rewards/rollout_reward_func/mean": 0.7086831331253052,
"rewards/rollout_reward_func/std": 0.5349801182746887,
"sampling/importance_sampling_ratio/max": 1.0000299215316772,
"sampling/importance_sampling_ratio/mean": 0.9999942779541016,
"sampling/importance_sampling_ratio/min": 0.999942421913147,
"sampling/sampling_logp_difference/max": 4.243637522449717e-05,
"sampling/sampling_logp_difference/mean": 3.3543090012244647e-06,
"step": 132,
"step_time": 10.83788310799946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 166.5,
"completions/mean_terminated_length": 166.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.230003301117449e-05,
"epoch": 0.00266,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2420093955588527e-05,
"kl": 4.749419122934341,
"learning_rate": 7.782028268759781e-05,
"loss": 0.0022,
"num_tokens": 4978386.0,
"reward": 0.7043415307998657,
"reward_std": 0.5322943925857544,
"rewards/rollout_reward_func/mean": 0.7043415307998657,
"rewards/rollout_reward_func/std": 0.5347846150398254,
"sampling/importance_sampling_ratio/max": 1.000034213066101,
"sampling/importance_sampling_ratio/mean": 0.9999971389770508,
"sampling/importance_sampling_ratio/min": 0.9999690651893616,
"sampling/sampling_logp_difference/max": 5.220848834142089e-05,
"sampling/sampling_logp_difference/mean": 2.8167501113784965e-06,
"step": 133,
"step_time": 10.881401281999842
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 147.5,
"completions/mean_terminated_length": 147.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.72966548304521e-05,
"epoch": 0.00268,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.369628368294798e-05,
"kl": 5.001729235053062,
"learning_rate": 7.741344336039886e-05,
"loss": -0.0074,
"num_tokens": 5013351.0,
"reward": 0.7330915927886963,
"reward_std": 0.4458242356777191,
"rewards/rollout_reward_func/mean": 0.7330915927886963,
"rewards/rollout_reward_func/std": 0.4634391963481903,
"sampling/importance_sampling_ratio/max": 1.0000064373016357,
"sampling/importance_sampling_ratio/mean": 0.9999979734420776,
"sampling/importance_sampling_ratio/min": 0.9999818801879883,
"sampling/sampling_logp_difference/max": 2.2650128812529147e-05,
"sampling/sampling_logp_difference/mean": 1.603030568730901e-06,
"step": 134,
"step_time": 10.908871880999868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 305.875,
"completions/mean_terminated_length": 305.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.725786842944672e-05,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.0055830797646195e-05,
"kl": 5.2666705548763275,
"learning_rate": 7.700450808357564e-05,
"loss": 0.0223,
"num_tokens": 5053684.0,
"reward": 0.6452623605728149,
"reward_std": 0.46077656745910645,
"rewards/rollout_reward_func/mean": 0.6452623605728149,
"rewards/rollout_reward_func/std": 0.4859411120414734,
"sampling/importance_sampling_ratio/max": 1.000022053718567,
"sampling/importance_sampling_ratio/mean": 0.9999962449073792,
"sampling/importance_sampling_ratio/min": 0.999959409236908,
"sampling/sampling_logp_difference/max": 3.814770025201142e-05,
"sampling/sampling_logp_difference/mean": 2.4756843686191132e-06,
"step": 135,
"step_time": 10.426622540999233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 254.15625,
"completions/mean_terminated_length": 254.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.59571040479068e-05,
"epoch": 0.00272,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.841377712087706e-05,
"kl": 4.508408114314079,
"learning_rate": 7.659353432928393e-05,
"loss": -0.0018,
"num_tokens": 5091253.0,
"reward": 0.8015123605728149,
"reward_std": 0.36576953530311584,
"rewards/rollout_reward_func/mean": 0.8015123605728149,
"rewards/rollout_reward_func/std": 0.4206917881965637,
"sampling/importance_sampling_ratio/max": 1.0000181198120117,
"sampling/importance_sampling_ratio/mean": 0.999996542930603,
"sampling/importance_sampling_ratio/min": 0.9999696016311646,
"sampling/sampling_logp_difference/max": 3.4213535400340334e-05,
"sampling/sampling_logp_difference/mean": 2.4184487301681656e-06,
"step": 136,
"step_time": 11.579949559998568
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 217.71875,
"completions/mean_terminated_length": 217.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.95244829624653e-05,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.87391875544563e-05,
"kl": 4.599743388593197,
"learning_rate": 7.618057985616908e-05,
"loss": 0.0095,
"num_tokens": 5127521.0,
"reward": 0.6746707558631897,
"reward_std": 0.5018748044967651,
"rewards/rollout_reward_func/mean": 0.6746707558631897,
"rewards/rollout_reward_func/std": 0.48047077655792236,
"sampling/importance_sampling_ratio/max": 1.0000325441360474,
"sampling/importance_sampling_ratio/mean": 0.9999880790710449,
"sampling/importance_sampling_ratio/min": 0.9998170137405396,
"sampling/sampling_logp_difference/max": 0.00016952332225628197,
"sampling/sampling_logp_difference/mean": 4.749942490889225e-06,
"step": 137,
"step_time": 11.03167146700116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 259.0,
"completions/mean_terminated_length": 259.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.630876831399291e-05,
"epoch": 0.00276,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7878466426045634e-05,
"kl": 4.510915584862232,
"learning_rate": 7.576570270124853e-05,
"loss": 0.0126,
"num_tokens": 5165374.0,
"reward": 0.7396707534790039,
"reward_std": 0.4501022398471832,
"rewards/rollout_reward_func/mean": 0.7396707534790039,
"rewards/rollout_reward_func/std": 0.45358023047447205,
"sampling/importance_sampling_ratio/max": 1.0000441074371338,
"sampling/importance_sampling_ratio/mean": 0.9999974370002747,
"sampling/importance_sampling_ratio/min": 0.9999577403068542,
"sampling/sampling_logp_difference/max": 4.2433915950823575e-05,
"sampling/sampling_logp_difference/mean": 2.6292764232493937e-06,
"step": 138,
"step_time": 10.937990653000725
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 269.28125,
"completions/mean_terminated_length": 269.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.21896111915521e-05,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.454186975839548e-05,
"kl": 5.0575916320085526,
"learning_rate": 7.53489611717553e-05,
"loss": 0.0222,
"num_tokens": 5204102.0,
"reward": 0.7071039080619812,
"reward_std": 0.46030738949775696,
"rewards/rollout_reward_func/mean": 0.7071039080619812,
"rewards/rollout_reward_func/std": 0.46494990587234497,
"sampling/importance_sampling_ratio/max": 1.0000200271606445,
"sampling/importance_sampling_ratio/mean": 0.9999970197677612,
"sampling/importance_sampling_ratio/min": 0.999968409538269,
"sampling/sampling_logp_difference/max": 3.2663992897141725e-05,
"sampling/sampling_logp_difference/mean": 3.1937224775901996e-06,
"step": 139,
"step_time": 10.921687967998878
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 149.65625,
"completions/mean_terminated_length": 149.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.3915793799319545e-05,
"epoch": 0.0028,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.388841625768691e-05,
"kl": 5.497643873095512,
"learning_rate": 7.49304138369434e-05,
"loss": -0.0117,
"num_tokens": 5238438.0,
"reward": 0.7018415927886963,
"reward_std": 0.49856051802635193,
"rewards/rollout_reward_func/mean": 0.7018415927886963,
"rewards/rollout_reward_func/std": 0.4783366620540619,
"sampling/importance_sampling_ratio/max": 1.0000146627426147,
"sampling/importance_sampling_ratio/mean": 0.9999945759773254,
"sampling/importance_sampling_ratio/min": 0.9999581575393677,
"sampling/sampling_logp_difference/max": 5.030676402384415e-05,
"sampling/sampling_logp_difference/mean": 2.4461330667691072e-06,
"step": 140,
"step_time": 10.575326776998736
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 244.78125,
"completions/mean_terminated_length": 244.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.234672896241136e-05,
"epoch": 0.00282,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.646247882395983e-05,
"kl": 5.654919885098934,
"learning_rate": 7.45101195198564e-05,
"loss": -0.0082,
"num_tokens": 5277978.0,
"reward": 0.49026232957839966,
"reward_std": 0.5400711297988892,
"rewards/rollout_reward_func/mean": 0.49026232957839966,
"rewards/rollout_reward_func/std": 0.5695264935493469,
"sampling/importance_sampling_ratio/max": 1.0000135898590088,
"sampling/importance_sampling_ratio/mean": 0.9999983310699463,
"sampling/importance_sampling_ratio/min": 0.9999774098396301,
"sampling/sampling_logp_difference/max": 2.241195397800766e-05,
"sampling/sampling_logp_difference/mean": 2.3845029772928683e-06,
"step": 141,
"step_time": 11.646618943000249
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 232.78125,
"completions/mean_terminated_length": 232.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010328506519385883,
"epoch": 0.00284,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.90917952649761e-05,
"kl": 5.274715817227843,
"learning_rate": 7.408813728906053e-05,
"loss": -0.0046,
"num_tokens": 5315780.0,
"reward": 0.6771707534790039,
"reward_std": 0.49981045722961426,
"rewards/rollout_reward_func/mean": 0.6771707534790039,
"rewards/rollout_reward_func/std": 0.4876364767551422,
"sampling/importance_sampling_ratio/max": 1.0000065565109253,
"sampling/importance_sampling_ratio/mean": 0.9999916553497314,
"sampling/importance_sampling_ratio/min": 0.9999037981033325,
"sampling/sampling_logp_difference/max": 0.00010908614785876125,
"sampling/sampling_logp_difference/mean": 3.764534085348714e-06,
"step": 142,
"step_time": 10.2494408320008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 192.40625,
"completions/mean_terminated_length": 192.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.100872606396024e-05,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.320547668612562e-05,
"kl": 4.761083319783211,
"learning_rate": 7.366452645034293e-05,
"loss": -0.0062,
"num_tokens": 5351444.0,
"reward": 0.703091561794281,
"reward_std": 0.44097042083740234,
"rewards/rollout_reward_func/mean": 0.703091561794281,
"rewards/rollout_reward_func/std": 0.4737764596939087,
"sampling/importance_sampling_ratio/max": 1.000017523765564,
"sampling/importance_sampling_ratio/mean": 0.9999960064888,
"sampling/importance_sampling_ratio/min": 0.9999603033065796,
"sampling/sampling_logp_difference/max": 3.778961399802938e-05,
"sampling/sampling_logp_difference/mean": 2.097834112646524e-06,
"step": 143,
"step_time": 11.270066180999493
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 218.09375,
"completions/mean_terminated_length": 218.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.329525671982083e-05,
"epoch": 0.00288,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.2181723731337115e-05,
"kl": 5.640114210546017,
"learning_rate": 7.32393465383769e-05,
"loss": 0.016,
"num_tokens": 5388860.0,
"reward": 0.5475000143051147,
"reward_std": 0.5258115530014038,
"rewards/rollout_reward_func/mean": 0.5475000143051147,
"rewards/rollout_reward_func/std": 0.5025290250778198,
"sampling/importance_sampling_ratio/max": 1.0000290870666504,
"sampling/importance_sampling_ratio/mean": 0.9999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999827742576599,
"sampling/sampling_logp_difference/max": 4.2435844079591334e-05,
"sampling/sampling_logp_difference/mean": 1.7509868257548078e-06,
"step": 144,
"step_time": 10.207922464999228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 247.71875,
"completions/mean_terminated_length": 247.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.949859710583041e-05,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.013792103156447e-05,
"kl": 5.88670526444912,
"learning_rate": 7.281265730835482e-05,
"loss": -0.0042,
"num_tokens": 5428231.0,
"reward": 0.5490123629570007,
"reward_std": 0.4685298204421997,
"rewards/rollout_reward_func/mean": 0.5490123629570007,
"rewards/rollout_reward_func/std": 0.5122498869895935,
"sampling/importance_sampling_ratio/max": 1.0000377893447876,
"sampling/importance_sampling_ratio/mean": 0.9999960660934448,
"sampling/importance_sampling_ratio/min": 0.9999233484268188,
"sampling/sampling_logp_difference/max": 6.354815559461713e-05,
"sampling/sampling_logp_difference/mean": 3.663993084046524e-06,
"step": 145,
"step_time": 11.4028208120003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 213.125,
"completions/mean_terminated_length": 213.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.420744133275093e-05,
"epoch": 0.00292,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.134711005259305e-05,
"kl": 5.1818161606788635,
"learning_rate": 7.238451872759005e-05,
"loss": 0.0084,
"num_tokens": 5464823.0,
"reward": 0.8305915594100952,
"reward_std": 0.40696918964385986,
"rewards/rollout_reward_func/mean": 0.8305915594100952,
"rewards/rollout_reward_func/std": 0.396116703748703,
"sampling/importance_sampling_ratio/max": 1.0000261068344116,
"sampling/importance_sampling_ratio/mean": 0.9999996423721313,
"sampling/importance_sampling_ratio/min": 0.999976396560669,
"sampling/sampling_logp_difference/max": 2.4913773813750595e-05,
"sampling/sampling_logp_difference/mean": 2.3334573597821873e-06,
"step": 146,
"step_time": 10.321809514000051
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 303.0,
"completions/mean_terminated_length": 303.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010614871865755049,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.479826697614044e-05,
"kl": 5.353869080543518,
"learning_rate": 7.195499096708908e-05,
"loss": 0.0023,
"num_tokens": 5505107.0,
"reward": 0.6477623581886292,
"reward_std": 0.3948605954647064,
"rewards/rollout_reward_func/mean": 0.6477623581886292,
"rewards/rollout_reward_func/std": 0.489864706993103,
"sampling/importance_sampling_ratio/max": 1.0000009536743164,
"sampling/importance_sampling_ratio/mean": 0.9999884963035583,
"sampling/importance_sampling_ratio/min": 0.9998813271522522,
"sampling/sampling_logp_difference/max": 7.403669587802142e-05,
"sampling/sampling_logp_difference/mean": 3.598812554628239e-06,
"step": 147,
"step_time": 11.450413204000597
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 198.65625,
"completions/mean_terminated_length": 198.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.508765266095452e-05,
"epoch": 0.00296,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.576803909614682e-05,
"kl": 5.068997144699097,
"learning_rate": 7.152413439309507e-05,
"loss": -0.0066,
"num_tokens": 5540702.0,
"reward": 0.7034207582473755,
"reward_std": 0.3982081413269043,
"rewards/rollout_reward_func/mean": 0.7034207582473755,
"rewards/rollout_reward_func/std": 0.4712841808795929,
"sampling/importance_sampling_ratio/max": 1.000001072883606,
"sampling/importance_sampling_ratio/mean": 0.9999918937683105,
"sampling/importance_sampling_ratio/min": 0.9999278783798218,
"sampling/sampling_logp_difference/max": 3.9339749491773546e-05,
"sampling/sampling_logp_difference/mean": 2.7800087991636246e-06,
"step": 148,
"step_time": 10.281147137000517
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 182.40625,
"completions/mean_terminated_length": 182.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.207081082720833e-05,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.50255832017865e-05,
"kl": 5.564902618527412,
"learning_rate": 7.109200955860391e-05,
"loss": -0.0056,
"num_tokens": 5577184.0,
"reward": 0.7362499833106995,
"reward_std": 0.46878689527511597,
"rewards/rollout_reward_func/mean": 0.7362499833106995,
"rewards/rollout_reward_func/std": 0.4626832604408264,
"sampling/importance_sampling_ratio/max": 1.000007152557373,
"sampling/importance_sampling_ratio/mean": 0.9999955892562866,
"sampling/importance_sampling_ratio/min": 0.9999680519104004,
"sampling/sampling_logp_difference/max": 3.171010030200705e-05,
"sampling/sampling_logp_difference/mean": 2.3544655505247647e-06,
"step": 149,
"step_time": 11.345688200000495
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 292.65625,
"completions/mean_terminated_length": 292.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010053441897639459,
"epoch": 0.003,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2554990209755488e-05,
"kl": 4.625477809458971,
"learning_rate": 7.065867719485405e-05,
"loss": -0.004,
"num_tokens": 5617306.0,
"reward": 0.7446038722991943,
"reward_std": 0.5122137665748596,
"rewards/rollout_reward_func/mean": 0.7446038722991943,
"rewards/rollout_reward_func/std": 0.5244807600975037,
"sampling/importance_sampling_ratio/max": 1.00002920627594,
"sampling/importance_sampling_ratio/mean": 0.9999974370002747,
"sampling/importance_sampling_ratio/min": 0.9999731183052063,
"sampling/sampling_logp_difference/max": 3.337630914757028e-05,
"sampling/sampling_logp_difference/mean": 2.951200485767913e-06,
"step": 150,
"step_time": 10.506773899000109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 222.65625,
"completions/mean_terminated_length": 222.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.440468580772631e-05,
"epoch": 0.00302,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.207312304060906e-05,
"kl": 4.787262956320774,
"learning_rate": 7.022419820279119e-05,
"loss": 0.0178,
"num_tokens": 5654322.0,
"reward": 0.673420786857605,
"reward_std": 0.4894208014011383,
"rewards/rollout_reward_func/mean": 0.673420786857605,
"rewards/rollout_reward_func/std": 0.477554589509964,
"sampling/importance_sampling_ratio/max": 1.0000011920928955,
"sampling/importance_sampling_ratio/mean": 0.9999935626983643,
"sampling/importance_sampling_ratio/min": 0.9999630451202393,
"sampling/sampling_logp_difference/max": 3.206796827726066e-05,
"sampling/sampling_logp_difference/mean": 2.7845492240885505e-06,
"step": 151,
"step_time": 11.023018873000638
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 187.1875,
"completions/mean_terminated_length": 187.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.2811070446523445e-05,
"epoch": 0.00304,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.808218985388521e-05,
"kl": 5.205201223492622,
"learning_rate": 6.978863364450934e-05,
"loss": 0.0038,
"num_tokens": 5689557.0,
"reward": 0.6399999856948853,
"reward_std": 0.4861735701560974,
"rewards/rollout_reward_func/mean": 0.6399999856948853,
"rewards/rollout_reward_func/std": 0.49357154965400696,
"sampling/importance_sampling_ratio/max": 1.0000072717666626,
"sampling/importance_sampling_ratio/mean": 0.9999970197677612,
"sampling/importance_sampling_ratio/min": 0.9999759197235107,
"sampling/sampling_logp_difference/max": 2.753835724433884e-05,
"sampling/sampling_logp_difference/mean": 1.748717750160722e-06,
"step": 152,
"step_time": 10.42158465600005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 236.3125,
"completions/mean_terminated_length": 236.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.034046837224196e-05,
"epoch": 0.00306,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1351421057479456e-05,
"kl": 4.991829484701157,
"learning_rate": 6.935204473466904e-05,
"loss": 0.0026,
"num_tokens": 5727626.0,
"reward": 0.7721707820892334,
"reward_std": 0.4510332942008972,
"rewards/rollout_reward_func/mean": 0.7721707820892334,
"rewards/rollout_reward_func/std": 0.4416636824607849,
"sampling/importance_sampling_ratio/max": 1.0000555515289307,
"sampling/importance_sampling_ratio/mean": 0.9999988079071045,
"sampling/importance_sampling_ratio/min": 0.9999600648880005,
"sampling/sampling_logp_difference/max": 5.256607983028516e-05,
"sampling/sampling_logp_difference/mean": 2.8970951007067924e-06,
"step": 153,
"step_time": 11.246140894999826
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 285.09375,
"completions/mean_terminated_length": 285.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.816164853759801e-05,
"epoch": 0.00308,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.816457294509746e-05,
"kl": 5.398580640554428,
"learning_rate": 6.891449283189408e-05,
"loss": -0.0029,
"num_tokens": 5767888.0,
"reward": 0.7736831307411194,
"reward_std": 0.4541749954223633,
"rewards/rollout_reward_func/mean": 0.7736831307411194,
"rewards/rollout_reward_func/std": 0.4432695508003235,
"sampling/importance_sampling_ratio/max": 1.0000208616256714,
"sampling/importance_sampling_ratio/mean": 0.9999924302101135,
"sampling/importance_sampling_ratio/min": 0.9999641180038452,
"sampling/sampling_logp_difference/max": 4.196213558316231e-05,
"sampling/sampling_logp_difference/mean": 3.3789206099754665e-06,
"step": 154,
"step_time": 10.789620635000574
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 168.5,
"completions/mean_terminated_length": 168.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.32246807899628e-05,
"epoch": 0.0031,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.940055754967034e-05,
"kl": 3.8871512040495872,
"learning_rate": 6.847603943014831e-05,
"loss": -0.0094,
"num_tokens": 5801517.0,
"reward": 0.9227623343467712,
"reward_std": 0.2715562582015991,
"rewards/rollout_reward_func/mean": 0.9227623343467712,
"rewards/rollout_reward_func/std": 0.29780396819114685,
"sampling/importance_sampling_ratio/max": 1.000018835067749,
"sampling/importance_sampling_ratio/mean": 0.9999949932098389,
"sampling/importance_sampling_ratio/min": 0.9999276399612427,
"sampling/sampling_logp_difference/max": 7.188395102275535e-05,
"sampling/sampling_logp_difference/mean": 2.5055819605768193e-06,
"step": 155,
"step_time": 11.14235565700028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 291.6875,
"completions/mean_terminated_length": 291.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.755855474224973e-05,
"epoch": 0.00312,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.6484132326440886e-05,
"kl": 5.283691808581352,
"learning_rate": 6.803674615009306e-05,
"loss": 0.0013,
"num_tokens": 5842047.0,
"reward": 0.6480915546417236,
"reward_std": 0.42874252796173096,
"rewards/rollout_reward_func/mean": 0.6480915546417236,
"rewards/rollout_reward_func/std": 0.49206840991973877,
"sampling/importance_sampling_ratio/max": 1.0000343322753906,
"sampling/importance_sampling_ratio/mean": 0.9999889135360718,
"sampling/importance_sampling_ratio/min": 0.9998984336853027,
"sampling/sampling_logp_difference/max": 5.508240428753197e-05,
"sampling/sampling_logp_difference/mean": 3.7744730434496887e-06,
"step": 156,
"step_time": 10.346119937999902
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 629.0,
"completions/max_terminated_length": 629.0,
"completions/mean_length": 192.3125,
"completions/mean_terminated_length": 192.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.816107470797306e-05,
"epoch": 0.00314,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.39569957961794e-05,
"kl": 5.069719024002552,
"learning_rate": 6.759667473042693e-05,
"loss": -0.0042,
"num_tokens": 5877952.0,
"reward": 0.6434208154678345,
"reward_std": 0.5129468441009521,
"rewards/rollout_reward_func/mean": 0.6434208154678345,
"rewards/rollout_reward_func/std": 0.49626046419143677,
"sampling/importance_sampling_ratio/max": 1.0000133514404297,
"sampling/importance_sampling_ratio/mean": 0.9999951124191284,
"sampling/importance_sampling_ratio/min": 0.9999496936798096,
"sampling/sampling_logp_difference/max": 3.862451922032051e-05,
"sampling/sampling_logp_difference/mean": 2.2066908513806993e-06,
"step": 157,
"step_time": 10.50308546999986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 155.96875,
"completions/mean_terminated_length": 155.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.922382314020979e-05,
"epoch": 0.00316,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.8020789108704776e-05,
"kl": 4.0720951023104135,
"learning_rate": 6.71558870192091e-05,
"loss": 0.0129,
"num_tokens": 5909654.0,
"reward": 0.7300000190734863,
"reward_std": 0.44941431283950806,
"rewards/rollout_reward_func/mean": 0.7300000190734863,
"rewards/rollout_reward_func/std": 0.45300430059432983,
"sampling/importance_sampling_ratio/max": 1.000025749206543,
"sampling/importance_sampling_ratio/mean": 0.9999992847442627,
"sampling/importance_sampling_ratio/min": 0.9999586343765259,
"sampling/sampling_logp_difference/max": 3.397205000510439e-05,
"sampling/sampling_logp_difference/mean": 2.13175098906504e-06,
"step": 158,
"step_time": 10.794820879999861
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 235.4375,
"completions/mean_terminated_length": 235.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.976622259775468e-05,
"epoch": 0.00318,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.966442353790626e-05,
"kl": 4.3015144392848015,
"learning_rate": 6.671444496516697e-05,
"loss": 0.0141,
"num_tokens": 5945688.0,
"reward": 0.7380915284156799,
"reward_std": 0.4598575234413147,
"rewards/rollout_reward_func/mean": 0.7380915284156799,
"rewards/rollout_reward_func/std": 0.45405706763267517,
"sampling/importance_sampling_ratio/max": 1.0000113248825073,
"sampling/importance_sampling_ratio/mean": 0.9999938011169434,
"sampling/importance_sampling_ratio/min": 0.9999215602874756,
"sampling/sampling_logp_difference/max": 8.154689567163587e-05,
"sampling/sampling_logp_difference/mean": 2.5779245333978906e-06,
"step": 159,
"step_time": 10.786944689001302
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 233.53125,
"completions/mean_terminated_length": 233.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.402424188602708e-05,
"epoch": 0.0032,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.717422416433692e-05,
"kl": 5.396564111113548,
"learning_rate": 6.627241060898992e-05,
"loss": 0.0013,
"num_tokens": 5983663.0,
"reward": 0.6759207844734192,
"reward_std": 0.49697309732437134,
"rewards/rollout_reward_func/mean": 0.6759207844734192,
"rewards/rollout_reward_func/std": 0.48406854271888733,
"sampling/importance_sampling_ratio/max": 1.0000170469284058,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.9998974204063416,
"sampling/sampling_logp_difference/max": 0.00010955836478387937,
"sampling/sampling_logp_difference/mean": 2.9753630315099144e-06,
"step": 160,
"step_time": 10.427135486000225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 659.0,
"completions/max_terminated_length": 659.0,
"completions/mean_length": 219.5625,
"completions/mean_terminated_length": 219.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.343402867832083e-05,
"epoch": 0.00322,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.0754632209427655e-05,
"kl": 4.563273504376411,
"learning_rate": 6.582984607461005e-05,
"loss": -0.0,
"num_tokens": 6020228.0,
"reward": 0.8640123605728149,
"reward_std": 0.32256150245666504,
"rewards/rollout_reward_func/mean": 0.8640123605728149,
"rewards/rollout_reward_func/std": 0.3677850067615509,
"sampling/importance_sampling_ratio/max": 1.0000296831130981,
"sampling/importance_sampling_ratio/mean": 0.9999971389770508,
"sampling/importance_sampling_ratio/min": 0.9999282360076904,
"sampling/sampling_logp_difference/max": 4.3036350689362735e-05,
"sampling/sampling_logp_difference/mean": 2.5196563910867553e-06,
"step": 161,
"step_time": 11.209509552999862
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 229.59375,
"completions/mean_terminated_length": 229.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.595711480869795e-05,
"epoch": 0.00324,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.432530790334567e-05,
"kl": 4.6679516807198524,
"learning_rate": 6.538681356047126e-05,
"loss": -0.0007,
"num_tokens": 6057489.0,
"reward": 0.7705915570259094,
"reward_std": 0.33682525157928467,
"rewards/rollout_reward_func/mean": 0.7705915570259094,
"rewards/rollout_reward_func/std": 0.4407337009906769,
"sampling/importance_sampling_ratio/max": 1.0000135898590088,
"sampling/importance_sampling_ratio/mean": 0.999994158744812,
"sampling/importance_sampling_ratio/min": 0.9999637603759766,
"sampling/sampling_logp_difference/max": 4.1843191866064444e-05,
"sampling/sampling_logp_difference/mean": 2.773166670522187e-06,
"step": 162,
"step_time": 10.208484574999602
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 207.3125,
"completions/mean_terminated_length": 207.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.422831117764872e-05,
"epoch": 0.00326,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0685478375526145e-05,
"kl": 4.779223263263702,
"learning_rate": 6.494337533078768e-05,
"loss": 0.0052,
"num_tokens": 6093527.0,
"reward": 0.6440123319625854,
"reward_std": 0.5318774580955505,
"rewards/rollout_reward_func/mean": 0.6440123319625854,
"rewards/rollout_reward_func/std": 0.5527451038360596,
"sampling/importance_sampling_ratio/max": 1.0000094175338745,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.99996018409729,
"sampling/sampling_logp_difference/max": 2.408051113889087e-05,
"sampling/sampling_logp_difference/mean": 2.0797965589736123e-06,
"step": 163,
"step_time": 11.039077626001017
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 224.78125,
"completions/mean_terminated_length": 224.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010505592480569703,
"epoch": 0.00328,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.714211197802797e-05,
"kl": 4.768290672451258,
"learning_rate": 6.449959370679315e-05,
"loss": -0.0051,
"num_tokens": 6130535.0,
"reward": 0.6136831045150757,
"reward_std": 0.5993773937225342,
"rewards/rollout_reward_func/mean": 0.6136831045150757,
"rewards/rollout_reward_func/std": 0.6141568422317505,
"sampling/importance_sampling_ratio/max": 1.0000272989273071,
"sampling/importance_sampling_ratio/mean": 0.9999916553497314,
"sampling/importance_sampling_ratio/min": 0.9998797178268433,
"sampling/sampling_logp_difference/max": 0.00010908614785876125,
"sampling/sampling_logp_difference/mean": 3.312961098345113e-06,
"step": 164,
"step_time": 10.286591632999716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 220.5625,
"completions/mean_terminated_length": 220.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.280044454522795e-05,
"epoch": 0.0033,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6241592422593385e-05,
"kl": 5.127960130572319,
"learning_rate": 6.40555310579825e-05,
"loss": 0.0136,
"num_tokens": 6167111.0,
"reward": 0.736512303352356,
"reward_std": 0.46028703451156616,
"rewards/rollout_reward_func/mean": 0.736512303352356,
"rewards/rollout_reward_func/std": 0.45586925745010376,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999931454658508,
"sampling/importance_sampling_ratio/min": 0.999862790107727,
"sampling/sampling_logp_difference/max": 0.00010932452278211713,
"sampling/sampling_logp_difference/mean": 3.687848220579326e-06,
"step": 165,
"step_time": 11.5190469510012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 211.75,
"completions/mean_terminated_length": 211.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.565107993277252e-05,
"epoch": 0.00332,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7061549189966172e-05,
"kl": 5.664609462022781,
"learning_rate": 6.3611249793346e-05,
"loss": 0.0054,
"num_tokens": 6203530.0,
"reward": 0.5784207582473755,
"reward_std": 0.5085035562515259,
"rewards/rollout_reward_func/mean": 0.5784207582473755,
"rewards/rollout_reward_func/std": 0.5033032298088074,
"sampling/importance_sampling_ratio/max": 1.000006914138794,
"sampling/importance_sampling_ratio/mean": 0.9999963641166687,
"sampling/importance_sampling_ratio/min": 0.9999628663063049,
"sampling/sampling_logp_difference/max": 3.2067451684270054e-05,
"sampling/sampling_logp_difference/mean": 1.9461851934465813e-06,
"step": 166,
"step_time": 10.759499068000423
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 243.09375,
"completions/mean_terminated_length": 243.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.166922380643427e-05,
"epoch": 0.00334,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.269746957812458e-05,
"kl": 5.353539019823074,
"learning_rate": 6.316681235259841e-05,
"loss": -0.0023,
"num_tokens": 6241522.0,
"reward": 0.7074331045150757,
"reward_std": 0.4500606656074524,
"rewards/rollout_reward_func/mean": 0.7074331045150757,
"rewards/rollout_reward_func/std": 0.5293745398521423,
"sampling/importance_sampling_ratio/max": 1.000022292137146,
"sampling/importance_sampling_ratio/mean": 0.9999964833259583,
"sampling/importance_sampling_ratio/min": 0.9999686479568481,
"sampling/sampling_logp_difference/max": 3.9577906136401e-05,
"sampling/sampling_logp_difference/mean": 2.4976732220238773e-06,
"step": 167,
"step_time": 10.744948780001323
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 316.25,
"completions/mean_terminated_length": 316.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.831737940795392e-05,
"epoch": 0.00336,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.081060589873232e-05,
"kl": 5.351280067116022,
"learning_rate": 6.272228119740365e-05,
"loss": -0.0001,
"num_tokens": 6283073.0,
"reward": 0.6187499761581421,
"reward_std": 0.5161277651786804,
"rewards/rollout_reward_func/mean": 0.6187499761581421,
"rewards/rollout_reward_func/std": 0.5022770762443542,
"sampling/importance_sampling_ratio/max": 1.0000226497650146,
"sampling/importance_sampling_ratio/mean": 0.9999954700469971,
"sampling/importance_sampling_ratio/min": 0.9999697208404541,
"sampling/sampling_logp_difference/max": 2.861044958990533e-05,
"sampling/sampling_logp_difference/mean": 2.5904687390720937e-06,
"step": 168,
"step_time": 10.471195426000122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 281.9375,
"completions/mean_terminated_length": 281.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.795457219434866e-05,
"epoch": 0.00338,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.220425009611063e-05,
"kl": 5.150362059473991,
"learning_rate": 6.227771880259637e-05,
"loss": 0.0149,
"num_tokens": 6323047.0,
"reward": 0.6793415546417236,
"reward_std": 0.5005303621292114,
"rewards/rollout_reward_func/mean": 0.6793415546417236,
"rewards/rollout_reward_func/std": 0.4792368412017822,
"sampling/importance_sampling_ratio/max": 1.00005042552948,
"sampling/importance_sampling_ratio/mean": 0.9999979138374329,
"sampling/importance_sampling_ratio/min": 0.9999761581420898,
"sampling/sampling_logp_difference/max": 4.2317173210904e-05,
"sampling/sampling_logp_difference/mean": 2.44653210756951e-06,
"step": 169,
"step_time": 11.402160383000137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 142.375,
"completions/mean_terminated_length": 142.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.836189982881933e-05,
"epoch": 0.0034,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.064507109229453e-05,
"kl": 4.963020361959934,
"learning_rate": 6.183318764740161e-05,
"loss": -0.0042,
"num_tokens": 6356441.0,
"reward": 0.7005915641784668,
"reward_std": 0.48488616943359375,
"rewards/rollout_reward_func/mean": 0.7005915641784668,
"rewards/rollout_reward_func/std": 0.4747658967971802,
"sampling/importance_sampling_ratio/max": 1.000011682510376,
"sampling/importance_sampling_ratio/mean": 0.999997079372406,
"sampling/importance_sampling_ratio/min": 0.9999744892120361,
"sampling/sampling_logp_difference/max": 3.075617496506311e-05,
"sampling/sampling_logp_difference/mean": 1.6916058029892156e-06,
"step": 170,
"step_time": 10.331447902999571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 191.6875,
"completions/mean_terminated_length": 191.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.195598889353505e-05,
"epoch": 0.00342,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0986659112386405e-05,
"kl": 4.95799808204174,
"learning_rate": 6.138875020665402e-05,
"loss": -0.001,
"num_tokens": 6392772.0,
"reward": 0.7986831665039062,
"reward_std": 0.43947798013687134,
"rewards/rollout_reward_func/mean": 0.7986831665039062,
"rewards/rollout_reward_func/std": 0.4244628846645355,
"sampling/importance_sampling_ratio/max": 1.000011682510376,
"sampling/importance_sampling_ratio/mean": 0.9999972581863403,
"sampling/importance_sampling_ratio/min": 0.999981164932251,
"sampling/sampling_logp_difference/max": 2.6346318918513134e-05,
"sampling/sampling_logp_difference/mean": 2.3348138711298816e-06,
"step": 171,
"step_time": 10.487647148000178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 304.59375,
"completions/mean_terminated_length": 304.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.308483593566507e-05,
"epoch": 0.00344,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.993174686911516e-05,
"kl": 5.391621388494968,
"learning_rate": 6.0944468942017506e-05,
"loss": -0.0032,
"num_tokens": 6433785.0,
"reward": 0.6780915260314941,
"reward_std": 0.47621750831604004,
"rewards/rollout_reward_func/mean": 0.6780915260314941,
"rewards/rollout_reward_func/std": 0.4856433868408203,
"sampling/importance_sampling_ratio/max": 1.0000230073928833,
"sampling/importance_sampling_ratio/mean": 0.9999980330467224,
"sampling/importance_sampling_ratio/min": 0.9999696016311646,
"sampling/sampling_logp_difference/max": 3.433286474319175e-05,
"sampling/sampling_logp_difference/mean": 2.4288412987516494e-06,
"step": 172,
"step_time": 11.141727451000861
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 228.15625,
"completions/mean_terminated_length": 228.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.799076345487265e-05,
"epoch": 0.00346,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.5050336009589955e-05,
"kl": 5.1544738709926605,
"learning_rate": 6.050040629320685e-05,
"loss": -0.0035,
"num_tokens": 6471147.0,
"reward": 0.6752623319625854,
"reward_std": 0.36605003476142883,
"rewards/rollout_reward_func/mean": 0.6752623319625854,
"rewards/rollout_reward_func/std": 0.4842662811279297,
"sampling/importance_sampling_ratio/max": 1.0000275373458862,
"sampling/importance_sampling_ratio/mean": 0.9999979734420776,
"sampling/importance_sampling_ratio/min": 0.9999769926071167,
"sampling/sampling_logp_difference/max": 3.2067451684270054e-05,
"sampling/sampling_logp_difference/mean": 1.975222858163761e-06,
"step": 173,
"step_time": 10.814325706000545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 216.21875,
"completions/mean_terminated_length": 216.21875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.060093649930877e-05,
"epoch": 0.00348,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.7824895116500556e-05,
"kl": 4.555030801682733,
"learning_rate": 6.0056624669212335e-05,
"loss": -0.0107,
"num_tokens": 6508087.0,
"reward": 0.8005247116088867,
"reward_std": 0.4093303382396698,
"rewards/rollout_reward_func/mean": 0.8005247116088867,
"rewards/rollout_reward_func/std": 0.4889991879463196,
"sampling/importance_sampling_ratio/max": 1.0000141859054565,
"sampling/importance_sampling_ratio/mean": 0.9999945163726807,
"sampling/importance_sampling_ratio/min": 0.9999269247055054,
"sampling/sampling_logp_difference/max": 6.58156059216708e-05,
"sampling/sampling_logp_difference/mean": 2.803930328809656e-06,
"step": 174,
"step_time": 10.91736506000052
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 224.6875,
"completions/mean_terminated_length": 224.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.575894181854892e-05,
"epoch": 0.0035,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.9673146855202504e-05,
"kl": 4.930005133152008,
"learning_rate": 5.961318643952876e-05,
"loss": -0.001,
"num_tokens": 6545080.0,
"reward": 0.5815123319625854,
"reward_std": 0.5429329872131348,
"rewards/rollout_reward_func/mean": 0.5815123319625854,
"rewards/rollout_reward_func/std": 0.5628648996353149,
"sampling/importance_sampling_ratio/max": 1.0000128746032715,
"sampling/importance_sampling_ratio/mean": 0.9999929666519165,
"sampling/importance_sampling_ratio/min": 0.9999529123306274,
"sampling/sampling_logp_difference/max": 5.090353079140186e-05,
"sampling/sampling_logp_difference/mean": 2.7515507099451497e-06,
"step": 175,
"step_time": 11.003749063999749
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 260.0,
"completions/mean_terminated_length": 260.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.6100228437449e-05,
"epoch": 0.00352,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7575541278347373e-05,
"kl": 4.587189957499504,
"learning_rate": 5.917015392538995e-05,
"loss": 0.0078,
"num_tokens": 6582632.0,
"reward": 0.7384207844734192,
"reward_std": 0.4618578553199768,
"rewards/rollout_reward_func/mean": 0.7384207844734192,
"rewards/rollout_reward_func/std": 0.4556211829185486,
"sampling/importance_sampling_ratio/max": 1.000009298324585,
"sampling/importance_sampling_ratio/mean": 0.9999972581863403,
"sampling/importance_sampling_ratio/min": 0.9999776482582092,
"sampling/sampling_logp_difference/max": 2.0385952666401863e-05,
"sampling/sampling_logp_difference/mean": 2.221093836851651e-06,
"step": 176,
"step_time": 10.7899746460007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 235.1875,
"completions/mean_terminated_length": 235.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.040411874153051e-05,
"epoch": 0.00354,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.335916335345246e-05,
"kl": 5.1722188803069,
"learning_rate": 5.872758939101011e-05,
"loss": -0.0138,
"num_tokens": 6620470.0,
"reward": 0.6118415594100952,
"reward_std": 0.3704405725002289,
"rewards/rollout_reward_func/mean": 0.6118415594100952,
"rewards/rollout_reward_func/std": 0.5016486644744873,
"sampling/importance_sampling_ratio/max": 1.0000079870224,
"sampling/importance_sampling_ratio/mean": 0.9999971389770508,
"sampling/importance_sampling_ratio/min": 0.9999728202819824,
"sampling/sampling_logp_difference/max": 2.1219901100266725e-05,
"sampling/sampling_logp_difference/mean": 1.7425376199753373e-06,
"step": 177,
"step_time": 10.707822592999946
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 255.03125,
"completions/mean_terminated_length": 255.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.830880042230092e-05,
"epoch": 0.00356,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.9521069993497804e-05,
"kl": 5.192914575338364,
"learning_rate": 5.828555503483305e-05,
"loss": -0.0072,
"num_tokens": 6659130.0,
"reward": 0.6134207844734192,
"reward_std": 0.48705360293388367,
"rewards/rollout_reward_func/mean": 0.6134207844734192,
"rewards/rollout_reward_func/std": 0.5055395364761353,
"sampling/importance_sampling_ratio/max": 1.0000206232070923,
"sampling/importance_sampling_ratio/mean": 0.9999992847442627,
"sampling/importance_sampling_ratio/min": 0.9999740123748779,
"sampling/sampling_logp_difference/max": 2.705884980969131e-05,
"sampling/sampling_logp_difference/mean": 2.5482090677542146e-06,
"step": 178,
"step_time": 10.992413979999128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 225.25,
"completions/mean_terminated_length": 225.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.845150574325999e-05,
"epoch": 0.00358,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7154950657859445e-05,
"kl": 4.836911082267761,
"learning_rate": 5.784411298079091e-05,
"loss": 0.001,
"num_tokens": 6696650.0,
"reward": 0.6777623891830444,
"reward_std": 0.5198129415512085,
"rewards/rollout_reward_func/mean": 0.6777623891830444,
"rewards/rollout_reward_func/std": 0.5467191338539124,
"sampling/importance_sampling_ratio/max": 1.0000206232070923,
"sampling/importance_sampling_ratio/mean": 0.9999957084655762,
"sampling/importance_sampling_ratio/min": 0.999954104423523,
"sampling/sampling_logp_difference/max": 3.7670324672944844e-05,
"sampling/sampling_logp_difference/mean": 3.4365023111604387e-06,
"step": 179,
"step_time": 10.803954972000156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 295.0625,
"completions/mean_terminated_length": 295.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.146770800327886e-05,
"epoch": 0.0036,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.348440052126534e-05,
"kl": 5.05779367685318,
"learning_rate": 5.740332526957307e-05,
"loss": 0.0057,
"num_tokens": 6736701.0,
"reward": 0.6796707510948181,
"reward_std": 0.46409231424331665,
"rewards/rollout_reward_func/mean": 0.6796707510948181,
"rewards/rollout_reward_func/std": 0.48484089970588684,
"sampling/importance_sampling_ratio/max": 1.0000330209732056,
"sampling/importance_sampling_ratio/mean": 0.999993622303009,
"sampling/importance_sampling_ratio/min": 0.9999408721923828,
"sampling/sampling_logp_difference/max": 5.281723861116916e-05,
"sampling/sampling_logp_difference/mean": 3.24379061567015e-06,
"step": 180,
"step_time": 11.004632821999621
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 307.5625,
"completions/mean_terminated_length": 307.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.063932723383459e-05,
"epoch": 0.00362,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0221406884957105e-05,
"kl": 5.286682575941086,
"learning_rate": 5.696325384990696e-05,
"loss": 0.0138,
"num_tokens": 6777629.0,
"reward": 0.5859208106994629,
"reward_std": 0.46358904242515564,
"rewards/rollout_reward_func/mean": 0.5859208106994629,
"rewards/rollout_reward_func/std": 0.5023459792137146,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999953508377075,
"sampling/importance_sampling_ratio/min": 0.9999548196792603,
"sampling/sampling_logp_difference/max": 4.208114114589989e-05,
"sampling/sampling_logp_difference/mean": 2.680135366972536e-06,
"step": 181,
"step_time": 11.124628821999067
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 659.0,
"completions/max_terminated_length": 659.0,
"completions/mean_length": 222.6875,
"completions/mean_terminated_length": 222.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.661826836624641e-05,
"epoch": 0.00364,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.2959258027840406e-05,
"kl": 5.0524120181798935,
"learning_rate": 5.6523960569851695e-05,
"loss": -0.0093,
"num_tokens": 6814561.0,
"reward": 0.7361831665039062,
"reward_std": 0.4711652994155884,
"rewards/rollout_reward_func/mean": 0.7361831665039062,
"rewards/rollout_reward_func/std": 0.4640645682811737,
"sampling/importance_sampling_ratio/max": 1.0000171661376953,
"sampling/importance_sampling_ratio/mean": 0.9999977350234985,
"sampling/importance_sampling_ratio/min": 0.9999733567237854,
"sampling/sampling_logp_difference/max": 3.1111223506741226e-05,
"sampling/sampling_logp_difference/mean": 2.562888994361856e-06,
"step": 182,
"step_time": 10.962344157000189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 253.375,
"completions/mean_terminated_length": 253.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.879004907351828e-05,
"epoch": 0.00366,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.388055094750598e-05,
"kl": 5.53626848757267,
"learning_rate": 5.6085507168105936e-05,
"loss": 0.0083,
"num_tokens": 6853636.0,
"reward": 0.6771707534790039,
"reward_std": 0.47990167140960693,
"rewards/rollout_reward_func/mean": 0.6771707534790039,
"rewards/rollout_reward_func/std": 0.4823152720928192,
"sampling/importance_sampling_ratio/max": 1.000022053718567,
"sampling/importance_sampling_ratio/mean": 0.999996542930603,
"sampling/importance_sampling_ratio/min": 0.9998987913131714,
"sampling/sampling_logp_difference/max": 8.393276948481798e-05,
"sampling/sampling_logp_difference/mean": 3.3215160328836646e-06,
"step": 183,
"step_time": 10.043853558999217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 278.5,
"completions/mean_terminated_length": 278.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010487484094312549,
"epoch": 0.00368,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7909642085433006e-05,
"kl": 4.555940642952919,
"learning_rate": 5.5647955265330974e-05,
"loss": 0.0018,
"num_tokens": 6893156.0,
"reward": 0.6486831307411194,
"reward_std": 0.5506794452667236,
"rewards/rollout_reward_func/mean": 0.6486831307411194,
"rewards/rollout_reward_func/std": 0.5536602139472961,
"sampling/importance_sampling_ratio/max": 1.000025987625122,
"sampling/importance_sampling_ratio/mean": 0.9999922513961792,
"sampling/importance_sampling_ratio/min": 0.9998563528060913,
"sampling/sampling_logp_difference/max": 0.00014341842324938625,
"sampling/sampling_logp_difference/mean": 3.5290327105030883e-06,
"step": 184,
"step_time": 11.444932727999912
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 215.75,
"completions/mean_terminated_length": 215.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.568410264013892e-05,
"epoch": 0.0037,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.518177618389018e-05,
"kl": 4.69642661511898,
"learning_rate": 5.5211366355490666e-05,
"loss": -0.0085,
"num_tokens": 6930186.0,
"reward": 0.8646707534790039,
"reward_std": 0.3878023624420166,
"rewards/rollout_reward_func/mean": 0.8646707534790039,
"rewards/rollout_reward_func/std": 0.37504494190216064,
"sampling/importance_sampling_ratio/max": 1.0000090599060059,
"sampling/importance_sampling_ratio/mean": 0.9999953508377075,
"sampling/importance_sampling_ratio/min": 0.9999690651893616,
"sampling/sampling_logp_difference/max": 4.506219192990102e-05,
"sampling/sampling_logp_difference/mean": 2.9929221909696935e-06,
"step": 185,
"step_time": 10.029982729999574
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 193.375,
"completions/mean_terminated_length": 193.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.111554115761919e-05,
"epoch": 0.00372,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.515595744829625e-05,
"kl": 5.0036322847008705,
"learning_rate": 5.4775801797208824e-05,
"loss": -0.0062,
"num_tokens": 6966304.0,
"reward": 0.7059208154678345,
"reward_std": 0.4823508858680725,
"rewards/rollout_reward_func/mean": 0.7059208154678345,
"rewards/rollout_reward_func/std": 0.47572779655456543,
"sampling/importance_sampling_ratio/max": 1.0000250339508057,
"sampling/importance_sampling_ratio/mean": 0.9999973773956299,
"sampling/importance_sampling_ratio/min": 0.9999443292617798,
"sampling/sampling_logp_difference/max": 4.649218681151979e-05,
"sampling/sampling_logp_difference/mean": 2.706846771616256e-06,
"step": 186,
"step_time": 11.153666612000507
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 225.21875,
"completions/mean_terminated_length": 225.21875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.528406706886926e-05,
"epoch": 0.00374,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1905819923849776e-05,
"kl": 4.318456851877272,
"learning_rate": 5.434132280514597e-05,
"loss": -0.0056,
"num_tokens": 7003007.0,
"reward": 0.7396707534790039,
"reward_std": 0.4821699857711792,
"rewards/rollout_reward_func/mean": 0.7396707534790039,
"rewards/rollout_reward_func/std": 0.4620356559753418,
"sampling/importance_sampling_ratio/max": 1.0000288486480713,
"sampling/importance_sampling_ratio/mean": 0.9999913573265076,
"sampling/importance_sampling_ratio/min": 0.9999553561210632,
"sampling/sampling_logp_difference/max": 4.5657627197215334e-05,
"sampling/sampling_logp_difference/mean": 3.5564712561608758e-06,
"step": 187,
"step_time": 10.209738572000333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 156.25,
"completions/mean_terminated_length": 156.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.4102910081278424e-05,
"epoch": 0.00376,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1708140846167225e-05,
"kl": 4.209350232034922,
"learning_rate": 5.39079904413961e-05,
"loss": 0.008,
"num_tokens": 7036551.0,
"reward": 0.6680915355682373,
"reward_std": 0.49390894174575806,
"rewards/rollout_reward_func/mean": 0.6680915355682373,
"rewards/rollout_reward_func/std": 0.4823147654533386,
"sampling/importance_sampling_ratio/max": 1.000010371208191,
"sampling/importance_sampling_ratio/mean": 0.9999985694885254,
"sampling/importance_sampling_ratio/min": 0.9999769926071167,
"sampling/sampling_logp_difference/max": 2.1696479961974546e-05,
"sampling/sampling_logp_difference/mean": 1.3542378383135656e-06,
"step": 188,
"step_time": 11.282029985001373
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 211.25,
"completions/mean_terminated_length": 211.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.414088152302156e-05,
"epoch": 0.00378,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.6090696085011587e-05,
"kl": 4.265987630002201,
"learning_rate": 5.347586560690494e-05,
"loss": -0.0067,
"num_tokens": 7072324.0,
"reward": 0.7690123319625854,
"reward_std": 0.37984663248062134,
"rewards/rollout_reward_func/mean": 0.7690123319625854,
"rewards/rollout_reward_func/std": 0.43979594111442566,
"sampling/importance_sampling_ratio/max": 1.0000287294387817,
"sampling/importance_sampling_ratio/mean": 0.9999973177909851,
"sampling/importance_sampling_ratio/min": 0.9999397397041321,
"sampling/sampling_logp_difference/max": 4.434636502992362e-05,
"sampling/sampling_logp_difference/mean": 2.806390966725303e-06,
"step": 189,
"step_time": 10.240539944999455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 217.5625,
"completions/mean_terminated_length": 217.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.151194306549314e-05,
"epoch": 0.0038,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1802683477289975e-05,
"kl": 5.158947452902794,
"learning_rate": 5.304500903291094e-05,
"loss": 0.0011,
"num_tokens": 7109869.0,
"reward": 0.7086831331253052,
"reward_std": 0.5108458399772644,
"rewards/rollout_reward_func/mean": 0.7086831331253052,
"rewards/rollout_reward_func/std": 0.5356149673461914,
"sampling/importance_sampling_ratio/max": 1.0000258684158325,
"sampling/importance_sampling_ratio/mean": 0.9999958872795105,
"sampling/importance_sampling_ratio/min": 0.9999440908432007,
"sampling/sampling_logp_difference/max": 2.7300320653012022e-05,
"sampling/sampling_logp_difference/mean": 2.319904979231069e-06,
"step": 190,
"step_time": 11.589442052000777
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 167.09375,
"completions/mean_terminated_length": 167.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.836062217199014e-05,
"epoch": 0.00382,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.9179961933987215e-05,
"kl": 3.6461506029590964,
"learning_rate": 5.261548127240997e-05,
"loss": -0.0062,
"num_tokens": 7142827.0,
"reward": 0.7652623057365417,
"reward_std": 0.3935619592666626,
"rewards/rollout_reward_func/mean": 0.7652623057365417,
"rewards/rollout_reward_func/std": 0.5054081082344055,
"sampling/importance_sampling_ratio/max": 1.0000618696212769,
"sampling/importance_sampling_ratio/mean": 0.9999986290931702,
"sampling/importance_sampling_ratio/min": 0.9999737739562988,
"sampling/sampling_logp_difference/max": 9.068677900359035e-05,
"sampling/sampling_logp_difference/mean": 2.8852020932390587e-06,
"step": 191,
"step_time": 10.1199132500019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 295.90625,
"completions/mean_terminated_length": 295.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.60994704762652e-05,
"epoch": 0.00384,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.605450547183864e-05,
"kl": 4.935833718627691,
"learning_rate": 5.218734269164519e-05,
"loss": 0.0128,
"num_tokens": 7183293.0,
"reward": 0.5855915546417236,
"reward_std": 0.5059728026390076,
"rewards/rollout_reward_func/mean": 0.5855915546417236,
"rewards/rollout_reward_func/std": 0.5033941864967346,
"sampling/importance_sampling_ratio/max": 1.0000156164169312,
"sampling/importance_sampling_ratio/mean": 0.9999943971633911,
"sampling/importance_sampling_ratio/min": 0.9999115467071533,
"sampling/sampling_logp_difference/max": 8.84538603713736e-05,
"sampling/sampling_logp_difference/mean": 2.977001713588834e-06,
"step": 192,
"step_time": 11.594122014000277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 224.625,
"completions/mean_terminated_length": 224.625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.150646666786997e-05,
"epoch": 0.00386,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4954373909858987e-05,
"kl": 4.582384366542101,
"learning_rate": 5.176065346162311e-05,
"loss": -0.0057,
"num_tokens": 7220905.0,
"reward": 0.6765123009681702,
"reward_std": 0.5298900604248047,
"rewards/rollout_reward_func/mean": 0.6765123009681702,
"rewards/rollout_reward_func/std": 0.5459093451499939,
"sampling/importance_sampling_ratio/max": 1.0000386238098145,
"sampling/importance_sampling_ratio/mean": 0.9999974370002747,
"sampling/importance_sampling_ratio/min": 0.9999622106552124,
"sampling/sampling_logp_difference/max": 3.5402728826738894e-05,
"sampling/sampling_logp_difference/mean": 2.2574688500753837e-06,
"step": 193,
"step_time": 10.853508633000274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 350.03125,
"completions/mean_terminated_length": 350.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.001271817192901e-05,
"epoch": 0.00388,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.810703492490575e-05,
"kl": 5.076992951333523,
"learning_rate": 5.1335473549657084e-05,
"loss": 0.0259,
"num_tokens": 7262276.0,
"reward": 0.6180915236473083,
"reward_std": 0.4720328152179718,
"rewards/rollout_reward_func/mean": 0.6180915236473083,
"rewards/rollout_reward_func/std": 0.49200955033302307,
"sampling/importance_sampling_ratio/max": 1.0000076293945312,
"sampling/importance_sampling_ratio/mean": 0.999992847442627,
"sampling/importance_sampling_ratio/min": 0.9999713897705078,
"sampling/sampling_logp_difference/max": 2.563028465374373e-05,
"sampling/sampling_logp_difference/mean": 2.8041777113685384e-06,
"step": 194,
"step_time": 10.52614910399916
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 172.90625,
"completions/mean_terminated_length": 172.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.115065123106888e-05,
"epoch": 0.0039,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.7086392189376056e-05,
"kl": 5.180005930364132,
"learning_rate": 5.0911862710939485e-05,
"loss": -0.0109,
"num_tokens": 7298179.0,
"reward": 0.578091561794281,
"reward_std": 0.4882219433784485,
"rewards/rollout_reward_func/mean": 0.578091561794281,
"rewards/rollout_reward_func/std": 0.5080995559692383,
"sampling/importance_sampling_ratio/max": 1.0000019073486328,
"sampling/importance_sampling_ratio/mean": 0.9999942183494568,
"sampling/importance_sampling_ratio/min": 0.9999573230743408,
"sampling/sampling_logp_difference/max": 3.194832243025303e-05,
"sampling/sampling_logp_difference/mean": 2.1487360299943248e-06,
"step": 195,
"step_time": 11.431366815999809
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 247.6875,
"completions/mean_terminated_length": 247.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.820983522004553e-05,
"epoch": 0.00392,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9182505639037117e-05,
"kl": 4.828364592045546,
"learning_rate": 5.0489880480143605e-05,
"loss": 0.0046,
"num_tokens": 7336132.0,
"reward": 0.706183135509491,
"reward_std": 0.4439047574996948,
"rewards/rollout_reward_func/mean": 0.706183135509491,
"rewards/rollout_reward_func/std": 0.473155677318573,
"sampling/importance_sampling_ratio/max": 1.000019907951355,
"sampling/importance_sampling_ratio/mean": 0.9999977350234985,
"sampling/importance_sampling_ratio/min": 0.9999704360961914,
"sampling/sampling_logp_difference/max": 4.1127488657366484e-05,
"sampling/sampling_logp_difference/mean": 1.965895080502378e-06,
"step": 196,
"step_time": 10.270143649001056
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 243.25,
"completions/mean_terminated_length": 243.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.612786135448914e-05,
"epoch": 0.00394,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.341598393395543e-05,
"kl": 5.151996046304703,
"learning_rate": 5.0069586163056615e-05,
"loss": -0.0132,
"num_tokens": 7375208.0,
"reward": 0.6177623271942139,
"reward_std": 0.5074785947799683,
"rewards/rollout_reward_func/mean": 0.6177623271942139,
"rewards/rollout_reward_func/std": 0.6169306039810181,
"sampling/importance_sampling_ratio/max": 1.0000261068344116,
"sampling/importance_sampling_ratio/mean": 0.9999943971633911,
"sampling/importance_sampling_ratio/min": 0.9998465776443481,
"sampling/sampling_logp_difference/max": 0.00015128619270399213,
"sampling/sampling_logp_difference/mean": 3.3790561246860307e-06,
"step": 197,
"step_time": 10.915793876999032
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 254.09375,
"completions/mean_terminated_length": 254.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.961041109500911e-05,
"epoch": 0.00396,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.339655060903169e-05,
"kl": 5.318888820707798,
"learning_rate": 4.96510388282447e-05,
"loss": 0.0028,
"num_tokens": 7413139.0,
"reward": 0.6443415880203247,
"reward_std": 0.4514845907688141,
"rewards/rollout_reward_func/mean": 0.6443415880203247,
"rewards/rollout_reward_func/std": 0.49244067072868347,
"sampling/importance_sampling_ratio/max": 1.0000090599060059,
"sampling/importance_sampling_ratio/mean": 0.9999953508377075,
"sampling/importance_sampling_ratio/min": 0.9998794198036194,
"sampling/sampling_logp_difference/max": 9.40619720495306e-05,
"sampling/sampling_logp_difference/mean": 2.942206265288405e-06,
"step": 198,
"step_time": 10.681011859999671
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 242.28125,
"completions/mean_terminated_length": 242.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.84905787330581e-05,
"epoch": 0.00398,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2986649128142744e-05,
"kl": 5.349530559033155,
"learning_rate": 4.9234297298751484e-05,
"loss": 0.0035,
"num_tokens": 7451845.0,
"reward": 0.5530915260314941,
"reward_std": 0.5549728870391846,
"rewards/rollout_reward_func/mean": 0.5530915260314941,
"rewards/rollout_reward_func/std": 0.5644925236701965,
"sampling/importance_sampling_ratio/max": 1.000011682510376,
"sampling/importance_sampling_ratio/mean": 0.9999925494194031,
"sampling/importance_sampling_ratio/min": 0.9999537467956543,
"sampling/sampling_logp_difference/max": 3.862451922032051e-05,
"sampling/sampling_logp_difference/mean": 2.9472043934219982e-06,
"step": 199,
"step_time": 10.867003692000253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 271.6875,
"completions/mean_terminated_length": 271.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.556861286024287e-05,
"epoch": 0.004,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.539603105513379e-05,
"kl": 5.291132360696793,
"learning_rate": 4.881942014383094e-05,
"loss": 0.0038,
"num_tokens": 7491724.0,
"reward": 0.7399331331253052,
"reward_std": 0.4636785089969635,
"rewards/rollout_reward_func/mean": 0.7399331331253052,
"rewards/rollout_reward_func/std": 0.4593670070171356,
"sampling/importance_sampling_ratio/max": 1.000006079673767,
"sampling/importance_sampling_ratio/mean": 0.9999874830245972,
"sampling/importance_sampling_ratio/min": 0.9997283816337585,
"sampling/sampling_logp_difference/max": 0.00025416005519218743,
"sampling/sampling_logp_difference/mean": 3.923657914128853e-06,
"step": 200,
"step_time": 11.005329466000148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 221.09375,
"completions/mean_terminated_length": 221.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.380929744726927e-05,
"epoch": 0.00402,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.2547788072843105e-05,
"kl": 5.06697791069746,
"learning_rate": 4.8406465670716076e-05,
"loss": -0.0034,
"num_tokens": 7529039.0,
"reward": 0.7384207844734192,
"reward_std": 0.48120927810668945,
"rewards/rollout_reward_func/mean": 0.7384207844734192,
"rewards/rollout_reward_func/std": 0.4612503945827484,
"sampling/importance_sampling_ratio/max": 1.000004529953003,
"sampling/importance_sampling_ratio/mean": 0.9999886155128479,
"sampling/importance_sampling_ratio/min": 0.999925434589386,
"sampling/sampling_logp_difference/max": 8.261243056040257e-05,
"sampling/sampling_logp_difference/mean": 3.649963218776975e-06,
"step": 201,
"step_time": 11.135630174998823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 218.53125,
"completions/mean_terminated_length": 218.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.144260962694716e-05,
"epoch": 0.00404,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.080134284682572e-05,
"kl": 4.722892113029957,
"learning_rate": 4.799549191642437e-05,
"loss": -0.0093,
"num_tokens": 7565841.0,
"reward": 0.8027622699737549,
"reward_std": 0.27080461382865906,
"rewards/rollout_reward_func/mean": 0.8027622699737549,
"rewards/rollout_reward_func/std": 0.42055216431617737,
"sampling/importance_sampling_ratio/max": 1.0000193119049072,
"sampling/importance_sampling_ratio/mean": 0.9999892711639404,
"sampling/importance_sampling_ratio/min": 0.9997215867042542,
"sampling/sampling_logp_difference/max": 0.0002580939617473632,
"sampling/sampling_logp_difference/mean": 4.462499873625347e-06,
"step": 202,
"step_time": 10.46230581699956
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 295.09375,
"completions/mean_terminated_length": 295.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.879004566608728e-05,
"epoch": 0.00406,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.695196548709646e-05,
"kl": 4.424702264368534,
"learning_rate": 4.7586556639601154e-05,
"loss": -0.0063,
"num_tokens": 7605323.0,
"reward": 0.8055915832519531,
"reward_std": 0.3435424566268921,
"rewards/rollout_reward_func/mean": 0.8055915832519531,
"rewards/rollout_reward_func/std": 0.4181799292564392,
"sampling/importance_sampling_ratio/max": 1.0000042915344238,
"sampling/importance_sampling_ratio/mean": 0.9999926090240479,
"sampling/importance_sampling_ratio/min": 0.9999536871910095,
"sampling/sampling_logp_difference/max": 4.494252061704174e-05,
"sampling/sampling_logp_difference/mean": 2.7982086976408027e-06,
"step": 203,
"step_time": 11.714568361000147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 238.375,
"completions/mean_terminated_length": 238.375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.828146078419195e-05,
"epoch": 0.00408,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.576986611937173e-05,
"kl": 4.861797966063023,
"learning_rate": 4.71797173124022e-05,
"loss": 0.002,
"num_tokens": 7642942.0,
"reward": 0.7380915284156799,
"reward_std": 0.4791000783443451,
"rewards/rollout_reward_func/mean": 0.7380915284156799,
"rewards/rollout_reward_func/std": 0.45896562933921814,
"sampling/importance_sampling_ratio/max": 1.0000057220458984,
"sampling/importance_sampling_ratio/mean": 0.9999947547912598,
"sampling/importance_sampling_ratio/min": 0.9999723434448242,
"sampling/sampling_logp_difference/max": 3.445236143306829e-05,
"sampling/sampling_logp_difference/mean": 2.5225040189980064e-06,
"step": 204,
"step_time": 10.50498939000181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 218.1875,
"completions/mean_terminated_length": 218.1875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.999684122708459e-05,
"epoch": 0.0041,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3910033885622397e-05,
"kl": 5.3215382900089025,
"learning_rate": 4.67750311124165e-05,
"loss": 0.0004,
"num_tokens": 7679934.0,
"reward": 0.6737499833106995,
"reward_std": 0.381944477558136,
"rewards/rollout_reward_func/mean": 0.6737499833106995,
"rewards/rollout_reward_func/std": 0.477099746465683,
"sampling/importance_sampling_ratio/max": 1.000003457069397,
"sampling/importance_sampling_ratio/mean": 0.9999958276748657,
"sampling/importance_sampling_ratio/min": 0.9999797344207764,
"sampling/sampling_logp_difference/max": 2.3484337361878715e-05,
"sampling/sampling_logp_difference/mean": 1.814927827581414e-06,
"step": 205,
"step_time": 11.114921720999519
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 248.03125,
"completions/mean_terminated_length": 248.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.976049931812668e-05,
"epoch": 0.00412,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.381532613071613e-05,
"kl": 4.561372932046652,
"learning_rate": 4.63725549146305e-05,
"loss": 0.0129,
"num_tokens": 7718197.0,
"reward": 0.8037499785423279,
"reward_std": 0.43239593505859375,
"rewards/rollout_reward_func/mean": 0.8037499785423279,
"rewards/rollout_reward_func/std": 0.4172239303588867,
"sampling/importance_sampling_ratio/max": 1.0000351667404175,
"sampling/importance_sampling_ratio/mean": 0.9999947547912598,
"sampling/importance_sampling_ratio/min": 0.9998741149902344,
"sampling/sampling_logp_difference/max": 0.0001127816503867507,
"sampling/sampling_logp_difference/mean": 2.9079683372401632e-06,
"step": 206,
"step_time": 10.3316515169995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 218.03125,
"completions/mean_terminated_length": 218.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.278466614162426e-05,
"epoch": 0.00414,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.739985888591036e-05,
"kl": 4.738906025886536,
"learning_rate": 4.597234528343477e-05,
"loss": -0.0071,
"num_tokens": 7754920.0,
"reward": 0.7693415880203247,
"reward_std": 0.3719741702079773,
"rewards/rollout_reward_func/mean": 0.7693415880203247,
"rewards/rollout_reward_func/std": 0.44077280163764954,
"sampling/importance_sampling_ratio/max": 1.0000015497207642,
"sampling/importance_sampling_ratio/mean": 0.9999948740005493,
"sampling/importance_sampling_ratio/min": 0.9999737739562988,
"sampling/sampling_logp_difference/max": 2.1100262529216707e-05,
"sampling/sampling_logp_difference/mean": 2.0317766029620543e-06,
"step": 207,
"step_time": 11.240013908000037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 214.9375,
"completions/mean_terminated_length": 214.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.187105017829708e-05,
"epoch": 0.00416,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3428838428808376e-05,
"kl": 4.688595987856388,
"learning_rate": 4.557445846467452e-05,
"loss": -0.0029,
"num_tokens": 7791160.0,
"reward": 0.7068415284156799,
"reward_std": 0.5244907140731812,
"rewards/rollout_reward_func/mean": 0.7068415284156799,
"rewards/rollout_reward_func/std": 0.5362996459007263,
"sampling/importance_sampling_ratio/max": 1.0000072717666626,
"sampling/importance_sampling_ratio/mean": 0.9999967813491821,
"sampling/importance_sampling_ratio/min": 0.9999647736549377,
"sampling/sampling_logp_difference/max": 3.254435432609171e-05,
"sampling/sampling_logp_difference/mean": 1.924766593219829e-06,
"step": 208,
"step_time": 10.564521855001658
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 270.09375,
"completions/mean_terminated_length": 270.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.67929339673401e-05,
"epoch": 0.00418,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7854244763148017e-05,
"kl": 5.123309299349785,
"learning_rate": 4.517895037774461e-05,
"loss": -0.0006,
"num_tokens": 7830838.0,
"reward": 0.7405915260314941,
"reward_std": 0.3506433665752411,
"rewards/rollout_reward_func/mean": 0.7405915260314941,
"rewards/rollout_reward_func/std": 0.4569876194000244,
"sampling/importance_sampling_ratio/max": 1.000009298324585,
"sampling/importance_sampling_ratio/mean": 0.9999930262565613,
"sampling/importance_sampling_ratio/min": 0.9999588131904602,
"sampling/sampling_logp_difference/max": 5.2810326451435685e-05,
"sampling/sampling_logp_difference/mean": 3.302905042801285e-06,
"step": 209,
"step_time": 11.46979012699967
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 250.09375,
"completions/mean_terminated_length": 250.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.607456470599573e-05,
"epoch": 0.0042,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8605783882085234e-05,
"kl": 5.365914463996887,
"learning_rate": 4.478587660773065e-05,
"loss": 0.0039,
"num_tokens": 7869530.0,
"reward": 0.7415122985839844,
"reward_std": 0.4781748354434967,
"rewards/rollout_reward_func/mean": 0.7415122985839844,
"rewards/rollout_reward_func/std": 0.45829862356185913,
"sampling/importance_sampling_ratio/max": 1.0000122785568237,
"sampling/importance_sampling_ratio/mean": 0.999998152256012,
"sampling/importance_sampling_ratio/min": 0.9999728202819824,
"sampling/sampling_logp_difference/max": 2.2650128812529147e-05,
"sampling/sampling_logp_difference/mean": 2.2131084733700845e-06,
"step": 210,
"step_time": 10.400364204000653
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 196.0625,
"completions/mean_terminated_length": 196.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.971475469834786e-05,
"epoch": 0.00422,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.156006303266622e-05,
"kl": 5.336601361632347,
"learning_rate": 4.439529239759708e-05,
"loss": 0.01,
"num_tokens": 7905392.0,
"reward": 0.6737499833106995,
"reward_std": 0.4696081280708313,
"rewards/rollout_reward_func/mean": 0.6737499833106995,
"rewards/rollout_reward_func/std": 0.48247846961021423,
"sampling/importance_sampling_ratio/max": 1.0000094175338745,
"sampling/importance_sampling_ratio/mean": 0.999995768070221,
"sampling/importance_sampling_ratio/min": 0.9999589920043945,
"sampling/sampling_logp_difference/max": 4.8401976528111845e-05,
"sampling/sampling_logp_difference/mean": 2.4310015760420356e-06,
"step": 211,
"step_time": 11.25777174300083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 215.5625,
"completions/mean_terminated_length": 215.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.468110858610544e-05,
"epoch": 0.00424,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.0994404116645455e-05,
"kl": 5.020519495010376,
"learning_rate": 4.4007252640423116e-05,
"loss": -0.0123,
"num_tokens": 7943157.0,
"reward": 0.7718415260314941,
"reward_std": 0.4587705433368683,
"rewards/rollout_reward_func/mean": 0.7718415260314941,
"rewards/rollout_reward_func/std": 0.4480281174182892,
"sampling/importance_sampling_ratio/max": 1.0000126361846924,
"sampling/importance_sampling_ratio/mean": 0.9999984502792358,
"sampling/importance_sampling_ratio/min": 0.999977707862854,
"sampling/sampling_logp_difference/max": 2.8371961889206432e-05,
"sampling/sampling_logp_difference/mean": 2.205347755079856e-06,
"step": 212,
"step_time": 10.416242442000112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 250.625,
"completions/mean_terminated_length": 250.625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.606689459294103e-05,
"epoch": 0.00426,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3088554624118842e-05,
"kl": 4.440442498773336,
"learning_rate": 4.3621811871688186e-05,
"loss": 0.0017,
"num_tokens": 7980729.0,
"reward": 0.6758539080619812,
"reward_std": 0.4955633580684662,
"rewards/rollout_reward_func/mean": 0.6758539080619812,
"rewards/rollout_reward_func/std": 0.5430920124053955,
"sampling/importance_sampling_ratio/max": 1.0000171661376953,
"sampling/importance_sampling_ratio/mean": 0.9999970197677612,
"sampling/importance_sampling_ratio/min": 0.9999763369560242,
"sampling/sampling_logp_difference/max": 1.823993807192892e-05,
"sampling/sampling_logp_difference/mean": 2.135067461495055e-06,
"step": 213,
"step_time": 11.03314153600013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 130.3125,
"completions/mean_terminated_length": 130.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.7448618119005914e-05,
"epoch": 0.00428,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5431114661623724e-05,
"kl": 4.4410514533519745,
"learning_rate": 4.323902426160737e-05,
"loss": 0.005,
"num_tokens": 8013047.0,
"reward": 0.8249331116676331,
"reward_std": 0.40843328833580017,
"rewards/rollout_reward_func/mean": 0.8249331116676331,
"rewards/rollout_reward_func/std": 0.3973701298236847,
"sampling/importance_sampling_ratio/max": 1.0000035762786865,
"sampling/importance_sampling_ratio/mean": 0.9999982118606567,
"sampling/importance_sampling_ratio/min": 0.9999793171882629,
"sampling/sampling_logp_difference/max": 2.1219655536697246e-05,
"sampling/sampling_logp_difference/mean": 1.4284883036452811e-06,
"step": 214,
"step_time": 10.193582248000894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 221.90625,
"completions/mean_terminated_length": 221.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.51787586636965e-05,
"epoch": 0.0043,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6926014470518567e-05,
"kl": 3.983197882771492,
"learning_rate": 4.285894360751829e-05,
"loss": -0.0012,
"num_tokens": 8048381.0,
"reward": 0.8018415570259094,
"reward_std": 0.4044482707977295,
"rewards/rollout_reward_func/mean": 0.8018415570259094,
"rewards/rollout_reward_func/std": 0.4863073229789734,
"sampling/importance_sampling_ratio/max": 1.0000040531158447,
"sampling/importance_sampling_ratio/mean": 0.9999951124191284,
"sampling/importance_sampling_ratio/min": 0.9999686479568481,
"sampling/sampling_logp_difference/max": 3.111379555775784e-05,
"sampling/sampling_logp_difference/mean": 2.425235834380146e-06,
"step": 215,
"step_time": 10.813712388001022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 284.3125,
"completions/mean_terminated_length": 284.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.533502838758977e-05,
"epoch": 0.00432,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.587458483409137e-05,
"kl": 4.82944827247411,
"learning_rate": 4.2481623326320364e-05,
"loss": 0.0117,
"num_tokens": 8087461.0,
"reward": 0.5843415260314941,
"reward_std": 0.5055504441261292,
"rewards/rollout_reward_func/mean": 0.5843415260314941,
"rewards/rollout_reward_func/std": 0.5016008019447327,
"sampling/importance_sampling_ratio/max": 1.0000051259994507,
"sampling/importance_sampling_ratio/mean": 0.9999934434890747,
"sampling/importance_sampling_ratio/min": 0.999961793422699,
"sampling/sampling_logp_difference/max": 3.683606337290257e-05,
"sampling/sampling_logp_difference/mean": 3.0676592359668575e-06,
"step": 216,
"step_time": 10.44111612099914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 235.59375,
"completions/mean_terminated_length": 235.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.00010566368675313242,
"epoch": 0.00434,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.359190618037246e-05,
"kl": 5.429783314466476,
"learning_rate": 4.210711644696756e-05,
"loss": -0.0041,
"num_tokens": 8126489.0,
"reward": 0.7111831307411194,
"reward_std": 0.525292158126831,
"rewards/rollout_reward_func/mean": 0.7111831307411194,
"rewards/rollout_reward_func/std": 0.5364737510681152,
"sampling/importance_sampling_ratio/max": 1.0000098943710327,
"sampling/importance_sampling_ratio/mean": 0.9999901056289673,
"sampling/importance_sampling_ratio/min": 0.9998805522918701,
"sampling/sampling_logp_difference/max": 9.84744110610336e-05,
"sampling/sampling_logp_difference/mean": 3.6738169910677243e-06,
"step": 217,
"step_time": 11.342535938998935
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 299.75,
"completions/mean_terminated_length": 299.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.36404399251478e-05,
"epoch": 0.00436,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6273870389559306e-05,
"kl": 4.871564254164696,
"learning_rate": 4.1735475603015697e-05,
"loss": 0.0077,
"num_tokens": 8166512.0,
"reward": 0.6790122985839844,
"reward_std": 0.4945233464241028,
"rewards/rollout_reward_func/mean": 0.6790122985839844,
"rewards/rollout_reward_func/std": 0.4823760390281677,
"sampling/importance_sampling_ratio/max": 1.0000158548355103,
"sampling/importance_sampling_ratio/mean": 0.999993085861206,
"sampling/importance_sampling_ratio/min": 0.9999461770057678,
"sampling/sampling_logp_difference/max": 3.6239842302165926e-05,
"sampling/sampling_logp_difference/mean": 2.806388010867522e-06,
"step": 218,
"step_time": 10.86233605400048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 252.75,
"completions/mean_terminated_length": 252.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.950771052695927e-05,
"epoch": 0.00438,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0681065254611894e-05,
"kl": 4.676598273217678,
"learning_rate": 4.136675302522517e-05,
"loss": -0.0047,
"num_tokens": 8204946.0,
"reward": 0.6799330711364746,
"reward_std": 0.5543248653411865,
"rewards/rollout_reward_func/mean": 0.6799330711364746,
"rewards/rollout_reward_func/std": 0.5487261414527893,
"sampling/importance_sampling_ratio/max": 1.0000182390213013,
"sampling/importance_sampling_ratio/mean": 0.9999966025352478,
"sampling/importance_sampling_ratio/min": 0.9999750852584839,
"sampling/sampling_logp_difference/max": 2.6107110898010433e-05,
"sampling/sampling_logp_difference/mean": 2.1445025595312472e-06,
"step": 219,
"step_time": 10.943576970999402
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 247.71875,
"completions/mean_terminated_length": 247.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.87366666941125e-05,
"epoch": 0.0044,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.677894761087373e-05,
"kl": 5.05719730257988,
"learning_rate": 4.1001000534220484e-05,
"loss": 0.0043,
"num_tokens": 8243453.0,
"reward": 0.4586831331253052,
"reward_std": 0.6123698949813843,
"rewards/rollout_reward_func/mean": 0.4586831331253052,
"rewards/rollout_reward_func/std": 0.616819441318512,
"sampling/importance_sampling_ratio/max": 1.0000048875808716,
"sampling/importance_sampling_ratio/mean": 0.999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999604821205139,
"sampling/sampling_logp_difference/max": 3.4690252505242825e-05,
"sampling/sampling_logp_difference/mean": 2.33208902500337e-06,
"step": 220,
"step_time": 11.04217300100072
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 261.25,
"completions/mean_terminated_length": 261.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.492780096323258e-05,
"epoch": 0.00442,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8693634046940133e-05,
"kl": 5.244952633976936,
"learning_rate": 4.063826953320731e-05,
"loss": 0.0021,
"num_tokens": 8282852.0,
"reward": 0.6468415260314941,
"reward_std": 0.510591983795166,
"rewards/rollout_reward_func/mean": 0.6468415260314941,
"rewards/rollout_reward_func/std": 0.49439889192581177,
"sampling/importance_sampling_ratio/max": 1.0000091791152954,
"sampling/importance_sampling_ratio/mean": 0.9999966621398926,
"sampling/importance_sampling_ratio/min": 0.9999747276306152,
"sampling/sampling_logp_difference/max": 2.1458068658830598e-05,
"sampling/sampling_logp_difference/mean": 1.838789557950804e-06,
"step": 221,
"step_time": 11.379838227001073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 231.0,
"completions/mean_terminated_length": 231.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.502011356701587e-05,
"epoch": 0.00444,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6576681193546392e-05,
"kl": 5.2187200747430325,
"learning_rate": 4.027861100074818e-05,
"loss": 0.0005,
"num_tokens": 8320490.0,
"reward": 0.642762303352356,
"reward_std": 0.501641035079956,
"rewards/rollout_reward_func/mean": 0.642762303352356,
"rewards/rollout_reward_func/std": 0.4938026964664459,
"sampling/importance_sampling_ratio/max": 1.0000317096710205,
"sampling/importance_sampling_ratio/mean": 0.9999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999289512634277,
"sampling/sampling_logp_difference/max": 6.830880738561973e-05,
"sampling/sampling_logp_difference/mean": 3.455873411439825e-06,
"step": 222,
"step_time": 10.861577565999141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 249.0,
"completions/mean_terminated_length": 249.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.842954086039299e-05,
"epoch": 0.00446,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4798371012147982e-05,
"kl": 4.837060697376728,
"learning_rate": 3.9922075483597984e-05,
"loss": 0.0038,
"num_tokens": 8358120.0,
"reward": 0.7390123605728149,
"reward_std": 0.4545978307723999,
"rewards/rollout_reward_func/mean": 0.7390123605728149,
"rewards/rollout_reward_func/std": 0.4567206799983978,
"sampling/importance_sampling_ratio/max": 1.0000172853469849,
"sampling/importance_sampling_ratio/mean": 0.9999964833259583,
"sampling/importance_sampling_ratio/min": 0.9999744892120361,
"sampling/sampling_logp_difference/max": 3.647844278020784e-05,
"sampling/sampling_logp_difference/mean": 2.729870857365313e-06,
"step": 223,
"step_time": 11.067154321999169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 220.34375,
"completions/mean_terminated_length": 220.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.793645808045312e-05,
"epoch": 0.00448,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.114427843480371e-05,
"kl": 4.609769219532609,
"learning_rate": 3.956871308960006e-05,
"loss": -0.0024,
"num_tokens": 8395318.0,
"reward": 0.7071707844734192,
"reward_std": 0.37846216559410095,
"rewards/rollout_reward_func/mean": 0.7071707844734192,
"rewards/rollout_reward_func/std": 0.47386202216148376,
"sampling/importance_sampling_ratio/max": 1.0000035762786865,
"sampling/importance_sampling_ratio/mean": 0.9999899864196777,
"sampling/importance_sampling_ratio/min": 0.9998646378517151,
"sampling/sampling_logp_difference/max": 0.0001251772919204086,
"sampling/sampling_logp_difference/mean": 2.8750114324793685e-06,
"step": 224,
"step_time": 10.648732830999961
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 194.875,
"completions/mean_terminated_length": 194.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.4564909845566945e-05,
"epoch": 0.0045,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.99508774332935e-05,
"kl": 4.295398809015751,
"learning_rate": 3.921857348064393e-05,
"loss": -0.0028,
"num_tokens": 8429759.0,
"reward": 0.735920786857605,
"reward_std": 0.3834129571914673,
"rewards/rollout_reward_func/mean": 0.735920786857605,
"rewards/rollout_reward_func/std": 0.45476430654525757,
"sampling/importance_sampling_ratio/max": 1.0000051259994507,
"sampling/importance_sampling_ratio/mean": 0.9999962449073792,
"sampling/importance_sampling_ratio/min": 0.99997878074646,
"sampling/sampling_logp_difference/max": 2.408070577075705e-05,
"sampling/sampling_logp_difference/mean": 2.0818633856833912e-06,
"step": 225,
"step_time": 10.618216946999382
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 213.4375,
"completions/mean_terminated_length": 213.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.468187297059558e-05,
"epoch": 0.00452,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.314897210453637e-05,
"kl": 4.907812386751175,
"learning_rate": 3.8871705865685835e-05,
"loss": -0.0071,
"num_tokens": 8466101.0,
"reward": 0.7993415594100952,
"reward_std": 0.35336440801620483,
"rewards/rollout_reward_func/mean": 0.7993415594100952,
"rewards/rollout_reward_func/std": 0.42098554968833923,
"sampling/importance_sampling_ratio/max": 1.000029444694519,
"sampling/importance_sampling_ratio/mean": 0.9999977350234985,
"sampling/importance_sampling_ratio/min": 0.9999707937240601,
"sampling/sampling_logp_difference/max": 3.9220289181685075e-05,
"sampling/sampling_logp_difference/mean": 2.529017365304753e-06,
"step": 226,
"step_time": 10.741557626999565
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 223.03125,
"completions/mean_terminated_length": 223.03125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.120953486923099e-05,
"epoch": 0.00454,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.4079289839137346e-05,
"kl": 4.519463703036308,
"learning_rate": 3.852815899383288e-05,
"loss": -0.0039,
"num_tokens": 8502768.0,
"reward": 0.642762303352356,
"reward_std": 0.43429845571517944,
"rewards/rollout_reward_func/mean": 0.642762303352356,
"rewards/rollout_reward_func/std": 0.55186527967453,
"sampling/importance_sampling_ratio/max": 1.000009536743164,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.9999439120292664,
"sampling/sampling_logp_difference/max": 6.046371709089726e-05,
"sampling/sampling_logp_difference/mean": 2.2524543510371586e-06,
"step": 227,
"step_time": 10.909075683000992
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 238.4375,
"completions/mean_terminated_length": 238.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.815549295424717e-05,
"epoch": 0.00456,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8888143788208254e-05,
"kl": 5.122124448418617,
"learning_rate": 3.81879811474917e-05,
"loss": 0.0128,
"num_tokens": 8540887.0,
"reward": 0.6134207248687744,
"reward_std": 0.4781406819820404,
"rewards/rollout_reward_func/mean": 0.6134207248687744,
"rewards/rollout_reward_func/std": 0.49850568175315857,
"sampling/importance_sampling_ratio/max": 1.0000128746032715,
"sampling/importance_sampling_ratio/mean": 0.9999950528144836,
"sampling/importance_sampling_ratio/min": 0.9999350309371948,
"sampling/sampling_logp_difference/max": 5.8532186812954023e-05,
"sampling/sampling_logp_difference/mean": 2.7186433726456016e-06,
"step": 228,
"step_time": 10.663067908001267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 255.21875,
"completions/mean_terminated_length": 255.21875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.427226666185561e-05,
"epoch": 0.00458,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2373380488716066e-05,
"kl": 4.516269013285637,
"learning_rate": 3.785122013558288e-05,
"loss": 0.0073,
"num_tokens": 8578873.0,
"reward": 0.7715123891830444,
"reward_std": 0.4250052869319916,
"rewards/rollout_reward_func/mean": 0.7715123891830444,
"rewards/rollout_reward_func/std": 0.44125890731811523,
"sampling/importance_sampling_ratio/max": 1.0000226497650146,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.9999725818634033,
"sampling/sampling_logp_difference/max": 2.968106127809733e-05,
"sampling/sampling_logp_difference/mean": 2.688705535547342e-06,
"step": 229,
"step_time": 10.989937573000134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 261.40625,
"completions/mean_terminated_length": 261.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.805091253487717e-05,
"epoch": 0.0046,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7097617930849083e-05,
"kl": 4.258571729063988,
"learning_rate": 3.751792328682183e-05,
"loss": 0.0047,
"num_tokens": 8617352.0,
"reward": 0.8043415546417236,
"reward_std": 0.33875972032546997,
"rewards/rollout_reward_func/mean": 0.8043415546417236,
"rewards/rollout_reward_func/std": 0.41523313522338867,
"sampling/importance_sampling_ratio/max": 1.0000033378601074,
"sampling/importance_sampling_ratio/mean": 0.9999909400939941,
"sampling/importance_sampling_ratio/min": 0.9999553561210632,
"sampling/sampling_logp_difference/max": 3.457104321569204e-05,
"sampling/sampling_logp_difference/mean": 2.853767227861681e-06,
"step": 230,
"step_time": 10.89791804999868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 185.6875,
"completions/mean_terminated_length": 185.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.286828362396136e-05,
"epoch": 0.00462,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.8086083804955706e-05,
"kl": 5.058218787424266,
"learning_rate": 3.718813744306712e-05,
"loss": -0.0022,
"num_tokens": 8653215.0,
"reward": 0.7049999833106995,
"reward_std": 0.49435633420944214,
"rewards/rollout_reward_func/mean": 0.7049999833106995,
"rewards/rollout_reward_func/std": 0.4751094877719879,
"sampling/importance_sampling_ratio/max": 1.0000234842300415,
"sampling/importance_sampling_ratio/mean": 0.9999891519546509,
"sampling/importance_sampling_ratio/min": 0.9998546838760376,
"sampling/sampling_logp_difference/max": 0.00014317341265268624,
"sampling/sampling_logp_difference/mean": 3.555632019924815e-06,
"step": 231,
"step_time": 10.807909351999115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 250.9375,
"completions/mean_terminated_length": 250.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.626304455106037e-05,
"epoch": 0.00464,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0001353168481728062,
"kl": 5.0585897117853165,
"learning_rate": 3.686190895273733e-05,
"loss": 0.0117,
"num_tokens": 8691158.0,
"reward": 0.7077623605728149,
"reward_std": 0.44103604555130005,
"rewards/rollout_reward_func/mean": 0.7077623605728149,
"rewards/rollout_reward_func/std": 0.468770295381546,
"sampling/importance_sampling_ratio/max": 1.0000251531600952,
"sampling/importance_sampling_ratio/mean": 0.9999915957450867,
"sampling/importance_sampling_ratio/min": 0.999932050704956,
"sampling/sampling_logp_difference/max": 7.951720181154087e-05,
"sampling/sampling_logp_difference/mean": 3.854770056932466e-06,
"step": 232,
"step_time": 11.1020100679998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 227.96875,
"completions/mean_terminated_length": 227.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.657698354535114e-05,
"epoch": 0.00466,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.930617771809921e-05,
"kl": 5.246766164898872,
"learning_rate": 3.653928366429717e-05,
"loss": -0.0081,
"num_tokens": 8729310.0,
"reward": 0.6768415570259094,
"reward_std": 0.5108159184455872,
"rewards/rollout_reward_func/mean": 0.6768415570259094,
"rewards/rollout_reward_func/std": 0.49002739787101746,
"sampling/importance_sampling_ratio/max": 1.0000180006027222,
"sampling/importance_sampling_ratio/mean": 0.9999940395355225,
"sampling/importance_sampling_ratio/min": 0.9999337792396545,
"sampling/sampling_logp_difference/max": 4.6849683712935075e-05,
"sampling/sampling_logp_difference/mean": 2.78436164080631e-06,
"step": 233,
"step_time": 11.074858544001472
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 248.59375,
"completions/mean_terminated_length": 248.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.97915432571017e-05,
"epoch": 0.00468,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8650109268492088e-05,
"kl": 4.891276657581329,
"learning_rate": 3.6220306919813934e-05,
"loss": 0.0071,
"num_tokens": 8767949.0,
"reward": 0.5517747402191162,
"reward_std": 0.501990556716919,
"rewards/rollout_reward_func/mean": 0.5517747402191162,
"rewards/rollout_reward_func/std": 0.5072534680366516,
"sampling/importance_sampling_ratio/max": 1.0000141859054565,
"sampling/importance_sampling_ratio/mean": 0.9999977946281433,
"sampling/importance_sampling_ratio/min": 0.9999755024909973,
"sampling/sampling_logp_difference/max": 3.135273800580762e-05,
"sampling/sampling_logp_difference/mean": 2.4732930796744768e-06,
"step": 234,
"step_time": 11.137182370001028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 224.40625,
"completions/mean_terminated_length": 224.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.745976586193137e-05,
"epoch": 0.0047,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0286239052657038e-05,
"kl": 5.41153359413147,
"learning_rate": 3.590502354858501e-05,
"loss": -0.0122,
"num_tokens": 8805306.0,
"reward": 0.6440123319625854,
"reward_std": 0.4558776617050171,
"rewards/rollout_reward_func/mean": 0.6440123319625854,
"rewards/rollout_reward_func/std": 0.5544620156288147,
"sampling/importance_sampling_ratio/max": 1.0000075101852417,
"sampling/importance_sampling_ratio/mean": 0.9999954104423523,
"sampling/importance_sampling_ratio/min": 0.9999374151229858,
"sampling/sampling_logp_difference/max": 5.960933049209416e-05,
"sampling/sampling_logp_difference/mean": 2.1439589090732625e-06,
"step": 235,
"step_time": 11.217590768001173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 179.5,
"completions/mean_terminated_length": 179.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.373513450341761e-05,
"epoch": 0.00472,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1967592803994194e-05,
"kl": 5.293322175741196,
"learning_rate": 3.559347786083758e-05,
"loss": -0.0122,
"num_tokens": 8841204.0,
"reward": 0.579012393951416,
"reward_std": 0.5713227987289429,
"rewards/rollout_reward_func/mean": 0.579012393951416,
"rewards/rollout_reward_func/std": 0.5699735283851624,
"sampling/importance_sampling_ratio/max": 1.0000178813934326,
"sampling/importance_sampling_ratio/mean": 0.9999982118606567,
"sampling/importance_sampling_ratio/min": 0.999977171421051,
"sampling/sampling_logp_difference/max": 3.111419209744781e-05,
"sampling/sampling_logp_difference/mean": 2.388872417213861e-06,
"step": 236,
"step_time": 11.031479144000059
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 220.28125,
"completions/mean_terminated_length": 220.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.39780113640154e-05,
"epoch": 0.00474,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.93100511044031e-05,
"kl": 4.748119935393333,
"learning_rate": 3.528571364150124e-05,
"loss": -0.0089,
"num_tokens": 8879050.0,
"reward": 0.7405247092247009,
"reward_std": 0.43948662281036377,
"rewards/rollout_reward_func/mean": 0.7405247092247009,
"rewards/rollout_reward_func/std": 0.5215878486633301,
"sampling/importance_sampling_ratio/max": 1.0000333786010742,
"sampling/importance_sampling_ratio/mean": 0.9999979734420776,
"sampling/importance_sampling_ratio/min": 0.9999390840530396,
"sampling/sampling_logp_difference/max": 4.208123209537007e-05,
"sampling/sampling_logp_difference/mean": 3.0910557597962907e-06,
"step": 237,
"step_time": 11.245172328000535
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 221.4375,
"completions/mean_terminated_length": 221.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.091031755772747e-05,
"epoch": 0.00476,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5538744264631532e-05,
"kl": 4.544229738414288,
"learning_rate": 3.4981774144054344e-05,
"loss": 0.0045,
"num_tokens": 8915726.0,
"reward": 0.6730915904045105,
"reward_std": 0.4944014549255371,
"rewards/rollout_reward_func/mean": 0.6730915904045105,
"rewards/rollout_reward_func/std": 0.4819692075252533,
"sampling/importance_sampling_ratio/max": 1.0000104904174805,
"sampling/importance_sampling_ratio/mean": 0.9999960064888,
"sampling/importance_sampling_ratio/min": 0.9999375939369202,
"sampling/sampling_logp_difference/max": 3.111407204414718e-05,
"sampling/sampling_logp_difference/mean": 2.156180244128336e-06,
"step": 238,
"step_time": 11.206029839998337
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 220.9375,
"completions/mean_terminated_length": 220.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.803557426986572e-05,
"epoch": 0.00478,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.782759449677542e-05,
"kl": 5.248299717903137,
"learning_rate": 3.46817020844452e-05,
"loss": 0.0036,
"num_tokens": 8952836.0,
"reward": 0.5790123343467712,
"reward_std": 0.5091844797134399,
"rewards/rollout_reward_func/mean": 0.5790123343467712,
"rewards/rollout_reward_func/std": 0.5044925808906555,
"sampling/importance_sampling_ratio/max": 1.000012755393982,
"sampling/importance_sampling_ratio/mean": 0.9999973773956299,
"sampling/importance_sampling_ratio/min": 0.9999673366546631,
"sampling/sampling_logp_difference/max": 2.6941650503431447e-05,
"sampling/sampling_logp_difference/mean": 1.8537682535679778e-06,
"step": 239,
"step_time": 11.89933117200053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 200.9375,
"completions/mean_terminated_length": 200.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.659828022748115e-05,
"epoch": 0.0048,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.700514520867728e-05,
"kl": 4.181779149919748,
"learning_rate": 3.438553963508866e-05,
"loss": -0.0069,
"num_tokens": 8988037.0,
"reward": 0.7680915594100952,
"reward_std": 0.3802240490913391,
"rewards/rollout_reward_func/mean": 0.7680915594100952,
"rewards/rollout_reward_func/std": 0.43926358222961426,
"sampling/importance_sampling_ratio/max": 1.0000133514404297,
"sampling/importance_sampling_ratio/mean": 0.9999954700469971,
"sampling/importance_sampling_ratio/min": 0.9999701976776123,
"sampling/sampling_logp_difference/max": 3.194832243025303e-05,
"sampling/sampling_logp_difference/mean": 2.628037236718228e-06,
"step": 240,
"step_time": 10.429010280000512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 190.59375,
"completions/mean_terminated_length": 190.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.779884174079598e-05,
"epoch": 0.00482,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3270024030352943e-05,
"kl": 4.673696734011173,
"learning_rate": 3.409332841893925e-05,
"loss": 0.0024,
"num_tokens": 9023151.0,
"reward": 0.735920786857605,
"reward_std": 0.4644585847854614,
"rewards/rollout_reward_func/mean": 0.735920786857605,
"rewards/rollout_reward_func/std": 0.4568495452404022,
"sampling/importance_sampling_ratio/max": 1.0000066757202148,
"sampling/importance_sampling_ratio/mean": 0.9999940991401672,
"sampling/importance_sampling_ratio/min": 0.9998875260353088,
"sampling/sampling_logp_difference/max": 7.773219840601087e-05,
"sampling/sampling_logp_difference/mean": 2.4395403670496307e-06,
"step": 241,
"step_time": 12.248209440999744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 276.96875,
"completions/mean_terminated_length": 276.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.92703500422931e-05,
"epoch": 0.00484,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4162562112906016e-05,
"kl": 4.796989947557449,
"learning_rate": 3.3805109503641356e-05,
"loss": 0.0094,
"num_tokens": 9062474.0,
"reward": 0.7093415260314941,
"reward_std": 0.4662177562713623,
"rewards/rollout_reward_func/mean": 0.7093415260314941,
"rewards/rollout_reward_func/std": 0.4678354859352112,
"sampling/importance_sampling_ratio/max": 1.000012993812561,
"sampling/importance_sampling_ratio/mean": 0.9999938011169434,
"sampling/importance_sampling_ratio/min": 0.9999693036079407,
"sampling/sampling_logp_difference/max": 3.433253732509911e-05,
"sampling/sampling_logp_difference/mean": 2.6973182229994563e-06,
"step": 242,
"step_time": 11.291534194999713
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 307.125,
"completions/mean_terminated_length": 307.125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.851263379166085e-05,
"epoch": 0.00486,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.707106457615737e-05,
"kl": 5.56565897166729,
"learning_rate": 3.352092339575757e-05,
"loss": 0.0095,
"num_tokens": 9103556.0,
"reward": 0.5833538770675659,
"reward_std": 0.5242961645126343,
"rewards/rollout_reward_func/mean": 0.5833538770675659,
"rewards/rollout_reward_func/std": 0.5032540559768677,
"sampling/importance_sampling_ratio/max": 1.000014305114746,
"sampling/importance_sampling_ratio/mean": 0.9999957084655762,
"sampling/importance_sampling_ratio/min": 0.9999706745147705,
"sampling/sampling_logp_difference/max": 3.0160426831571385e-05,
"sampling/sampling_logp_difference/mean": 2.1681942143914057e-06,
"step": 243,
"step_time": 11.62106315400024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 231.46875,
"completions/mean_terminated_length": 231.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.005396872410529e-05,
"epoch": 0.00488,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6445228538941592e-05,
"kl": 5.512866705656052,
"learning_rate": 3.324081003507593e-05,
"loss": 0.0098,
"num_tokens": 9141136.0,
"reward": 0.6134207844734192,
"reward_std": 0.4750402569770813,
"rewards/rollout_reward_func/mean": 0.6134207844734192,
"rewards/rollout_reward_func/std": 0.5004087090492249,
"sampling/importance_sampling_ratio/max": 1.0000059604644775,
"sampling/importance_sampling_ratio/mean": 0.9999967813491821,
"sampling/importance_sampling_ratio/min": 0.999974250793457,
"sampling/sampling_logp_difference/max": 2.849124211934395e-05,
"sampling/sampling_logp_difference/mean": 2.1164394183870172e-06,
"step": 244,
"step_time": 11.151449438999407
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 201.875,
"completions/mean_terminated_length": 201.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.096377369464335e-05,
"epoch": 0.0049,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.411771311017219e-05,
"kl": 4.767334572970867,
"learning_rate": 3.29648087889966e-05,
"loss": 0.0072,
"num_tokens": 9177427.0,
"reward": 0.5493415594100952,
"reward_std": 0.5647812485694885,
"rewards/rollout_reward_func/mean": 0.5493415594100952,
"rewards/rollout_reward_func/std": 0.5635629296302795,
"sampling/importance_sampling_ratio/max": 1.000023365020752,
"sampling/importance_sampling_ratio/mean": 0.999997615814209,
"sampling/importance_sampling_ratio/min": 0.9999508857727051,
"sampling/sampling_logp_difference/max": 5.1029317546635866e-05,
"sampling/sampling_logp_difference/mean": 2.1633254618791398e-06,
"step": 245,
"step_time": 11.763949329999832
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 160.0625,
"completions/mean_terminated_length": 160.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.01286881839269e-05,
"epoch": 0.00492,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5201369023998268e-05,
"kl": 4.452401959657436,
"learning_rate": 3.269295844699925e-05,
"loss": 0.0038,
"num_tokens": 9211181.0,
"reward": 0.7321707606315613,
"reward_std": 0.44021376967430115,
"rewards/rollout_reward_func/mean": 0.7321707606315613,
"rewards/rollout_reward_func/std": 0.4572508633136749,
"sampling/importance_sampling_ratio/max": 1.0000144243240356,
"sampling/importance_sampling_ratio/mean": 0.999998927116394,
"sampling/importance_sampling_ratio/min": 0.9999858140945435,
"sampling/sampling_logp_difference/max": 1.8358508896199055e-05,
"sampling/sampling_logp_difference/mean": 1.4965582977310987e-06,
"step": 246,
"step_time": 11.372876166001333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 639.0,
"completions/max_terminated_length": 639.0,
"completions/mean_length": 150.46875,
"completions/mean_terminated_length": 150.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.801312877906639e-05,
"epoch": 0.00494,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5779214663780294e-05,
"kl": 5.292075231671333,
"learning_rate": 3.242529721519152e-05,
"loss": -0.0026,
"num_tokens": 9245647.0,
"reward": 0.5455915927886963,
"reward_std": 0.5671359300613403,
"rewards/rollout_reward_func/mean": 0.5455915927886963,
"rewards/rollout_reward_func/std": 0.5694448947906494,
"sampling/importance_sampling_ratio/max": 1.0000114440917969,
"sampling/importance_sampling_ratio/mean": 0.9999982118606567,
"sampling/importance_sampling_ratio/min": 0.9999804496765137,
"sampling/sampling_logp_difference/max": 2.324626620975323e-05,
"sampling/sampling_logp_difference/mean": 1.994621015910525e-06,
"step": 247,
"step_time": 10.116187335000177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 316.5,
"completions/mean_terminated_length": 316.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.037800134592544e-05,
"epoch": 0.00496,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.4790740503231063e-05,
"kl": 4.973954036831856,
"learning_rate": 3.2161862710939476e-05,
"loss": 0.0204,
"num_tokens": 9286864.0,
"reward": 0.6830915212631226,
"reward_std": 0.4971539378166199,
"rewards/rollout_reward_func/mean": 0.6830915212631226,
"rewards/rollout_reward_func/std": 0.47586745023727417,
"sampling/importance_sampling_ratio/max": 1.0000207424163818,
"sampling/importance_sampling_ratio/mean": 0.9999940395355225,
"sampling/importance_sampling_ratio/min": 0.9999626874923706,
"sampling/sampling_logp_difference/max": 3.242294769734144e-05,
"sampling/sampling_logp_difference/mean": 3.037049282283988e-06,
"step": 248,
"step_time": 11.992384031000256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 247.65625,
"completions/mean_terminated_length": 247.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.774960132498563e-05,
"epoch": 0.00498,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.237016203958774e-05,
"kl": 4.647655583918095,
"learning_rate": 3.1902691957580834e-05,
"loss": 0.0102,
"num_tokens": 9324389.0,
"reward": 0.6430915594100952,
"reward_std": 0.4474893808364868,
"rewards/rollout_reward_func/mean": 0.6430915594100952,
"rewards/rollout_reward_func/std": 0.49145370721817017,
"sampling/importance_sampling_ratio/max": 1.0000030994415283,
"sampling/importance_sampling_ratio/mean": 0.9999960660934448,
"sampling/importance_sampling_ratio/min": 0.9999740123748779,
"sampling/sampling_logp_difference/max": 2.6822204745258205e-05,
"sampling/sampling_logp_difference/mean": 2.2784347493143287e-06,
"step": 249,
"step_time": 10.406291274999603
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 302.9375,
"completions/mean_terminated_length": 302.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.460602839159947e-05,
"epoch": 0.005,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.1649214836070314e-05,
"kl": 5.410085678100586,
"learning_rate": 3.1647821379221695e-05,
"loss": 0.0123,
"num_tokens": 9365019.0,
"reward": 0.7090123295783997,
"reward_std": 0.47402238845825195,
"rewards/rollout_reward_func/mean": 0.7090123295783997,
"rewards/rollout_reward_func/std": 0.46759894490242004,
"sampling/importance_sampling_ratio/max": 1.0000172853469849,
"sampling/importance_sampling_ratio/mean": 0.9999974966049194,
"sampling/importance_sampling_ratio/min": 0.9999734163284302,
"sampling/sampling_logp_difference/max": 2.479633258190006e-05,
"sampling/sampling_logp_difference/mean": 2.1298787942214403e-06,
"step": 250,
"step_time": 11.301801955999508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 247.96875,
"completions/mean_terminated_length": 247.96875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.343430341994917e-05,
"epoch": 0.00502,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.20609363168478e-05,
"kl": 5.1174429804086685,
"learning_rate": 3.139728679561744e-05,
"loss": 0.0003,
"num_tokens": 9403754.0,
"reward": 0.6780915260314941,
"reward_std": 0.5075234174728394,
"rewards/rollout_reward_func/mean": 0.6780915260314941,
"rewards/rollout_reward_func/std": 0.4863426387310028,
"sampling/importance_sampling_ratio/max": 1.0000110864639282,
"sampling/importance_sampling_ratio/mean": 0.9999961853027344,
"sampling/importance_sampling_ratio/min": 0.9999733567237854,
"sampling/sampling_logp_difference/max": 3.040073352167383e-05,
"sampling/sampling_logp_difference/mean": 2.2713497855875175e-06,
"step": 251,
"step_time": 11.164583231999586
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 243.9375,
"completions/mean_terminated_length": 243.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.330492835601945e-05,
"epoch": 0.00504,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.416094452608377e-05,
"kl": 5.1367504596710205,
"learning_rate": 3.1151123417138556e-05,
"loss": 0.0115,
"num_tokens": 9442316.0,
"reward": 0.5840123295783997,
"reward_std": 0.5598544478416443,
"rewards/rollout_reward_func/mean": 0.5840123295783997,
"rewards/rollout_reward_func/std": 0.5602787733078003,
"sampling/importance_sampling_ratio/max": 1.0000160932540894,
"sampling/importance_sampling_ratio/mean": 0.9999907612800598,
"sampling/importance_sampling_ratio/min": 0.9998561143875122,
"sampling/sampling_logp_difference/max": 0.00012445918400771916,
"sampling/sampling_logp_difference/mean": 3.5559787647798657e-06,
"step": 252,
"step_time": 11.626074560000234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 549.0,
"completions/max_terminated_length": 549.0,
"completions/mean_length": 198.71875,
"completions/mean_terminated_length": 198.71875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.531994623377614e-05,
"epoch": 0.00506,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.664622873766348e-05,
"kl": 4.52351340779569,
"learning_rate": 3.090936583982223e-05,
"loss": -0.0237,
"num_tokens": 9478355.0,
"reward": 0.8955914974212646,
"reward_std": 0.23238001763820648,
"rewards/rollout_reward_func/mean": 0.8955914974212646,
"rewards/rollout_reward_func/std": 0.34063035249710083,
"sampling/importance_sampling_ratio/max": 1.0000280141830444,
"sampling/importance_sampling_ratio/mean": 0.9999982118606567,
"sampling/importance_sampling_ratio/min": 0.999969482421875,
"sampling/sampling_logp_difference/max": 3.838152042590082e-05,
"sampling/sampling_logp_difference/mean": 2.3958950805536006e-06,
"step": 253,
"step_time": 9.807439852000243
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 182.40625,
"completions/mean_terminated_length": 182.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.432378395653359e-05,
"epoch": 0.00508,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9421602701186202e-05,
"kl": 4.590497374534607,
"learning_rate": 3.067204804051008e-05,
"loss": 0.0097,
"num_tokens": 9512954.0,
"reward": 0.7674999833106995,
"reward_std": 0.44722259044647217,
"rewards/rollout_reward_func/mean": 0.7674999833106995,
"rewards/rollout_reward_func/std": 0.43893563747406006,
"sampling/importance_sampling_ratio/max": 1.0000064373016357,
"sampling/importance_sampling_ratio/mean": 0.9999945759773254,
"sampling/importance_sampling_ratio/min": 0.9999449253082275,
"sampling/sampling_logp_difference/max": 3.469131115707569e-05,
"sampling/sampling_logp_difference/mean": 2.1483758700924227e-06,
"step": 254,
"step_time": 11.132221287000448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 230.28125,
"completions/mean_terminated_length": 230.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.0313648162946265e-05,
"epoch": 0.0051,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7147687685792334e-05,
"kl": 4.427471227943897,
"learning_rate": 3.04392033720731e-05,
"loss": 0.0068,
"num_tokens": 9549669.0,
"reward": 0.7380915284156799,
"reward_std": 0.47702929377555847,
"rewards/rollout_reward_func/mean": 0.7380915284156799,
"rewards/rollout_reward_func/std": 0.5209257006645203,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999964833259583,
"sampling/importance_sampling_ratio/min": 0.9999605417251587,
"sampling/sampling_logp_difference/max": 3.421324800001457e-05,
"sampling/sampling_logp_difference/mean": 2.215445874753641e-06,
"step": 255,
"step_time": 10.346654982000928
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 242.65625,
"completions/mean_terminated_length": 242.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.65414812885956e-05,
"epoch": 0.00512,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.15218052896671e-05,
"kl": 5.355410695075989,
"learning_rate": 3.0210864558724166e-05,
"loss": -0.0081,
"num_tokens": 9588652.0,
"reward": 0.5530915260314941,
"reward_std": 0.5290231108665466,
"rewards/rollout_reward_func/mean": 0.5530915260314941,
"rewards/rollout_reward_func/std": 0.513593852519989,
"sampling/importance_sampling_ratio/max": 1.0000211000442505,
"sampling/importance_sampling_ratio/mean": 0.9999995827674866,
"sampling/importance_sampling_ratio/min": 0.9999812841415405,
"sampling/sampling_logp_difference/max": 2.8729844416375272e-05,
"sampling/sampling_logp_difference/mean": 2.4928499442467e-06,
"step": 256,
"step_time": 11.143621553000685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 669.0,
"completions/max_terminated_length": 669.0,
"completions/mean_length": 209.78125,
"completions/mean_terminated_length": 209.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.505985173850149e-05,
"epoch": 0.00514,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.374975055223331e-05,
"kl": 4.920541919767857,
"learning_rate": 2.9987063691418976e-05,
"loss": 0.0012,
"num_tokens": 9625429.0,
"reward": 0.7693415880203247,
"reward_std": 0.4513539671897888,
"rewards/rollout_reward_func/mean": 0.7693415880203247,
"rewards/rollout_reward_func/std": 0.4429239332675934,
"sampling/importance_sampling_ratio/max": 1.0000171661376953,
"sampling/importance_sampling_ratio/mean": 0.9999936819076538,
"sampling/importance_sampling_ratio/min": 0.9998266100883484,
"sampling/sampling_logp_difference/max": 0.00015843386063352227,
"sampling/sampling_logp_difference/mean": 2.968661647173576e-06,
"step": 257,
"step_time": 10.483305359999576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 245.9375,
"completions/mean_terminated_length": 245.9375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.710858835707768e-05,
"epoch": 0.00516,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.126638537447434e-05,
"kl": 6.137430131435394,
"learning_rate": 2.9767832223345916e-05,
"loss": 0.0035,
"num_tokens": 9665099.0,
"reward": 0.36309152841567993,
"reward_std": 0.5653538107872009,
"rewards/rollout_reward_func/mean": 0.36309152841567993,
"rewards/rollout_reward_func/std": 0.5455530285835266,
"sampling/importance_sampling_ratio/max": 1.000001072883606,
"sampling/importance_sampling_ratio/mean": 0.9999943971633911,
"sampling/importance_sampling_ratio/min": 0.9999680519104004,
"sampling/sampling_logp_difference/max": 3.0160221285768785e-05,
"sampling/sampling_logp_difference/mean": 2.584924914117437e-06,
"step": 258,
"step_time": 11.087042529000428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 222.40625,
"completions/mean_terminated_length": 222.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.469891036748777e-05,
"epoch": 0.00518,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8265869584865868e-05,
"kl": 4.711505472660065,
"learning_rate": 2.9553200965505647e-05,
"loss": 0.0056,
"num_tokens": 9702437.0,
"reward": 0.7709207534790039,
"reward_std": 0.4500967860221863,
"rewards/rollout_reward_func/mean": 0.7709207534790039,
"rewards/rollout_reward_func/std": 0.4417072534561157,
"sampling/importance_sampling_ratio/max": 1.0000097751617432,
"sampling/importance_sampling_ratio/mean": 0.9999932646751404,
"sampling/importance_sampling_ratio/min": 0.9999553561210632,
"sampling/sampling_logp_difference/max": 4.124702900298871e-05,
"sampling/sampling_logp_difference/mean": 2.601570486149285e-06,
"step": 259,
"step_time": 10.336212901999716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 295.53125,
"completions/mean_terminated_length": 295.53125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.37967036804821e-05,
"epoch": 0.0052,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2233665731619112e-05,
"kl": 5.106555700302124,
"learning_rate": 2.9343200082380866e-05,
"loss": 0.009,
"num_tokens": 9742666.0,
"reward": 0.7430247068405151,
"reward_std": 0.5055485963821411,
"rewards/rollout_reward_func/mean": 0.7430247068405151,
"rewards/rollout_reward_func/std": 0.5186600685119629,
"sampling/importance_sampling_ratio/max": 1.000009298324585,
"sampling/importance_sampling_ratio/mean": 0.9999918937683105,
"sampling/importance_sampling_ratio/min": 0.9999312162399292,
"sampling/sampling_logp_difference/max": 6.461232260335237e-05,
"sampling/sampling_logp_difference/mean": 2.8128415578976274e-06,
"step": 260,
"step_time": 11.534494020001148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 316.65625,
"completions/mean_terminated_length": 316.65625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.77153407369724e-05,
"epoch": 0.00522,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.157628038432449e-05,
"kl": 5.550658762454987,
"learning_rate": 2.9137859087696982e-05,
"loss": 0.0096,
"num_tokens": 9783874.0,
"reward": 0.4912499785423279,
"reward_std": 0.5208456516265869,
"rewards/rollout_reward_func/mean": 0.4912499785423279,
"rewards/rollout_reward_func/std": 0.5062942504882812,
"sampling/importance_sampling_ratio/max": 1.0000102519989014,
"sampling/importance_sampling_ratio/mean": 0.9999921321868896,
"sampling/importance_sampling_ratio/min": 0.999884843826294,
"sampling/sampling_logp_difference/max": 0.00010276297689415514,
"sampling/sampling_logp_difference/mean": 3.043264314328553e-06,
"step": 261,
"step_time": 11.332538519000991
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 232.4375,
"completions/mean_terminated_length": 232.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.221832322239607e-05,
"epoch": 0.00524,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.8388696591719054e-05,
"kl": 5.240327179431915,
"learning_rate": 2.8937206840274185e-05,
"loss": -0.0008,
"num_tokens": 9822174.0,
"reward": 0.6765123605728149,
"reward_std": 0.46574604511260986,
"rewards/rollout_reward_func/mean": 0.6765123605728149,
"rewards/rollout_reward_func/std": 0.4871373772621155,
"sampling/importance_sampling_ratio/max": 1.0000027418136597,
"sampling/importance_sampling_ratio/mean": 0.9999960660934448,
"sampling/importance_sampling_ratio/min": 0.9999709725379944,
"sampling/sampling_logp_difference/max": 1.7762320567271672e-05,
"sampling/sampling_logp_difference/mean": 2.006978093049838e-06,
"step": 262,
"step_time": 11.319322815998476
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 162.5,
"completions/mean_terminated_length": 162.5,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 3.778687000988157e-05,
"epoch": 0.00526,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1966519195993897e-05,
"kl": 4.899482652544975,
"learning_rate": 2.8741271539971675e-05,
"loss": 0.0028,
"num_tokens": 9856162.0,
"reward": 0.6996707916259766,
"reward_std": 0.48187097907066345,
"rewards/rollout_reward_func/mean": 0.6996707916259766,
"rewards/rollout_reward_func/std": 0.47140640020370483,
"sampling/importance_sampling_ratio/max": 1.0000178813934326,
"sampling/importance_sampling_ratio/mean": 1.000000238418579,
"sampling/importance_sampling_ratio/min": 0.999981164932251,
"sampling/sampling_logp_difference/max": 1.8714017642196268e-05,
"sampling/sampling_logp_difference/mean": 1.450816625947482e-06,
"step": 263,
"step_time": 10.3198294609997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 199.28125,
"completions/mean_terminated_length": 199.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.570318305103683e-05,
"epoch": 0.00528,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6365134544903412e-05,
"kl": 4.995322413742542,
"learning_rate": 2.8550080723724342e-05,
"loss": 0.0073,
"num_tokens": 9891750.0,
"reward": 0.7674999833106995,
"reward_std": 0.44868531823158264,
"rewards/rollout_reward_func/mean": 0.7674999833106995,
"rewards/rollout_reward_func/std": 0.43893563747406006,
"sampling/importance_sampling_ratio/max": 1.0000247955322266,
"sampling/importance_sampling_ratio/mean": 0.9999992251396179,
"sampling/importance_sampling_ratio/min": 0.9999715089797974,
"sampling/sampling_logp_difference/max": 2.860890526790172e-05,
"sampling/sampling_logp_difference/mean": 2.4780854346317938e-06,
"step": 264,
"step_time": 11.099350021000191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 284.59375,
"completions/mean_terminated_length": 284.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.459991841254123e-05,
"epoch": 0.0053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00013145462435204536,
"kl": 4.65108397603035,
"learning_rate": 2.8363661261672758e-05,
"loss": 0.015,
"num_tokens": 9930243.0,
"reward": 0.7093415260314941,
"reward_std": 0.48638349771499634,
"rewards/rollout_reward_func/mean": 0.7093415260314941,
"rewards/rollout_reward_func/std": 0.4678354859352112,
"sampling/importance_sampling_ratio/max": 1.0000114440917969,
"sampling/importance_sampling_ratio/mean": 0.9999954700469971,
"sampling/importance_sampling_ratio/min": 0.9999157786369324,
"sampling/sampling_logp_difference/max": 6.986263178987429e-05,
"sampling/sampling_logp_difference/mean": 2.4661187580932165e-06,
"step": 265,
"step_time": 10.411334865998924
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 161.78125,
"completions/mean_terminated_length": 161.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.54108203374426e-05,
"epoch": 0.00532,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4415406440093648e-05,
"kl": 4.443069338798523,
"learning_rate": 2.8182039353386807e-05,
"loss": -0.0007,
"num_tokens": 9963996.0,
"reward": 0.764012336730957,
"reward_std": 0.5061614513397217,
"rewards/rollout_reward_func/mean": 0.764012336730957,
"rewards/rollout_reward_func/std": 0.506635308265686,
"sampling/importance_sampling_ratio/max": 1.00002920627594,
"sampling/importance_sampling_ratio/mean": 0.9999999403953552,
"sampling/importance_sampling_ratio/min": 0.9999759197235107,
"sampling/sampling_logp_difference/max": 3.4687778679654e-05,
"sampling/sampling_logp_difference/mean": 1.926028062371188e-06,
"step": 266,
"step_time": 11.121112325000013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 233.5625,
"completions/mean_terminated_length": 233.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.09017165121395e-05,
"epoch": 0.00534,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.388181357877329e-05,
"kl": 5.19572713971138,
"learning_rate": 2.800524052418356e-05,
"loss": -0.0123,
"num_tokens": 10001566.0,
"reward": 0.7383538484573364,
"reward_std": 0.4403288960456848,
"rewards/rollout_reward_func/mean": 0.7383538484573364,
"rewards/rollout_reward_func/std": 0.5228644013404846,
"sampling/importance_sampling_ratio/max": 1.0000412464141846,
"sampling/importance_sampling_ratio/mean": 0.9999992251396179,
"sampling/importance_sampling_ratio/min": 0.999980628490448,
"sampling/sampling_logp_difference/max": 3.74296578229405e-05,
"sampling/sampling_logp_difference/mean": 1.971904794118018e-06,
"step": 267,
"step_time": 10.228456085999824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 273.5625,
"completions/mean_terminated_length": 273.5625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.407312296299096e-05,
"epoch": 0.00536,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.846478699008003e-05,
"kl": 5.062362797558308,
"learning_rate": 2.7833289621539925e-05,
"loss": 0.0089,
"num_tokens": 10040557.0,
"reward": 0.6143415570259094,
"reward_std": 0.4798487722873688,
"rewards/rollout_reward_func/mean": 0.6143415570259094,
"rewards/rollout_reward_func/std": 0.5538265109062195,
"sampling/importance_sampling_ratio/max": 1.0000193119049072,
"sampling/importance_sampling_ratio/mean": 0.9999940395355225,
"sampling/importance_sampling_ratio/min": 0.9999433755874634,
"sampling/sampling_logp_difference/max": 5.412838072516024e-05,
"sampling/sampling_logp_difference/mean": 2.6531811272434425e-06,
"step": 268,
"step_time": 10.941937255999164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 231.4375,
"completions/mean_terminated_length": 231.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.330135456129483e-05,
"epoch": 0.00538,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.1427702197106555e-05,
"kl": 4.888749971985817,
"learning_rate": 2.766621081160059e-05,
"loss": 0.0038,
"num_tokens": 10077746.0,
"reward": 0.7699331045150757,
"reward_std": 0.42729663848876953,
"rewards/rollout_reward_func/mean": 0.7699331045150757,
"rewards/rollout_reward_func/std": 0.44109678268432617,
"sampling/importance_sampling_ratio/max": 1.0000017881393433,
"sampling/importance_sampling_ratio/mean": 0.9999940991401672,
"sampling/importance_sampling_ratio/min": 0.999950647354126,
"sampling/sampling_logp_difference/max": 5.555623647524044e-05,
"sampling/sampling_logp_difference/mean": 2.4010546439967584e-06,
"step": 269,
"step_time": 10.250042127999677
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 315.34375,
"completions/mean_terminated_length": 315.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.789425674786798e-05,
"epoch": 0.0054,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.233926690882072e-05,
"kl": 5.45307557284832,
"learning_rate": 2.7504027575781634e-05,
"loss": -0.0033,
"num_tokens": 10119423.0,
"reward": 0.6190123558044434,
"reward_std": 0.5169612169265747,
"rewards/rollout_reward_func/mean": 0.6190123558044434,
"rewards/rollout_reward_func/std": 0.5024634003639221,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999967813491821,
"sampling/importance_sampling_ratio/min": 0.9999703764915466,
"sampling/sampling_logp_difference/max": 3.6239842302165926e-05,
"sampling/sampling_logp_difference/mean": 2.813545734170475e-06,
"step": 270,
"step_time": 11.126338212000519
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 233.75,
"completions/mean_terminated_length": 233.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.70626255470097e-05,
"epoch": 0.00542,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3942422558320686e-05,
"kl": 5.092776149511337,
"learning_rate": 2.734676270747047e-05,
"loss": 0.0042,
"num_tokens": 10156839.0,
"reward": 0.6436830759048462,
"reward_std": 0.49129801988601685,
"rewards/rollout_reward_func/mean": 0.6436830759048462,
"rewards/rollout_reward_func/std": 0.4945172667503357,
"sampling/importance_sampling_ratio/max": 1.000022530555725,
"sampling/importance_sampling_ratio/mean": 0.9999971389770508,
"sampling/importance_sampling_ratio/min": 0.9999656677246094,
"sampling/sampling_logp_difference/max": 3.0277311452664435e-05,
"sampling/sampling_logp_difference/mean": 2.1845353330718353e-06,
"step": 271,
"step_time": 10.331348605999665
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 304.46875,
"completions/mean_terminated_length": 304.46875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.222742306112195e-05,
"epoch": 0.00544,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3053702787146904e-05,
"kl": 5.665085136890411,
"learning_rate": 2.7194438308822428e-05,
"loss": 0.0068,
"num_tokens": 10197931.0,
"reward": 0.5868415832519531,
"reward_std": 0.5270721912384033,
"rewards/rollout_reward_func/mean": 0.5868415832519531,
"rewards/rollout_reward_func/std": 0.5654386878013611,
"sampling/importance_sampling_ratio/max": 1.0000213384628296,
"sampling/importance_sampling_ratio/mean": 0.9999969005584717,
"sampling/importance_sampling_ratio/min": 0.9999679923057556,
"sampling/sampling_logp_difference/max": 3.0994589906185865e-05,
"sampling/sampling_logp_difference/mean": 2.5059268864424666e-06,
"step": 272,
"step_time": 11.521847475000413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 224.84375,
"completions/mean_terminated_length": 224.84375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.230691576547542e-05,
"epoch": 0.00546,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.916292553185485e-05,
"kl": 5.193328641355038,
"learning_rate": 2.7047075787654503e-05,
"loss": 0.0114,
"num_tokens": 10235147.0,
"reward": 0.6105915307998657,
"reward_std": 0.4785401225090027,
"rewards/rollout_reward_func/mean": 0.6105915307998657,
"rewards/rollout_reward_func/std": 0.4986937940120697,
"sampling/importance_sampling_ratio/max": 1.0000423192977905,
"sampling/importance_sampling_ratio/mean": 0.999998152256012,
"sampling/importance_sampling_ratio/min": 0.9999819993972778,
"sampling/sampling_logp_difference/max": 3.814511001110077e-05,
"sampling/sampling_logp_difference/mean": 1.9021531443286221e-06,
"step": 273,
"step_time": 11.075365086000602
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 269.34375,
"completions/mean_terminated_length": 269.34375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.955465923714655e-05,
"epoch": 0.00548,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.947480217902921e-05,
"kl": 4.885876089334488,
"learning_rate": 2.6904695854436662e-05,
"loss": 0.0102,
"num_tokens": 10273871.0,
"reward": 0.6780915260314941,
"reward_std": 0.501719057559967,
"rewards/rollout_reward_func/mean": 0.6780915260314941,
"rewards/rollout_reward_func/std": 0.4803001284599304,
"sampling/importance_sampling_ratio/max": 1.0000174045562744,
"sampling/importance_sampling_ratio/mean": 0.9999932050704956,
"sampling/importance_sampling_ratio/min": 0.9998982548713684,
"sampling/sampling_logp_difference/max": 0.00011146671022288501,
"sampling/sampling_logp_difference/mean": 2.8891031433886383e-06,
"step": 274,
"step_time": 10.701226339999266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 619.0,
"completions/max_terminated_length": 619.0,
"completions/mean_length": 241.25,
"completions/mean_terminated_length": 241.25,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.047394075560987e-05,
"epoch": 0.0055,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.9621622161357664e-05,
"kl": 4.531084284186363,
"learning_rate": 2.676731851938118e-05,
"loss": -0.0066,
"num_tokens": 10311647.0,
"reward": 0.8674999475479126,
"reward_std": 0.31253060698509216,
"rewards/rollout_reward_func/mean": 0.8674999475479126,
"rewards/rollout_reward_func/std": 0.3693324327468872,
"sampling/importance_sampling_ratio/max": 1.0000293254852295,
"sampling/importance_sampling_ratio/mean": 0.9999931454658508,
"sampling/importance_sampling_ratio/min": 0.9999464154243469,
"sampling/sampling_logp_difference/max": 3.874358662869781e-05,
"sampling/sampling_logp_difference/mean": 3.144346919725649e-06,
"step": 275,
"step_time": 10.410628370998893
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 284.75,
"completions/mean_terminated_length": 284.75,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.49887765607582e-05,
"epoch": 0.00552,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5303739676019177e-05,
"kl": 4.912029400467873,
"learning_rate": 2.663496308963041e-05,
"loss": -0.0006,
"num_tokens": 10350910.0,
"reward": 0.7405915260314941,
"reward_std": 0.479642391204834,
"rewards/rollout_reward_func/mean": 0.7405915260314941,
"rewards/rollout_reward_func/std": 0.4598024785518646,
"sampling/importance_sampling_ratio/max": 1.0000290870666504,
"sampling/importance_sampling_ratio/mean": 0.999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999715089797974,
"sampling/sampling_logp_difference/max": 3.612082946347073e-05,
"sampling/sampling_logp_difference/mean": 2.5878532596834702e-06,
"step": 276,
"step_time": 11.039417921998393
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 181.0625,
"completions/mean_terminated_length": 181.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.9210345821393275e-05,
"epoch": 0.00554,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9761475414270535e-05,
"kl": 5.184775277972221,
"learning_rate": 2.6507648166543308e-05,
"loss": 0.0078,
"num_tokens": 10386430.0,
"reward": 0.7021707892417908,
"reward_std": 0.48910099267959595,
"rewards/rollout_reward_func/mean": 0.7021707892417908,
"rewards/rollout_reward_func/std": 0.470414936542511,
"sampling/importance_sampling_ratio/max": 1.0000289678573608,
"sampling/importance_sampling_ratio/mean": 1.0000001192092896,
"sampling/importance_sampling_ratio/min": 0.9999823570251465,
"sampling/sampling_logp_difference/max": 2.503280120436102e-05,
"sampling/sampling_logp_difference/mean": 1.6454481510663754e-06,
"step": 277,
"step_time": 10.672157200998754
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 242.15625,
"completions/mean_terminated_length": 242.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.979084449199036e-05,
"epoch": 0.00556,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7508402126841247e-05,
"kl": 4.862967908382416,
"learning_rate": 2.63853916430812e-05,
"loss": -0.0024,
"num_tokens": 10424522.0,
"reward": 0.6446039080619812,
"reward_std": 0.5659518241882324,
"rewards/rollout_reward_func/mean": 0.6446039080619812,
"rewards/rollout_reward_func/std": 0.554857611656189,
"sampling/importance_sampling_ratio/max": 1.000006914138794,
"sampling/importance_sampling_ratio/mean": 0.9999958276748657,
"sampling/importance_sampling_ratio/min": 0.9999752640724182,
"sampling/sampling_logp_difference/max": 2.1934793039690703e-05,
"sampling/sampling_logp_difference/mean": 1.790413080016151e-06,
"step": 278,
"step_time": 10.83243991900008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 211.4375,
"completions/mean_terminated_length": 211.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.138452050663545e-05,
"epoch": 0.00558,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7026304451283067e-05,
"kl": 4.282514490187168,
"learning_rate": 2.6268210701293073e-05,
"loss": 0.0016,
"num_tokens": 10459611.0,
"reward": 0.7649331092834473,
"reward_std": 0.45170778036117554,
"rewards/rollout_reward_func/mean": 0.7649331092834473,
"rewards/rollout_reward_func/std": 0.44183656573295593,
"sampling/importance_sampling_ratio/max": 1.00001060962677,
"sampling/importance_sampling_ratio/mean": 0.9999970197677612,
"sampling/importance_sampling_ratio/min": 0.9999755620956421,
"sampling/sampling_logp_difference/max": 2.6465353585081175e-05,
"sampling/sampling_logp_difference/mean": 2.6112129489774816e-06,
"step": 279,
"step_time": 10.760200042999259
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 678.0,
"completions/max_terminated_length": 678.0,
"completions/mean_length": 202.6875,
"completions/mean_terminated_length": 202.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.559260192773309e-05,
"epoch": 0.0056,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.933691525366157e-05,
"kl": 5.390433177351952,
"learning_rate": 2.615612180990079e-05,
"loss": -0.0067,
"num_tokens": 10497249.0,
"reward": 0.6449999809265137,
"reward_std": 0.4967680275440216,
"rewards/rollout_reward_func/mean": 0.6449999809265137,
"rewards/rollout_reward_func/std": 0.4975034296512604,
"sampling/importance_sampling_ratio/max": 1.0000171661376953,
"sampling/importance_sampling_ratio/mean": 0.9999953508377075,
"sampling/importance_sampling_ratio/min": 0.9999397993087769,
"sampling/sampling_logp_difference/max": 6.330482574412599e-05,
"sampling/sampling_logp_difference/mean": 2.610198407637654e-06,
"step": 280,
"step_time": 10.605298713000593
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 276.4375,
"completions/mean_terminated_length": 276.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.060739913024918e-05,
"epoch": 0.00562,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.849232547916472e-05,
"kl": 5.0677084401249886,
"learning_rate": 2.60491407219846e-05,
"loss": -0.005,
"num_tokens": 10537347.0,
"reward": 0.7115123271942139,
"reward_std": 0.4817769229412079,
"rewards/rollout_reward_func/mean": 0.7115123271942139,
"rewards/rollout_reward_func/std": 0.4767700135707855,
"sampling/importance_sampling_ratio/max": 1.0000137090682983,
"sampling/importance_sampling_ratio/mean": 0.9999938607215881,
"sampling/importance_sampling_ratio/min": 0.9999478459358215,
"sampling/sampling_logp_difference/max": 4.4942185922991484e-05,
"sampling/sampling_logp_difference/mean": 2.9220987016742583e-06,
"step": 281,
"step_time": 10.972126118999768
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 232.40625,
"completions/mean_terminated_length": 232.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.892862940333089e-05,
"epoch": 0.00564,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8199409169028513e-05,
"kl": 5.05782288312912,
"learning_rate": 2.594728247276913e-05,
"loss": -0.0011,
"num_tokens": 10575586.0,
"reward": 0.6449331045150757,
"reward_std": 0.5124734044075012,
"rewards/rollout_reward_func/mean": 0.6449331045150757,
"rewards/rollout_reward_func/std": 0.4961819350719452,
"sampling/importance_sampling_ratio/max": 1.000025987625122,
"sampling/importance_sampling_ratio/mean": 0.9999963045120239,
"sampling/importance_sampling_ratio/min": 0.9999744296073914,
"sampling/sampling_logp_difference/max": 4.422439087647945e-05,
"sampling/sampling_logp_difference/mean": 2.5519543669361155e-06,
"step": 282,
"step_time": 10.717279948999021
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 220.8125,
"completions/mean_terminated_length": 220.8125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.212359551227564e-05,
"epoch": 0.00566,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.04080297407927e-05,
"kl": 5.217775493860245,
"learning_rate": 2.5850561377510356e-05,
"loss": 0.0129,
"num_tokens": 10613250.0,
"reward": 0.6746707558631897,
"reward_std": 0.4775552451610565,
"rewards/rollout_reward_func/mean": 0.6746707558631897,
"rewards/rollout_reward_func/std": 0.4804707467556,
"sampling/importance_sampling_ratio/max": 1.0000135898590088,
"sampling/importance_sampling_ratio/mean": 0.9999940395355225,
"sampling/importance_sampling_ratio/min": 0.9999485015869141,
"sampling/sampling_logp_difference/max": 5.3048734116600826e-05,
"sampling/sampling_logp_difference/mean": 2.8808067327190656e-06,
"step": 283,
"step_time": 10.839048934998573
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 291.15625,
"completions/mean_terminated_length": 291.15625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.180986020216551e-05,
"epoch": 0.00568,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.043676249741111e-05,
"kl": 5.018721252679825,
"learning_rate": 2.5758991029483713e-05,
"loss": 0.0081,
"num_tokens": 10653230.0,
"reward": 0.6168415546417236,
"reward_std": 0.5035527944564819,
"rewards/rollout_reward_func/mean": 0.6168415546417236,
"rewards/rollout_reward_func/std": 0.49877506494522095,
"sampling/importance_sampling_ratio/max": 1.0000146627426147,
"sampling/importance_sampling_ratio/mean": 0.9999990463256836,
"sampling/importance_sampling_ratio/min": 0.9999721050262451,
"sampling/sampling_logp_difference/max": 2.6345524020143785e-05,
"sampling/sampling_logp_difference/mean": 2.221646809630329e-06,
"step": 284,
"step_time": 11.054412653003055
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 275.78125,
"completions/mean_terminated_length": 275.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.33028123928625e-05,
"epoch": 0.0057,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.267479329835624e-05,
"kl": 4.437944404780865,
"learning_rate": 2.5672584298073688e-05,
"loss": 0.024,
"num_tokens": 10690936.0,
"reward": 0.6755915284156799,
"reward_std": 0.4725848436355591,
"rewards/rollout_reward_func/mean": 0.6755915284156799,
"rewards/rollout_reward_func/std": 0.47646206617355347,
"sampling/importance_sampling_ratio/max": 1.000028133392334,
"sampling/importance_sampling_ratio/mean": 0.9999963641166687,
"sampling/importance_sampling_ratio/min": 0.9999643564224243,
"sampling/sampling_logp_difference/max": 2.8133623345638625e-05,
"sampling/sampling_logp_difference/mean": 2.420891178189777e-06,
"step": 285,
"step_time": 10.956865436998669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 305.09375,
"completions/mean_terminated_length": 305.09375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.367031156491066e-05,
"epoch": 0.00572,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3896298444014974e-05,
"kl": 5.410068087279797,
"learning_rate": 2.5591353326965118e-05,
"loss": 0.0064,
"num_tokens": 10731982.0,
"reward": 0.5849331617355347,
"reward_std": 0.5261651873588562,
"rewards/rollout_reward_func/mean": 0.5849331617355347,
"rewards/rollout_reward_func/std": 0.5046738386154175,
"sampling/importance_sampling_ratio/max": 1.0000083446502686,
"sampling/importance_sampling_ratio/mean": 0.9999963045120239,
"sampling/importance_sampling_ratio/min": 0.9999761581420898,
"sampling/sampling_logp_difference/max": 2.6822675863513723e-05,
"sampling/sampling_logp_difference/mean": 2.169199888157891e-06,
"step": 286,
"step_time": 11.055283683998823
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 609.0,
"completions/max_terminated_length": 609.0,
"completions/mean_length": 173.90625,
"completions/mean_terminated_length": 173.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.815319975965849e-05,
"epoch": 0.00574,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.01816469093319e-05,
"kl": 4.101704500615597,
"learning_rate": 2.551530953243656e-05,
"loss": 0.0116,
"num_tokens": 10765955.0,
"reward": 0.7652623653411865,
"reward_std": 0.4456794261932373,
"rewards/rollout_reward_func/mean": 0.7652623653411865,
"rewards/rollout_reward_func/std": 0.4375646412372589,
"sampling/importance_sampling_ratio/max": 1.0000256299972534,
"sampling/importance_sampling_ratio/mean": 0.999998927116394,
"sampling/importance_sampling_ratio/min": 0.9999776482582092,
"sampling/sampling_logp_difference/max": 3.719073720276356e-05,
"sampling/sampling_logp_difference/mean": 2.711852630454814e-06,
"step": 287,
"step_time": 10.189362570999037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 260.90625,
"completions/mean_terminated_length": 260.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.0160799051800495e-05,
"epoch": 0.00576,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.7769527150667273e-05,
"kl": 4.67512171715498,
"learning_rate": 2.5444463601755776e-05,
"loss": 0.0217,
"num_tokens": 10803410.0,
"reward": 0.7071707248687744,
"reward_std": 0.45949870347976685,
"rewards/rollout_reward_func/mean": 0.7071707248687744,
"rewards/rollout_reward_func/std": 0.46562138199806213,
"sampling/importance_sampling_ratio/max": 1.0000154972076416,
"sampling/importance_sampling_ratio/mean": 0.999997615814209,
"sampling/importance_sampling_ratio/min": 0.9999765753746033,
"sampling/sampling_logp_difference/max": 4.637298115994781e-05,
"sampling/sampling_logp_difference/mean": 2.2777503545512445e-06,
"step": 288,
"step_time": 10.624308804000066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 284.28125,
"completions/mean_terminated_length": 284.28125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.007407710408643e-05,
"epoch": 0.00578,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.265037129400298e-05,
"kl": 5.380328759551048,
"learning_rate": 2.537882549167776e-05,
"loss": -0.0069,
"num_tokens": 10843642.0,
"reward": 0.5521038770675659,
"reward_std": 0.5371196866035461,
"rewards/rollout_reward_func/mean": 0.5521038770675659,
"rewards/rollout_reward_func/std": 0.5132983922958374,
"sampling/importance_sampling_ratio/max": 1.0000168085098267,
"sampling/importance_sampling_ratio/mean": 0.999995231628418,
"sampling/importance_sampling_ratio/min": 0.9999474883079529,
"sampling/sampling_logp_difference/max": 5.6267705076606944e-05,
"sampling/sampling_logp_difference/mean": 3.2138741516973823e-06,
"step": 289,
"step_time": 11.017298460002166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 242.78125,
"completions/mean_terminated_length": 242.78125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.603282352874885e-05,
"epoch": 0.0058,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.560553159331903e-05,
"kl": 5.179336726665497,
"learning_rate": 2.531840442704543e-05,
"loss": 0.0113,
"num_tokens": 10881481.0,
"reward": 0.674012303352356,
"reward_std": 0.47830620408058167,
"rewards/rollout_reward_func/mean": 0.674012303352356,
"rewards/rollout_reward_func/std": 0.48066821694374084,
"sampling/importance_sampling_ratio/max": 1.0000087022781372,
"sampling/importance_sampling_ratio/mean": 0.9999942779541016,
"sampling/importance_sampling_ratio/min": 0.9999580383300781,
"sampling/sampling_logp_difference/max": 4.435054142959416e-05,
"sampling/sampling_logp_difference/mean": 2.450002966725151e-06,
"step": 290,
"step_time": 10.734974283997872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 248.59375,
"completions/mean_terminated_length": 248.59375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 5.6176058649271e-05,
"epoch": 0.00582,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4972263418021612e-05,
"kl": 4.305257134139538,
"learning_rate": 2.5263208899493117e-05,
"loss": 0.0144,
"num_tokens": 10917982.0,
"reward": 0.8005915284156799,
"reward_std": 0.41739755868911743,
"rewards/rollout_reward_func/mean": 0.8005915284156799,
"rewards/rollout_reward_func/std": 0.41858813166618347,
"sampling/importance_sampling_ratio/max": 1.0000085830688477,
"sampling/importance_sampling_ratio/mean": 0.9999980926513672,
"sampling/importance_sampling_ratio/min": 0.999978244304657,
"sampling/sampling_logp_difference/max": 1.966987292689737e-05,
"sampling/sampling_logp_difference/mean": 1.6015185337892035e-06,
"step": 291,
"step_time": 10.732529017999695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 241.875,
"completions/mean_terminated_length": 241.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.879591774440087e-05,
"epoch": 0.00584,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.2613104849588126e-05,
"kl": 5.317075669765472,
"learning_rate": 2.5213246666253165e-05,
"loss": 0.0006,
"num_tokens": 10956087.0,
"reward": 0.6459207534790039,
"reward_std": 0.4879300892353058,
"rewards/rollout_reward_func/mean": 0.6459207534790039,
"rewards/rollout_reward_func/std": 0.49300137162208557,
"sampling/importance_sampling_ratio/max": 1.0000144243240356,
"sampling/importance_sampling_ratio/mean": 0.9999915361404419,
"sampling/importance_sampling_ratio/min": 0.9999055862426758,
"sampling/sampling_logp_difference/max": 9.120126196648926e-05,
"sampling/sampling_logp_difference/mean": 2.860357653844403e-06,
"step": 292,
"step_time": 10.913091708997854
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 270.90625,
"completions/mean_terminated_length": 270.90625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 8.648679309430918e-05,
"epoch": 0.00586,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.2432013540528715e-05,
"kl": 5.279637806117535,
"learning_rate": 2.5168524749065723e-05,
"loss": 0.012,
"num_tokens": 10995272.0,
"reward": 0.7109207510948181,
"reward_std": 0.4653802514076233,
"rewards/rollout_reward_func/mean": 0.7109207510948181,
"rewards/rollout_reward_func/std": 0.4689246416091919,
"sampling/importance_sampling_ratio/max": 1.0000152587890625,
"sampling/importance_sampling_ratio/mean": 0.999992847442627,
"sampling/importance_sampling_ratio/min": 0.9998891353607178,
"sampling/sampling_logp_difference/max": 9.370438783662394e-05,
"sampling/sampling_logp_difference/mean": 2.75289994533523e-06,
"step": 293,
"step_time": 10.773192945001028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 250.625,
"completions/mean_terminated_length": 250.625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 7.307361252628652e-05,
"epoch": 0.00588,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2651251381612383e-05,
"kl": 4.669016398489475,
"learning_rate": 2.5129049433191904e-05,
"loss": 0.0081,
"num_tokens": 11034020.0,
"reward": 0.7430915832519531,
"reward_std": 0.461855411529541,
"rewards/rollout_reward_func/mean": 0.7430915832519531,
"rewards/rollout_reward_func/std": 0.45573315024375916,
"sampling/importance_sampling_ratio/max": 1.000016212463379,
"sampling/importance_sampling_ratio/mean": 0.9999939203262329,
"sampling/importance_sampling_ratio/min": 0.9999682903289795,
"sampling/sampling_logp_difference/max": 3.1233001209329814e-05,
"sampling/sampling_logp_difference/mean": 2.4066905552899698e-06,
"step": 294,
"step_time": 11.02961987100025
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 732.0,
"completions/max_terminated_length": 732.0,
"completions/mean_length": 209.40625,
"completions/mean_terminated_length": 209.40625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.992972093020853e-05,
"epoch": 0.0059,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.169368240691256e-06,
"kl": 5.3457541689276695,
"learning_rate": 2.509482626653043e-05,
"loss": 0.0057,
"num_tokens": 11070773.0,
"reward": 0.5475000143051147,
"reward_std": 0.4948720335960388,
"rewards/rollout_reward_func/mean": 0.5475000143051147,
"rewards/rollout_reward_func/std": 0.5076383948326111,
"sampling/importance_sampling_ratio/max": 1.000032663345337,
"sampling/importance_sampling_ratio/mean": 0.9999977946281433,
"sampling/importance_sampling_ratio/min": 0.9999710321426392,
"sampling/sampling_logp_difference/max": 4.434179572854191e-05,
"sampling/sampling_logp_difference/mean": 2.625229171826504e-06,
"step": 295,
"step_time": 11.026958206001837
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 696.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 166.6875,
"completions/mean_terminated_length": 166.6875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 3.824376666727858e-05,
"epoch": 0.00592,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7649032088229433e-05,
"kl": 4.179639674723148,
"learning_rate": 2.506586005883795e-05,
"loss": -0.0004,
"num_tokens": 11103634.0,
"reward": 0.7930915355682373,
"reward_std": 0.42395704984664917,
"rewards/rollout_reward_func/mean": 0.7930915355682373,
"rewards/rollout_reward_func/std": 0.4206209182739258,
"sampling/importance_sampling_ratio/max": 1.0000280141830444,
"sampling/importance_sampling_ratio/mean": 0.9999994039535522,
"sampling/importance_sampling_ratio/min": 0.999958336353302,
"sampling/sampling_logp_difference/max": 3.063714029849507e-05,
"sampling/sampling_logp_difference/mean": 1.612040250620339e-06,
"step": 296,
"step_time": 10.490348910001558
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 258.3125,
"completions/mean_terminated_length": 258.3125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 9.468211831631379e-05,
"epoch": 0.00594,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.705732812406495e-05,
"kl": 4.647277727723122,
"learning_rate": 2.5042154881053053e-05,
"loss": -0.0104,
"num_tokens": 11142661.0,
"reward": 0.8065123558044434,
"reward_std": 0.3456747829914093,
"rewards/rollout_reward_func/mean": 0.8065123558044434,
"rewards/rollout_reward_func/std": 0.42172476649284363,
"sampling/importance_sampling_ratio/max": 1.0000272989273071,
"sampling/importance_sampling_ratio/mean": 0.9999910593032837,
"sampling/importance_sampling_ratio/min": 0.9999492168426514,
"sampling/sampling_logp_difference/max": 4.67304780613631e-05,
"sampling/sampling_logp_difference/mean": 3.2617106171528576e-06,
"step": 297,
"step_time": 10.858166022999285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 251.875,
"completions/mean_terminated_length": 251.875,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.953562817102465e-05,
"epoch": 0.00596,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.815532105974853e-05,
"kl": 5.288907490670681,
"learning_rate": 2.5023714064724153e-05,
"loss": -0.0039,
"num_tokens": 11181771.0,
"reward": 0.5518415570259094,
"reward_std": 0.5265346169471741,
"rewards/rollout_reward_func/mean": 0.5518415570259094,
"rewards/rollout_reward_func/std": 0.5117542147636414,
"sampling/importance_sampling_ratio/max": 1.0000152587890625,
"sampling/importance_sampling_ratio/mean": 0.9999966621398926,
"sampling/importance_sampling_ratio/min": 0.9999724626541138,
"sampling/sampling_logp_difference/max": 2.8370497602736577e-05,
"sampling/sampling_logp_difference/mean": 2.1110099623911083e-06,
"step": 298,
"step_time": 10.510601238000163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 741.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 206.4375,
"completions/mean_terminated_length": 206.4375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 6.404310158814042e-05,
"epoch": 0.00598,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.147389699937776e-05,
"kl": 4.263456001877785,
"learning_rate": 2.5010540201541244e-05,
"loss": -0.0012,
"num_tokens": 11217402.0,
"reward": 0.798420786857605,
"reward_std": 0.3639390766620636,
"rewards/rollout_reward_func/mean": 0.798420786857605,
"rewards/rollout_reward_func/std": 0.41741809248924255,
"sampling/importance_sampling_ratio/max": 1.0000015497207642,
"sampling/importance_sampling_ratio/mean": 0.9999944567680359,
"sampling/importance_sampling_ratio/min": 0.9999539852142334,
"sampling/sampling_logp_difference/max": 5.006824358133599e-05,
"sampling/sampling_logp_difference/mean": 2.4876530915207695e-06,
"step": 299,
"step_time": 11.449250560001019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 723.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 128.0625,
"completions/mean_terminated_length": 128.0625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 4.052700117540553e-05,
"epoch": 0.006,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.4804732422344387e-05,
"kl": 4.577869638800621,
"learning_rate": 2.5002635142971693e-05,
"loss": -0.0049,
"num_tokens": 11249250.0,
"reward": 0.8246707916259766,
"reward_std": 0.33838966488838196,
"rewards/rollout_reward_func/mean": 0.8246707916259766,
"rewards/rollout_reward_func/std": 0.3964185118675232,
"sampling/importance_sampling_ratio/max": 1.0000133514404297,
"sampling/importance_sampling_ratio/mean": 0.999998152256012,
"sampling/importance_sampling_ratio/min": 0.9999639987945557,
"sampling/sampling_logp_difference/max": 3.254435432609171e-05,
"sampling/sampling_logp_difference/mean": 1.622052195671131e-06,
"step": 300,
"step_time": 10.629190601000118
}
],
"logging_steps": 1.0,
"max_steps": 300,
"num_input_tokens_seen": 11249250,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}