ginrummy-local-evaluation / trainer_state.json
Jordansky's picture
Training 3h - gin_rummy
2bccdaa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00424,
"eval_steps": 500,
"global_step": 106,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9274.0,
"completions/max_terminated_length": 9274.0,
"completions/mean_length": 8544.65625,
"completions/mean_terminated_length": 8544.65625,
"completions/min_length": 5146.0,
"completions/min_terminated_length": 5146.0,
"entropy": 0.10155441798269749,
"epoch": 4e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3302725553512573,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0468,
"num_tokens": 300475.0,
"reward": -0.43501541018486023,
"reward_std": 0.25424131751060486,
"rewards/rollout_reward_func/mean": -0.43501541018486023,
"rewards/rollout_reward_func/std": 0.3262947201728821,
"sampling/importance_sampling_ratio/max": 1.4890494346618652,
"sampling/importance_sampling_ratio/mean": 0.9993953704833984,
"sampling/importance_sampling_ratio/min": 0.5384275317192078,
"sampling/sampling_logp_difference/max": 0.6191023588180542,
"sampling/sampling_logp_difference/mean": 0.009896567091345787,
"step": 1,
"step_time": 99.01410378399987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9192.0,
"completions/max_terminated_length": 9192.0,
"completions/mean_length": 6776.0625,
"completions/mean_terminated_length": 6776.0625,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.0874525704421103,
"epoch": 8e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6579537391662598,
"kl": 0.0,
"learning_rate": 2.2857142857142855e-07,
"loss": -0.1968,
"num_tokens": 544729.0,
"reward": -0.2837252914905548,
"reward_std": 0.5439483523368835,
"rewards/rollout_reward_func/mean": -0.2837252914905548,
"rewards/rollout_reward_func/std": 0.6020905375480652,
"sampling/importance_sampling_ratio/max": 1.7989556789398193,
"sampling/importance_sampling_ratio/mean": 0.9998900890350342,
"sampling/importance_sampling_ratio/min": 0.45409083366394043,
"sampling/sampling_logp_difference/max": 0.7894580364227295,
"sampling/sampling_logp_difference/mean": 0.01010945439338684,
"step": 2,
"step_time": 87.14777932000197
},
{
"clip_ratio/high_max": 0.004603212466463447,
"clip_ratio/high_mean": 0.0023016062332317233,
"clip_ratio/low_mean": 0.0018652374274097383,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004166843631537631,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9212.0,
"completions/max_terminated_length": 9212.0,
"completions/mean_length": 7882.5625,
"completions/mean_terminated_length": 7882.5625,
"completions/min_length": 965.0,
"completions/min_terminated_length": 965.0,
"entropy": 0.09430563636124134,
"epoch": 0.00012,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.605349063873291,
"kl": 0.0011047614534618333,
"learning_rate": 4.571428571428571e-07,
"loss": -0.1305,
"num_tokens": 823996.0,
"reward": -0.4043263792991638,
"reward_std": 0.3804655075073242,
"rewards/rollout_reward_func/mean": -0.4043263792991638,
"rewards/rollout_reward_func/std": 0.40240201354026794,
"sampling/importance_sampling_ratio/max": 1.976324200630188,
"sampling/importance_sampling_ratio/mean": 0.9999128580093384,
"sampling/importance_sampling_ratio/min": 0.3961387276649475,
"sampling/sampling_logp_difference/max": 0.9259908199310303,
"sampling/sampling_logp_difference/mean": 0.009135385975241661,
"step": 3,
"step_time": 94.82623126800036
},
{
"clip_ratio/high_max": 0.0036170141538605094,
"clip_ratio/high_mean": 0.0018085070769302547,
"clip_ratio/low_mean": 0.003806174616329372,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005614681693259627,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8937.0,
"completions/max_terminated_length": 8937.0,
"completions/mean_length": 7744.6875,
"completions/mean_terminated_length": 7744.6875,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.1019767300458625,
"epoch": 0.00016,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7453252077102661,
"kl": 0.001767605485298418,
"learning_rate": 6.857142857142857e-07,
"loss": -0.1059,
"num_tokens": 1099670.0,
"reward": -0.42792099714279175,
"reward_std": 0.40188515186309814,
"rewards/rollout_reward_func/mean": -0.42792099714279175,
"rewards/rollout_reward_func/std": 0.44791290163993835,
"sampling/importance_sampling_ratio/max": 2.167020559310913,
"sampling/importance_sampling_ratio/mean": 1.0016282796859741,
"sampling/importance_sampling_ratio/min": 0.5168222784996033,
"sampling/sampling_logp_difference/max": 0.7733532190322876,
"sampling/sampling_logp_difference/mean": 0.011834039352834225,
"step": 4,
"step_time": 93.63879696800086
},
{
"clip_ratio/high_max": 0.0066998383263126016,
"clip_ratio/high_mean": 0.0040601465734653175,
"clip_ratio/low_mean": 0.00286985796992667,
"clip_ratio/low_min": 0.0007102272938936949,
"clip_ratio/region_mean": 0.006930004572495818,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9072.0,
"completions/max_terminated_length": 9072.0,
"completions/mean_length": 7945.375,
"completions/mean_terminated_length": 7945.375,
"completions/min_length": 1143.0,
"completions/min_terminated_length": 1143.0,
"entropy": 0.10537219722755253,
"epoch": 0.0002,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5562094449996948,
"kl": 0.0011164914703840623,
"learning_rate": 9.142857142857142e-07,
"loss": -0.0856,
"num_tokens": 1381159.0,
"reward": -0.44396543502807617,
"reward_std": 0.2552921175956726,
"rewards/rollout_reward_func/mean": -0.44396543502807617,
"rewards/rollout_reward_func/std": 0.3181779980659485,
"sampling/importance_sampling_ratio/max": 1.9132328033447266,
"sampling/importance_sampling_ratio/mean": 0.9998804330825806,
"sampling/importance_sampling_ratio/min": 0.5040712952613831,
"sampling/sampling_logp_difference/max": 0.6850376129150391,
"sampling/sampling_logp_difference/mean": 0.011912481859326363,
"step": 5,
"step_time": 95.56573534999961
},
{
"clip_ratio/high_max": 0.005053992324974388,
"clip_ratio/high_mean": 0.002878119732486084,
"clip_ratio/low_mean": 0.001978989952476695,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00485710974317044,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9032.0,
"completions/max_terminated_length": 9032.0,
"completions/mean_length": 7585.09375,
"completions/mean_terminated_length": 7585.09375,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"entropy": 0.08556636655703187,
"epoch": 0.00024,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3749723434448242,
"kl": 0.000942698896778893,
"learning_rate": 1.1428571428571428e-06,
"loss": -0.2173,
"num_tokens": 1651286.0,
"reward": -0.41596484184265137,
"reward_std": 0.29581165313720703,
"rewards/rollout_reward_func/mean": -0.41596484184265137,
"rewards/rollout_reward_func/std": 0.38508185744285583,
"sampling/importance_sampling_ratio/max": 2.4631097316741943,
"sampling/importance_sampling_ratio/mean": 1.0003317594528198,
"sampling/importance_sampling_ratio/min": 0.4878324866294861,
"sampling/sampling_logp_difference/max": 0.9014246463775635,
"sampling/sampling_logp_difference/mean": 0.009676506742835045,
"step": 6,
"step_time": 91.97893484500082
},
{
"clip_ratio/high_max": 0.007268621528055519,
"clip_ratio/high_mean": 0.004720250406535342,
"clip_ratio/low_mean": 0.0014985245070420206,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006218774913577363,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9210.0,
"completions/max_terminated_length": 9210.0,
"completions/mean_length": 8051.28125,
"completions/mean_terminated_length": 8051.28125,
"completions/min_length": 278.0,
"completions/min_terminated_length": 278.0,
"entropy": 0.11584594147279859,
"epoch": 0.00028,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4247251749038696,
"kl": 0.0014235296439437661,
"learning_rate": 1.3714285714285715e-06,
"loss": -0.1468,
"num_tokens": 1936745.0,
"reward": -0.5240695476531982,
"reward_std": 0.3336127996444702,
"rewards/rollout_reward_func/mean": -0.5240695476531982,
"rewards/rollout_reward_func/std": 0.3330491781234741,
"sampling/importance_sampling_ratio/max": 1.7502962350845337,
"sampling/importance_sampling_ratio/mean": 0.9991633892059326,
"sampling/importance_sampling_ratio/min": 0.3610191345214844,
"sampling/sampling_logp_difference/max": 1.0188243389129639,
"sampling/sampling_logp_difference/mean": 0.011139116249978542,
"step": 7,
"step_time": 95.33587853800009
},
{
"clip_ratio/high_max": 0.007413623738102615,
"clip_ratio/high_mean": 0.0037068118690513074,
"clip_ratio/low_mean": 0.0014727154921274632,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005179527361178771,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9325.0,
"completions/max_terminated_length": 9325.0,
"completions/mean_length": 7462.8125,
"completions/mean_terminated_length": 7462.8125,
"completions/min_length": 640.0,
"completions/min_terminated_length": 640.0,
"entropy": 0.08036898588761687,
"epoch": 0.00032,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6594958305358887,
"kl": 0.0006596859343517281,
"learning_rate": 1.6e-06,
"loss": -0.0556,
"num_tokens": 2203101.0,
"reward": -0.35628482699394226,
"reward_std": 0.4496096074581146,
"rewards/rollout_reward_func/mean": -0.35628482699394226,
"rewards/rollout_reward_func/std": 0.5717853307723999,
"sampling/importance_sampling_ratio/max": 1.823210597038269,
"sampling/importance_sampling_ratio/mean": 1.0012201070785522,
"sampling/importance_sampling_ratio/min": 0.6447485089302063,
"sampling/sampling_logp_difference/max": 0.6005990505218506,
"sampling/sampling_logp_difference/mean": 0.009084178134799004,
"step": 8,
"step_time": 91.30933555899946
},
{
"clip_ratio/high_max": 0.008599455235525966,
"clip_ratio/high_mean": 0.004299727617762983,
"clip_ratio/low_mean": 0.0007231474155560136,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005022875062422827,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9116.0,
"completions/max_terminated_length": 9116.0,
"completions/mean_length": 7942.28125,
"completions/mean_terminated_length": 7942.28125,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"entropy": 0.09118380001746118,
"epoch": 0.00036,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2313306331634521,
"kl": 0.0009252334230041015,
"learning_rate": 1.8285714285714284e-06,
"loss": -0.2388,
"num_tokens": 2485072.0,
"reward": -0.5225930213928223,
"reward_std": 0.2826329469680786,
"rewards/rollout_reward_func/mean": -0.5225930213928223,
"rewards/rollout_reward_func/std": 0.29529422521591187,
"sampling/importance_sampling_ratio/max": 1.675890326499939,
"sampling/importance_sampling_ratio/mean": 0.998712420463562,
"sampling/importance_sampling_ratio/min": 0.4924771189689636,
"sampling/sampling_logp_difference/max": 0.7083072662353516,
"sampling/sampling_logp_difference/mean": 0.00942724198102951,
"step": 9,
"step_time": 95.65081479300034
},
{
"clip_ratio/high_max": 0.0057004125555977225,
"clip_ratio/high_mean": 0.0028502062777988613,
"clip_ratio/low_mean": 0.001784498308552429,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.00463470458635129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9102.0,
"completions/max_terminated_length": 9102.0,
"completions/mean_length": 7725.46875,
"completions/mean_terminated_length": 7725.46875,
"completions/min_length": 1577.0,
"completions/min_terminated_length": 1577.0,
"entropy": 0.10202601249329746,
"epoch": 0.0004,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2979556322097778,
"kl": 0.0008248507838288788,
"learning_rate": 2.057142857142857e-06,
"loss": 0.0504,
"num_tokens": 2759975.0,
"reward": -0.4137440621852875,
"reward_std": 0.4455605745315552,
"rewards/rollout_reward_func/mean": -0.4137440621852875,
"rewards/rollout_reward_func/std": 0.5681707859039307,
"sampling/importance_sampling_ratio/max": 1.804335355758667,
"sampling/importance_sampling_ratio/mean": 1.0012147426605225,
"sampling/importance_sampling_ratio/min": 0.42815232276916504,
"sampling/sampling_logp_difference/max": 0.8482762575149536,
"sampling/sampling_logp_difference/mean": 0.011439252644777298,
"step": 10,
"step_time": 94.30586862800192
},
{
"clip_ratio/high_max": 0.005768664239440113,
"clip_ratio/high_mean": 0.003239445824874565,
"clip_ratio/low_mean": 0.0021694422175642103,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005408888071542606,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9448.0,
"completions/max_terminated_length": 9448.0,
"completions/mean_length": 8284.78125,
"completions/mean_terminated_length": 8284.78125,
"completions/min_length": 3093.0,
"completions/min_terminated_length": 3093.0,
"entropy": 0.08923958241939545,
"epoch": 0.00044,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.318570613861084,
"kl": 0.0008534059998055454,
"learning_rate": 2.2857142857142856e-06,
"loss": -0.0948,
"num_tokens": 3052665.0,
"reward": -0.5442342162132263,
"reward_std": 0.23896004259586334,
"rewards/rollout_reward_func/mean": -0.5442342162132263,
"rewards/rollout_reward_func/std": 0.3197546601295471,
"sampling/importance_sampling_ratio/max": 1.7523086071014404,
"sampling/importance_sampling_ratio/mean": 1.0002540349960327,
"sampling/importance_sampling_ratio/min": 0.30823758244514465,
"sampling/sampling_logp_difference/max": 1.176884412765503,
"sampling/sampling_logp_difference/mean": 0.010090906172990799,
"step": 11,
"step_time": 99.13019177800106
},
{
"clip_ratio/high_max": 0.005701107264030725,
"clip_ratio/high_mean": 0.004259218374500051,
"clip_ratio/low_mean": 0.0021850839839316905,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006444302358431742,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9421.0,
"completions/max_terminated_length": 9421.0,
"completions/mean_length": 8058.28125,
"completions/mean_terminated_length": 8058.28125,
"completions/min_length": 2072.0,
"completions/min_terminated_length": 2072.0,
"entropy": 0.10937830060720444,
"epoch": 0.00048,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4182522296905518,
"kl": 0.0009953968228728627,
"learning_rate": 2.5142857142857142e-06,
"loss": -0.0408,
"num_tokens": 3337753.0,
"reward": -0.47211629152297974,
"reward_std": 0.33418089151382446,
"rewards/rollout_reward_func/mean": -0.47211629152297974,
"rewards/rollout_reward_func/std": 0.3411623537540436,
"sampling/importance_sampling_ratio/max": 1.9659794569015503,
"sampling/importance_sampling_ratio/mean": 1.00056791305542,
"sampling/importance_sampling_ratio/min": 0.4571618437767029,
"sampling/sampling_logp_difference/max": 0.7827178239822388,
"sampling/sampling_logp_difference/mean": 0.011176912114024162,
"step": 12,
"step_time": 99.95414250199883
},
{
"clip_ratio/high_max": 0.006522299780044705,
"clip_ratio/high_mean": 0.00396738713607192,
"clip_ratio/low_mean": 0.003070202248636633,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007037589413812384,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9322.0,
"completions/max_terminated_length": 9322.0,
"completions/mean_length": 7458.75,
"completions/mean_terminated_length": 7458.75,
"completions/min_length": 404.0,
"completions/min_terminated_length": 404.0,
"entropy": 0.10976240411400795,
"epoch": 0.00052,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.612453579902649,
"kl": 0.0007661073350391234,
"learning_rate": 2.742857142857143e-06,
"loss": -0.2568,
"num_tokens": 3603216.0,
"reward": -0.5523585081100464,
"reward_std": 0.3823288381099701,
"rewards/rollout_reward_func/mean": -0.5523585081100464,
"rewards/rollout_reward_func/std": 0.4409903883934021,
"sampling/importance_sampling_ratio/max": 1.8682420253753662,
"sampling/importance_sampling_ratio/mean": 0.9997111558914185,
"sampling/importance_sampling_ratio/min": 0.6721150875091553,
"sampling/sampling_logp_difference/max": 0.6249978542327881,
"sampling/sampling_logp_difference/mean": 0.011141350492835045,
"step": 13,
"step_time": 91.4230538769998
},
{
"clip_ratio/high_max": 0.004288507916498929,
"clip_ratio/high_mean": 0.0021442539582494646,
"clip_ratio/low_mean": 0.0032669483334757388,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005411202291725203,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8990.0,
"completions/max_terminated_length": 8990.0,
"completions/mean_length": 7743.78125,
"completions/mean_terminated_length": 7743.78125,
"completions/min_length": 2922.0,
"completions/min_terminated_length": 2922.0,
"entropy": 0.09799129283055663,
"epoch": 0.00056,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2676411867141724,
"kl": 0.0006998809913056903,
"learning_rate": 2.9714285714285716e-06,
"loss": -0.0431,
"num_tokens": 3878088.0,
"reward": -0.4721910059452057,
"reward_std": 0.4045681357383728,
"rewards/rollout_reward_func/mean": -0.4721910059452057,
"rewards/rollout_reward_func/std": 0.4375231862068176,
"sampling/importance_sampling_ratio/max": 1.5526798963546753,
"sampling/importance_sampling_ratio/mean": 0.9998083114624023,
"sampling/importance_sampling_ratio/min": 0.5838956236839294,
"sampling/sampling_logp_difference/max": 0.5380330085754395,
"sampling/sampling_logp_difference/mean": 0.009330052882432938,
"step": 14,
"step_time": 96.34136769400084
},
{
"clip_ratio/high_max": 0.0007102272938936949,
"clip_ratio/high_mean": 0.00035511364694684744,
"clip_ratio/low_mean": 0.001096121472073719,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014512351190205663,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9132.0,
"completions/max_terminated_length": 9132.0,
"completions/mean_length": 8004.5625,
"completions/mean_terminated_length": 8004.5625,
"completions/min_length": 2190.0,
"completions/min_terminated_length": 2190.0,
"entropy": 0.10588292591273785,
"epoch": 0.0006,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2579617500305176,
"kl": 0.0006010868073644815,
"learning_rate": 3.2e-06,
"loss": -0.0555,
"num_tokens": 4160851.0,
"reward": -0.43845027685165405,
"reward_std": 0.39296823740005493,
"rewards/rollout_reward_func/mean": -0.43845027685165405,
"rewards/rollout_reward_func/std": 0.41991615295410156,
"sampling/importance_sampling_ratio/max": 1.5303796529769897,
"sampling/importance_sampling_ratio/mean": 0.9994367361068726,
"sampling/importance_sampling_ratio/min": 0.4505850076675415,
"sampling/sampling_logp_difference/max": 0.7972085475921631,
"sampling/sampling_logp_difference/mean": 0.010887149721384048,
"step": 15,
"step_time": 94.16652587299814
},
{
"clip_ratio/high_max": 0.005843322374857962,
"clip_ratio/high_mean": 0.002921661187428981,
"clip_ratio/low_mean": 0.001611912128282711,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004533573315711692,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9094.0,
"completions/max_terminated_length": 9094.0,
"completions/mean_length": 7290.71875,
"completions/mean_terminated_length": 7290.71875,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"entropy": 0.0855347712058574,
"epoch": 0.00064,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.503034234046936,
"kl": 0.0009320054400632216,
"learning_rate": 3.428571428571428e-06,
"loss": -0.2363,
"num_tokens": 4421376.0,
"reward": -0.6369705200195312,
"reward_std": 0.26649945974349976,
"rewards/rollout_reward_func/mean": -0.6369705200195312,
"rewards/rollout_reward_func/std": 0.2727498710155487,
"sampling/importance_sampling_ratio/max": 2.165536403656006,
"sampling/importance_sampling_ratio/mean": 1.0002843141555786,
"sampling/importance_sampling_ratio/min": 0.2399691492319107,
"sampling/sampling_logp_difference/max": 1.4272449016571045,
"sampling/sampling_logp_difference/mean": 0.00943131186068058,
"step": 16,
"step_time": 91.92633632400157
},
{
"clip_ratio/high_max": 0.0058416714309714735,
"clip_ratio/high_mean": 0.0029208357154857367,
"clip_ratio/low_mean": 0.0018002486322075129,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00472108434769325,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9424.0,
"completions/max_terminated_length": 9424.0,
"completions/mean_length": 7896.0,
"completions/mean_terminated_length": 7896.0,
"completions/min_length": 2734.0,
"completions/min_terminated_length": 2734.0,
"entropy": 0.08829498197883368,
"epoch": 0.00068,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.409907341003418,
"kl": 0.0010910468854490318,
"learning_rate": 3.657142857142857e-06,
"loss": -0.0325,
"num_tokens": 4700950.0,
"reward": -0.4402346909046173,
"reward_std": 0.4639027416706085,
"rewards/rollout_reward_func/mean": -0.4402346909046173,
"rewards/rollout_reward_func/std": 0.49393191933631897,
"sampling/importance_sampling_ratio/max": 2.308764696121216,
"sampling/importance_sampling_ratio/mean": 0.999572217464447,
"sampling/importance_sampling_ratio/min": 0.42815110087394714,
"sampling/sampling_logp_difference/max": 0.8482791185379028,
"sampling/sampling_logp_difference/mean": 0.010221365839242935,
"step": 17,
"step_time": 94.6907513449978
},
{
"clip_ratio/high_max": 0.005707252013962716,
"clip_ratio/high_mean": 0.0032087396539282054,
"clip_ratio/low_mean": 0.0003676470660138875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003576386719942093,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9244.0,
"completions/max_terminated_length": 9244.0,
"completions/mean_length": 8353.9375,
"completions/mean_terminated_length": 8353.9375,
"completions/min_length": 3917.0,
"completions/min_terminated_length": 3917.0,
"entropy": 0.09095717361196876,
"epoch": 0.00072,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5224615335464478,
"kl": 0.0014694390793010825,
"learning_rate": 3.885714285714286e-06,
"loss": -0.0395,
"num_tokens": 4995416.0,
"reward": -0.4383149743080139,
"reward_std": 0.2706039547920227,
"rewards/rollout_reward_func/mean": -0.4383149743080139,
"rewards/rollout_reward_func/std": 0.32969218492507935,
"sampling/importance_sampling_ratio/max": 2.5134615898132324,
"sampling/importance_sampling_ratio/mean": 1.0007150173187256,
"sampling/importance_sampling_ratio/min": 0.5002727508544922,
"sampling/sampling_logp_difference/max": 0.9216609001159668,
"sampling/sampling_logp_difference/mean": 0.010718216188251972,
"step": 18,
"step_time": 97.70541409099951
},
{
"clip_ratio/high_max": 0.01007883029524237,
"clip_ratio/high_mean": 0.00613380636787042,
"clip_ratio/low_mean": 0.002054276497801766,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008188082865672186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9502.0,
"completions/max_terminated_length": 9502.0,
"completions/mean_length": 7494.6875,
"completions/mean_terminated_length": 7494.6875,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"entropy": 0.1125592899043113,
"epoch": 0.00076,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3465840816497803,
"kl": 0.001035056316140981,
"learning_rate": 4.114285714285714e-06,
"loss": -0.3483,
"num_tokens": 5262287.0,
"reward": -0.5773377418518066,
"reward_std": 0.2989773154258728,
"rewards/rollout_reward_func/mean": -0.5773377418518066,
"rewards/rollout_reward_func/std": 0.3251783847808838,
"sampling/importance_sampling_ratio/max": 1.4896283149719238,
"sampling/importance_sampling_ratio/mean": 0.998917281627655,
"sampling/importance_sampling_ratio/min": 0.4792267382144928,
"sampling/sampling_logp_difference/max": 0.7355813980102539,
"sampling/sampling_logp_difference/mean": 0.010768642649054527,
"step": 19,
"step_time": 92.93958369900156
},
{
"clip_ratio/high_max": 0.006031187484040856,
"clip_ratio/high_mean": 0.0033707073889672756,
"clip_ratio/low_mean": 0.002600287349196151,
"clip_ratio/low_min": 0.0014124744920991361,
"clip_ratio/region_mean": 0.0059709947381634265,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9373.0,
"completions/max_terminated_length": 9373.0,
"completions/mean_length": 7589.03125,
"completions/mean_terminated_length": 7589.03125,
"completions/min_length": 1505.0,
"completions/min_terminated_length": 1505.0,
"entropy": 0.0929885795339942,
"epoch": 0.0008,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3409026861190796,
"kl": 0.0012185092678009823,
"learning_rate": 4.342857142857142e-06,
"loss": -0.1635,
"num_tokens": 5532862.0,
"reward": -0.38000091910362244,
"reward_std": 0.5382703542709351,
"rewards/rollout_reward_func/mean": -0.38000091910362244,
"rewards/rollout_reward_func/std": 0.5696430206298828,
"sampling/importance_sampling_ratio/max": 1.7224009037017822,
"sampling/importance_sampling_ratio/mean": 0.9997408390045166,
"sampling/importance_sampling_ratio/min": 0.43135055899620056,
"sampling/sampling_logp_difference/max": 0.8408341407775879,
"sampling/sampling_logp_difference/mean": 0.010231327265501022,
"step": 20,
"step_time": 93.35555667800236
},
{
"clip_ratio/high_max": 0.00801744224736467,
"clip_ratio/high_mean": 0.004008721123682335,
"clip_ratio/low_mean": 0.0037498018937185407,
"clip_ratio/low_min": 0.0007102272938936949,
"clip_ratio/region_mean": 0.007758523075608537,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9290.0,
"completions/max_terminated_length": 9290.0,
"completions/mean_length": 8283.6875,
"completions/mean_terminated_length": 8283.6875,
"completions/min_length": 5247.0,
"completions/min_terminated_length": 5247.0,
"entropy": 0.10184192517772317,
"epoch": 0.00084,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.558704137802124,
"kl": 0.0017424946527171414,
"learning_rate": 4.571428571428571e-06,
"loss": -0.0122,
"num_tokens": 5825083.0,
"reward": -0.4749671220779419,
"reward_std": 0.3252595067024231,
"rewards/rollout_reward_func/mean": -0.4749671220779419,
"rewards/rollout_reward_func/std": 0.3718957304954529,
"sampling/importance_sampling_ratio/max": 1.6438056230545044,
"sampling/importance_sampling_ratio/mean": 0.9992358684539795,
"sampling/importance_sampling_ratio/min": 0.4867466688156128,
"sampling/sampling_logp_difference/max": 0.7200114727020264,
"sampling/sampling_logp_difference/mean": 0.010186174884438515,
"step": 21,
"step_time": 97.14219132500148
},
{
"clip_ratio/high_max": 0.005766152869910002,
"clip_ratio/high_mean": 0.002883076434955001,
"clip_ratio/low_mean": 0.0018179319449700415,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004701008350821212,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9405.0,
"completions/max_terminated_length": 9405.0,
"completions/mean_length": 7702.15625,
"completions/mean_terminated_length": 7702.15625,
"completions/min_length": 2410.0,
"completions/min_terminated_length": 2410.0,
"entropy": 0.10855974955484271,
"epoch": 0.00088,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.825653076171875,
"kl": 0.0012982330545128207,
"learning_rate": 4.8e-06,
"loss": -0.1233,
"num_tokens": 6098628.0,
"reward": -0.37962836027145386,
"reward_std": 0.4688373804092407,
"rewards/rollout_reward_func/mean": -0.37962836027145386,
"rewards/rollout_reward_func/std": 0.47551068663597107,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0009162425994873,
"sampling/importance_sampling_ratio/min": 0.3347569704055786,
"sampling/sampling_logp_difference/max": 1.1153497695922852,
"sampling/sampling_logp_difference/mean": 0.014124426990747452,
"step": 22,
"step_time": 93.53759804500078
},
{
"clip_ratio/high_max": 0.007469919160939753,
"clip_ratio/high_mean": 0.004086083208676428,
"clip_ratio/low_mean": 0.004067586531164125,
"clip_ratio/low_min": 0.0007183908019214869,
"clip_ratio/region_mean": 0.008153669739840552,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9131.0,
"completions/max_terminated_length": 9131.0,
"completions/mean_length": 7552.71875,
"completions/mean_terminated_length": 7552.71875,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.10671317647211254,
"epoch": 0.00092,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3214221000671387,
"kl": 0.0012135217020841083,
"learning_rate": 5.0285714285714285e-06,
"loss": -0.0996,
"num_tokens": 6367409.0,
"reward": -0.3321595788002014,
"reward_std": 0.4558962881565094,
"rewards/rollout_reward_func/mean": -0.3321595788002014,
"rewards/rollout_reward_func/std": 0.47004231810569763,
"sampling/importance_sampling_ratio/max": 1.9132328033447266,
"sampling/importance_sampling_ratio/mean": 1.0005993843078613,
"sampling/importance_sampling_ratio/min": 0.6039242148399353,
"sampling/sampling_logp_difference/max": 0.648794412612915,
"sampling/sampling_logp_difference/mean": 0.01185811311006546,
"step": 23,
"step_time": 91.04886297199937
},
{
"clip_ratio/high_max": 0.005706682160962373,
"clip_ratio/high_mean": 0.0032298470905516297,
"clip_ratio/low_mean": 0.004950174450641498,
"clip_ratio/low_min": 0.0007183908019214869,
"clip_ratio/region_mean": 0.008180021541193128,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9173.0,
"completions/max_terminated_length": 9173.0,
"completions/mean_length": 7576.0,
"completions/mean_terminated_length": 7576.0,
"completions/min_length": 889.0,
"completions/min_terminated_length": 889.0,
"entropy": 0.08463638043031096,
"epoch": 0.00096,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1648985147476196,
"kl": 0.0009409518825123087,
"learning_rate": 5.257142857142857e-06,
"loss": -0.2373,
"num_tokens": 6637311.0,
"reward": -0.5275511741638184,
"reward_std": 0.3658484220504761,
"rewards/rollout_reward_func/mean": -0.5275511741638184,
"rewards/rollout_reward_func/std": 0.35727164149284363,
"sampling/importance_sampling_ratio/max": 2.0391674041748047,
"sampling/importance_sampling_ratio/mean": 0.9986543655395508,
"sampling/importance_sampling_ratio/min": 0.4036279618740082,
"sampling/sampling_logp_difference/max": 0.9072617292404175,
"sampling/sampling_logp_difference/mean": 0.009868521243333817,
"step": 24,
"step_time": 93.86759965099827
},
{
"clip_ratio/high_max": 0.008623368688859046,
"clip_ratio/high_mean": 0.00466679799137637,
"clip_ratio/low_mean": 0.0014336399617604911,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006100437924033031,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8925.0,
"completions/max_terminated_length": 8925.0,
"completions/mean_length": 7419.5625,
"completions/mean_terminated_length": 7419.5625,
"completions/min_length": 395.0,
"completions/min_terminated_length": 395.0,
"entropy": 0.10826450167223811,
"epoch": 0.001,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5118001699447632,
"kl": 0.0012769073209710768,
"learning_rate": 5.485714285714286e-06,
"loss": -0.1327,
"num_tokens": 6901616.0,
"reward": -0.46326500177383423,
"reward_std": 0.30533546209335327,
"rewards/rollout_reward_func/mean": -0.46326500177383423,
"rewards/rollout_reward_func/std": 0.44885945320129395,
"sampling/importance_sampling_ratio/max": 1.874658226966858,
"sampling/importance_sampling_ratio/mean": 0.9994730949401855,
"sampling/importance_sampling_ratio/min": 0.25686314702033997,
"sampling/sampling_logp_difference/max": 1.359211802482605,
"sampling/sampling_logp_difference/mean": 0.012570840306580067,
"step": 25,
"step_time": 90.51520027400147
},
{
"clip_ratio/high_max": 0.00718123564729467,
"clip_ratio/high_mean": 0.003590617823647335,
"clip_ratio/low_mean": 0.003063533455133438,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006654151278780773,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9233.0,
"completions/max_terminated_length": 9233.0,
"completions/mean_length": 7398.21875,
"completions/mean_terminated_length": 7398.21875,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"entropy": 0.10546251107007265,
"epoch": 0.00104,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.260907769203186,
"kl": 0.0011700783816195326,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.1799,
"num_tokens": 7165128.0,
"reward": -0.43886110186576843,
"reward_std": 0.43460309505462646,
"rewards/rollout_reward_func/mean": -0.43886110186576843,
"rewards/rollout_reward_func/std": 0.5057812929153442,
"sampling/importance_sampling_ratio/max": 1.4404255151748657,
"sampling/importance_sampling_ratio/mean": 0.9987990856170654,
"sampling/importance_sampling_ratio/min": 0.42815157771110535,
"sampling/sampling_logp_difference/max": 0.8482780456542969,
"sampling/sampling_logp_difference/mean": 0.010892972350120544,
"step": 26,
"step_time": 92.59128134799994
},
{
"clip_ratio/high_max": 0.0057237689034081995,
"clip_ratio/high_mean": 0.0028618844517040998,
"clip_ratio/low_mean": 0.004686754720751196,
"clip_ratio/low_min": 0.0010416667209938169,
"clip_ratio/region_mean": 0.007548639201559126,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9370.0,
"completions/max_terminated_length": 9370.0,
"completions/mean_length": 8178.8125,
"completions/mean_terminated_length": 8178.8125,
"completions/min_length": 1876.0,
"completions/min_terminated_length": 1876.0,
"entropy": 0.08889654604718089,
"epoch": 0.00108,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5473191738128662,
"kl": 0.0017560607998348132,
"learning_rate": 5.942857142857143e-06,
"loss": -0.1515,
"num_tokens": 7453964.0,
"reward": -0.38067591190338135,
"reward_std": 0.30481967329978943,
"rewards/rollout_reward_func/mean": -0.38067591190338135,
"rewards/rollout_reward_func/std": 0.35617271065711975,
"sampling/importance_sampling_ratio/max": 1.6653558015823364,
"sampling/importance_sampling_ratio/mean": 0.9998430013656616,
"sampling/importance_sampling_ratio/min": 0.4598047733306885,
"sampling/sampling_logp_difference/max": 0.7769533395767212,
"sampling/sampling_logp_difference/mean": 0.009767385199666023,
"step": 27,
"step_time": 92.73414052499811
},
{
"clip_ratio/high_max": 0.006360859319102019,
"clip_ratio/high_mean": 0.0031804296595510095,
"clip_ratio/low_mean": 0.0023903494293335825,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005570779088884592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9179.0,
"completions/max_terminated_length": 9179.0,
"completions/mean_length": 7466.21875,
"completions/mean_terminated_length": 7466.21875,
"completions/min_length": 873.0,
"completions/min_terminated_length": 873.0,
"entropy": 0.09552860073745251,
"epoch": 0.00112,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2436789274215698,
"kl": 0.0010375778874731623,
"learning_rate": 6.171428571428571e-06,
"loss": -0.2215,
"num_tokens": 7719394.0,
"reward": -0.5598432421684265,
"reward_std": 0.38949286937713623,
"rewards/rollout_reward_func/mean": -0.5598432421684265,
"rewards/rollout_reward_func/std": 0.3775184154510498,
"sampling/importance_sampling_ratio/max": 1.9373853206634521,
"sampling/importance_sampling_ratio/mean": 1.0011639595031738,
"sampling/importance_sampling_ratio/min": 0.48319393396377563,
"sampling/sampling_logp_difference/max": 0.7273372411727905,
"sampling/sampling_logp_difference/mean": 0.010366151109337807,
"step": 28,
"step_time": 92.60433979299978
},
{
"clip_ratio/high_max": 0.007077141373883933,
"clip_ratio/high_mean": 0.0035385706869419664,
"clip_ratio/low_mean": 0.0018004352750722319,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005339005962014198,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9004.0,
"completions/max_terminated_length": 9004.0,
"completions/mean_length": 8326.46875,
"completions/mean_terminated_length": 8326.46875,
"completions/min_length": 5302.0,
"completions/min_terminated_length": 5302.0,
"entropy": 0.10070427041500807,
"epoch": 0.00116,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5207918882369995,
"kl": 0.0014244818194129039,
"learning_rate": 6.4e-06,
"loss": 0.0112,
"num_tokens": 8013061.0,
"reward": -0.39570820331573486,
"reward_std": 0.3965410888195038,
"rewards/rollout_reward_func/mean": -0.39570820331573486,
"rewards/rollout_reward_func/std": 0.45599400997161865,
"sampling/importance_sampling_ratio/max": 1.5399894714355469,
"sampling/importance_sampling_ratio/mean": 0.9994006156921387,
"sampling/importance_sampling_ratio/min": 0.6555483937263489,
"sampling/sampling_logp_difference/max": 0.4317755699157715,
"sampling/sampling_logp_difference/mean": 0.00899563729763031,
"step": 29,
"step_time": 96.32543996599998
},
{
"clip_ratio/high_max": 0.004940582090057433,
"clip_ratio/high_mean": 0.0024702910450287163,
"clip_ratio/low_mean": 0.0014245363418012857,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003894827386830002,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9184.0,
"completions/max_terminated_length": 9184.0,
"completions/mean_length": 7472.78125,
"completions/mean_terminated_length": 7472.78125,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.0843133123125881,
"epoch": 0.0012,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9470192193984985,
"kl": 0.0012731589044960856,
"learning_rate": 6.628571428571428e-06,
"loss": -0.2329,
"num_tokens": 8278694.0,
"reward": -0.4546031951904297,
"reward_std": 0.37902095913887024,
"rewards/rollout_reward_func/mean": -0.4546031951904297,
"rewards/rollout_reward_func/std": 0.3981786370277405,
"sampling/importance_sampling_ratio/max": 2.6150405406951904,
"sampling/importance_sampling_ratio/mean": 1.0001695156097412,
"sampling/importance_sampling_ratio/min": 0.45047178864479065,
"sampling/sampling_logp_difference/max": 0.9612796306610107,
"sampling/sampling_logp_difference/mean": 0.009561501443386078,
"step": 30,
"step_time": 91.3648072499991
},
{
"clip_ratio/high_max": 0.007095419394318014,
"clip_ratio/high_mean": 0.003894931956892833,
"clip_ratio/low_mean": 0.0019353094103280455,
"clip_ratio/low_min": 0.000735294132027775,
"clip_ratio/region_mean": 0.005830241338117048,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9345.0,
"completions/max_terminated_length": 9345.0,
"completions/mean_length": 7978.6875,
"completions/mean_terminated_length": 7978.6875,
"completions/min_length": 1496.0,
"completions/min_terminated_length": 1496.0,
"entropy": 0.12459231936372817,
"epoch": 0.00124,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6051055192947388,
"kl": 0.001607223342944053,
"learning_rate": 6.857142857142856e-06,
"loss": -0.0711,
"num_tokens": 8561291.0,
"reward": -0.4515855014324188,
"reward_std": 0.3598511219024658,
"rewards/rollout_reward_func/mean": -0.4515855014324188,
"rewards/rollout_reward_func/std": 0.40270760655403137,
"sampling/importance_sampling_ratio/max": 1.798833966255188,
"sampling/importance_sampling_ratio/mean": 1.0015867948532104,
"sampling/importance_sampling_ratio/min": 0.6142684817314148,
"sampling/sampling_logp_difference/max": 0.5871386528015137,
"sampling/sampling_logp_difference/mean": 0.012068906798958778,
"step": 31,
"step_time": 94.2069850859998
},
{
"clip_ratio/high_max": 0.0036044081789441407,
"clip_ratio/high_mean": 0.0018022040894720703,
"clip_ratio/low_mean": 0.004805904318345711,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006608108436921611,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9188.0,
"completions/max_terminated_length": 9188.0,
"completions/mean_length": 7920.375,
"completions/mean_terminated_length": 7920.375,
"completions/min_length": 1912.0,
"completions/min_terminated_length": 1912.0,
"entropy": 0.10130815836600959,
"epoch": 0.00128,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1719034910202026,
"kl": 0.0014112608332652599,
"learning_rate": 7.085714285714285e-06,
"loss": -0.1936,
"num_tokens": 8841736.0,
"reward": -0.4756479263305664,
"reward_std": 0.3400876522064209,
"rewards/rollout_reward_func/mean": -0.4756479263305664,
"rewards/rollout_reward_func/std": 0.39079946279525757,
"sampling/importance_sampling_ratio/max": 1.9659785032272339,
"sampling/importance_sampling_ratio/mean": 1.0011847019195557,
"sampling/importance_sampling_ratio/min": 0.5454869270324707,
"sampling/sampling_logp_difference/max": 0.675990104675293,
"sampling/sampling_logp_difference/mean": 0.010147813707590103,
"step": 32,
"step_time": 94.26557795399913
},
{
"clip_ratio/high_max": 0.00641977513441816,
"clip_ratio/high_mean": 0.003561011195415631,
"clip_ratio/low_mean": 0.002905456320149824,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006466467515565455,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9143.0,
"completions/max_terminated_length": 9143.0,
"completions/mean_length": 8356.90625,
"completions/mean_terminated_length": 8356.90625,
"completions/min_length": 7207.0,
"completions/min_terminated_length": 7207.0,
"entropy": 0.10661770938895643,
"epoch": 0.00132,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5199352502822876,
"kl": 0.0017544178053867654,
"learning_rate": 7.314285714285714e-06,
"loss": -0.0041,
"num_tokens": 9136084.0,
"reward": -0.4854487180709839,
"reward_std": 0.3774080276489258,
"rewards/rollout_reward_func/mean": -0.4854487180709839,
"rewards/rollout_reward_func/std": 0.4189002811908722,
"sampling/importance_sampling_ratio/max": 2.0858850479125977,
"sampling/importance_sampling_ratio/mean": 1.0001283884048462,
"sampling/importance_sampling_ratio/min": 0.5391538143157959,
"sampling/sampling_logp_difference/max": 0.7351932525634766,
"sampling/sampling_logp_difference/mean": 0.012622365728020668,
"step": 33,
"step_time": 98.72545758600063
},
{
"clip_ratio/high_max": 0.006451850291341543,
"clip_ratio/high_mean": 0.0032259251456707716,
"clip_ratio/low_mean": 0.0043309712782502174,
"clip_ratio/low_min": 0.0007267441833391786,
"clip_ratio/region_mean": 0.0075568964530248195,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9138.0,
"completions/max_terminated_length": 9138.0,
"completions/mean_length": 7346.3125,
"completions/mean_terminated_length": 7346.3125,
"completions/min_length": 653.0,
"completions/min_terminated_length": 653.0,
"entropy": 0.09478296199813485,
"epoch": 0.00136,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5919150114059448,
"kl": 0.0016131439460878028,
"learning_rate": 7.542857142857142e-06,
"loss": -0.159,
"num_tokens": 9397980.0,
"reward": -0.44115152955055237,
"reward_std": 0.431715190410614,
"rewards/rollout_reward_func/mean": -0.44115152955055237,
"rewards/rollout_reward_func/std": 0.45346444845199585,
"sampling/importance_sampling_ratio/max": 2.316176414489746,
"sampling/importance_sampling_ratio/mean": 1.0013811588287354,
"sampling/importance_sampling_ratio/min": 0.4872380495071411,
"sampling/sampling_logp_difference/max": 0.8399176597595215,
"sampling/sampling_logp_difference/mean": 0.010870829224586487,
"step": 34,
"step_time": 94.50844769800096
},
{
"clip_ratio/high_max": 0.00862313958350569,
"clip_ratio/high_mean": 0.004311569791752845,
"clip_ratio/low_mean": 0.0021354027558118105,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0064469725475646555,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8829.0,
"completions/max_terminated_length": 8829.0,
"completions/mean_length": 6773.25,
"completions/mean_terminated_length": 6773.25,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.09439847921021283,
"epoch": 0.0014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2422112226486206,
"kl": 0.0016963164634944405,
"learning_rate": 7.771428571428572e-06,
"loss": -0.2231,
"num_tokens": 9641472.0,
"reward": -0.5031548142433167,
"reward_std": 0.4589303135871887,
"rewards/rollout_reward_func/mean": -0.5031548142433167,
"rewards/rollout_reward_func/std": 0.46338412165641785,
"sampling/importance_sampling_ratio/max": 1.4338102340698242,
"sampling/importance_sampling_ratio/mean": 0.9990464448928833,
"sampling/importance_sampling_ratio/min": 0.5403674840927124,
"sampling/sampling_logp_difference/max": 0.6155059337615967,
"sampling/sampling_logp_difference/mean": 0.00977895874530077,
"step": 35,
"step_time": 92.36479951699857
},
{
"clip_ratio/high_max": 0.007964848773553967,
"clip_ratio/high_mean": 0.004350071423687041,
"clip_ratio/low_mean": 0.002024611836532131,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006374683260219172,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9254.0,
"completions/max_terminated_length": 9254.0,
"completions/mean_length": 7325.6875,
"completions/mean_terminated_length": 7325.6875,
"completions/min_length": 418.0,
"completions/min_terminated_length": 418.0,
"entropy": 0.10633090999908745,
"epoch": 0.00144,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1656460762023926,
"kl": 0.0023267651195055805,
"learning_rate": 8e-06,
"loss": -0.275,
"num_tokens": 9902557.0,
"reward": -0.4935969114303589,
"reward_std": 0.3179719150066376,
"rewards/rollout_reward_func/mean": -0.4935969114303589,
"rewards/rollout_reward_func/std": 0.473392516374588,
"sampling/importance_sampling_ratio/max": 1.6787704229354858,
"sampling/importance_sampling_ratio/mean": 1.0006248950958252,
"sampling/importance_sampling_ratio/min": 0.6142685413360596,
"sampling/sampling_logp_difference/max": 0.518061637878418,
"sampling/sampling_logp_difference/mean": 0.011818873696029186,
"step": 36,
"step_time": 88.57236199499948
},
{
"clip_ratio/high_max": 0.006417479307856411,
"clip_ratio/high_mean": 0.0032087396539282054,
"clip_ratio/low_mean": 0.0007312192174140364,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003939958871342242,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9355.0,
"completions/max_terminated_length": 9355.0,
"completions/mean_length": 7663.3125,
"completions/mean_terminated_length": 7663.3125,
"completions/min_length": 847.0,
"completions/min_terminated_length": 847.0,
"entropy": 0.08696980169042945,
"epoch": 0.00148,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.192537546157837,
"kl": 0.002410084724033368,
"learning_rate": 7.999888877348797e-06,
"loss": -0.2529,
"num_tokens": 10174044.0,
"reward": -0.5070594549179077,
"reward_std": 0.3443080186843872,
"rewards/rollout_reward_func/mean": -0.5070594549179077,
"rewards/rollout_reward_func/std": 0.4040925204753876,
"sampling/importance_sampling_ratio/max": 1.9132274389266968,
"sampling/importance_sampling_ratio/mean": 1.0009825229644775,
"sampling/importance_sampling_ratio/min": 0.598889946937561,
"sampling/sampling_logp_difference/max": 0.6487915515899658,
"sampling/sampling_logp_difference/mean": 0.009411752223968506,
"step": 37,
"step_time": 93.29507449899847
},
{
"clip_ratio/high_max": 0.002155932132154703,
"clip_ratio/high_mean": 0.001437161467038095,
"clip_ratio/low_mean": 0.0017123925790656358,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003149554046103731,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9064.0,
"completions/max_terminated_length": 9064.0,
"completions/mean_length": 7980.78125,
"completions/mean_terminated_length": 7980.78125,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"entropy": 0.09132296103052795,
"epoch": 0.00152,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5600956678390503,
"kl": 0.0021186837166169425,
"learning_rate": 7.999555517627349e-06,
"loss": -0.0928,
"num_tokens": 10455930.0,
"reward": -0.37076419591903687,
"reward_std": 0.42336732149124146,
"rewards/rollout_reward_func/mean": -0.37076419591903687,
"rewards/rollout_reward_func/std": 0.44395267963409424,
"sampling/importance_sampling_ratio/max": 1.7510610818862915,
"sampling/importance_sampling_ratio/mean": 0.999518871307373,
"sampling/importance_sampling_ratio/min": 0.5154094696044922,
"sampling/sampling_logp_difference/max": 0.6627936363220215,
"sampling/sampling_logp_difference/mean": 0.00994320772588253,
"step": 38,
"step_time": 94.34694778699941
},
{
"clip_ratio/high_max": 0.0057840069639496505,
"clip_ratio/high_mean": 0.0032431271101813763,
"clip_ratio/low_mean": 0.002503328782040626,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005746455892222002,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9161.0,
"completions/max_terminated_length": 9161.0,
"completions/mean_length": 7555.75,
"completions/mean_terminated_length": 7555.75,
"completions/min_length": 1935.0,
"completions/min_terminated_length": 1935.0,
"entropy": 0.10114077711477876,
"epoch": 0.00156,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3555232286453247,
"kl": 0.004268854574547731,
"learning_rate": 7.998999945531534e-06,
"loss": -0.0645,
"num_tokens": 10724635.0,
"reward": -0.35600388050079346,
"reward_std": 0.4358783960342407,
"rewards/rollout_reward_func/mean": -0.35600388050079346,
"rewards/rollout_reward_func/std": 0.5173202753067017,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.0001956224441528,
"sampling/importance_sampling_ratio/min": 0.5086521506309509,
"sampling/sampling_logp_difference/max": 1.204916000366211,
"sampling/sampling_logp_difference/mean": 0.012541755102574825,
"step": 39,
"step_time": 92.03758510300031
},
{
"clip_ratio/high_max": 0.006485855847131461,
"clip_ratio/high_mean": 0.003962173970649019,
"clip_ratio/low_mean": 0.0059998030483257025,
"clip_ratio/low_min": 0.0007183908019214869,
"clip_ratio/region_mean": 0.009961976989870891,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9313.0,
"completions/max_terminated_length": 9313.0,
"completions/mean_length": 8137.46875,
"completions/mean_terminated_length": 8137.46875,
"completions/min_length": 1552.0,
"completions/min_terminated_length": 1552.0,
"entropy": 0.10354270855896175,
"epoch": 0.0016,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2437281608581543,
"kl": 0.004692138581958716,
"learning_rate": 7.998222202219114e-06,
"loss": -0.1094,
"num_tokens": 11011586.0,
"reward": -0.476349800825119,
"reward_std": 0.2979031801223755,
"rewards/rollout_reward_func/mean": -0.476349800825119,
"rewards/rollout_reward_func/std": 0.29430100321769714,
"sampling/importance_sampling_ratio/max": 1.9385071992874146,
"sampling/importance_sampling_ratio/mean": 1.000066876411438,
"sampling/importance_sampling_ratio/min": 0.3966410756111145,
"sampling/sampling_logp_difference/max": 0.9247235059738159,
"sampling/sampling_logp_difference/mean": 0.011882856488227844,
"step": 40,
"step_time": 95.45463361300062
},
{
"clip_ratio/high_max": 0.005031014792621136,
"clip_ratio/high_mean": 0.002874702855478972,
"clip_ratio/low_mean": 0.0025365093897562474,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005411212187027559,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9126.0,
"completions/max_terminated_length": 9126.0,
"completions/mean_length": 7551.875,
"completions/mean_terminated_length": 7551.875,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.10094616864807904,
"epoch": 0.00164,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2249935865402222,
"kl": 0.004580019838613225,
"learning_rate": 7.99722234530669e-06,
"loss": -0.1866,
"num_tokens": 11279946.0,
"reward": -0.509591817855835,
"reward_std": 0.4151650369167328,
"rewards/rollout_reward_func/mean": -0.509591817855835,
"rewards/rollout_reward_func/std": 0.47652074694633484,
"sampling/importance_sampling_ratio/max": 1.552679181098938,
"sampling/importance_sampling_ratio/mean": 0.9984216690063477,
"sampling/importance_sampling_ratio/min": 0.41414451599121094,
"sampling/sampling_logp_difference/max": 0.8815402984619141,
"sampling/sampling_logp_difference/mean": 0.009862950071692467,
"step": 41,
"step_time": 92.74472697800138
},
{
"clip_ratio/high_max": 0.00608739914605394,
"clip_ratio/high_mean": 0.0034028949739877135,
"clip_ratio/low_mean": 0.0022560179349966347,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005658912938088179,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9129.0,
"completions/max_terminated_length": 9129.0,
"completions/mean_length": 7584.25,
"completions/mean_terminated_length": 7584.25,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"entropy": 0.1042005839990452,
"epoch": 0.00168,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3942543268203735,
"kl": 0.004878379874753591,
"learning_rate": 7.996000448865428e-06,
"loss": -0.1046,
"num_tokens": 11549745.0,
"reward": -0.33219966292381287,
"reward_std": 0.51850426197052,
"rewards/rollout_reward_func/mean": -0.33219966292381287,
"rewards/rollout_reward_func/std": 0.5476008653640747,
"sampling/importance_sampling_ratio/max": 1.732586145401001,
"sampling/importance_sampling_ratio/mean": 0.999406099319458,
"sampling/importance_sampling_ratio/min": 0.5086521506309509,
"sampling/sampling_logp_difference/max": 0.6759908199310303,
"sampling/sampling_logp_difference/mean": 0.011652151122689247,
"step": 42,
"step_time": 92.72314244600057
},
{
"clip_ratio/high_max": 0.0075053395121358335,
"clip_ratio/high_mean": 0.0037526697560679168,
"clip_ratio/low_mean": 0.003480347600998357,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007233017327962443,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9100.0,
"completions/max_terminated_length": 9100.0,
"completions/mean_length": 7485.375,
"completions/mean_terminated_length": 7485.375,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"entropy": 0.09614216513000429,
"epoch": 0.00172,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0772511959075928,
"kl": 0.00416894428053638,
"learning_rate": 7.994556603415576e-06,
"loss": -0.1628,
"num_tokens": 11816091.0,
"reward": -0.42178797721862793,
"reward_std": 0.4303983449935913,
"rewards/rollout_reward_func/mean": -0.42178797721862793,
"rewards/rollout_reward_func/std": 0.4538474678993225,
"sampling/importance_sampling_ratio/max": 1.6209644079208374,
"sampling/importance_sampling_ratio/mean": 1.0004045963287354,
"sampling/importance_sampling_ratio/min": 0.3242146670818329,
"sampling/sampling_logp_difference/max": 1.1263494491577148,
"sampling/sampling_logp_difference/mean": 0.011008251458406448,
"step": 43,
"step_time": 95.21691550700052
},
{
"clip_ratio/high_max": 0.004524656978901476,
"clip_ratio/high_mean": 0.0026095507491845638,
"clip_ratio/low_mean": 0.002854076214134693,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0054636269342154264,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9289.0,
"completions/max_terminated_length": 9289.0,
"completions/mean_length": 7716.84375,
"completions/mean_terminated_length": 7716.84375,
"completions/min_length": 910.0,
"completions/min_terminated_length": 910.0,
"entropy": 0.10308593721129,
"epoch": 0.00176,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0850143432617188,
"kl": 0.008197117564122891,
"learning_rate": 7.992890915919757e-06,
"loss": -0.0659,
"num_tokens": 12089680.0,
"reward": -0.4302936792373657,
"reward_std": 0.48004305362701416,
"rewards/rollout_reward_func/mean": -0.4302936792373657,
"rewards/rollout_reward_func/std": 0.5400975346565247,
"sampling/importance_sampling_ratio/max": 2.6225600242614746,
"sampling/importance_sampling_ratio/mean": 0.9998188018798828,
"sampling/importance_sampling_ratio/min": 0.31274011731147766,
"sampling/sampling_logp_difference/max": 1.16238272190094,
"sampling/sampling_logp_difference/mean": 0.012810271233320236,
"step": 44,
"step_time": 93.50324885599821
},
{
"clip_ratio/high_max": 0.009332069545052946,
"clip_ratio/high_mean": 0.005380530434194952,
"clip_ratio/low_mean": 0.00249813572736457,
"clip_ratio/low_min": 0.0007102272938936949,
"clip_ratio/region_mean": 0.007878666161559522,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9215.0,
"completions/max_terminated_length": 9215.0,
"completions/mean_length": 7574.625,
"completions/mean_terminated_length": 7574.625,
"completions/min_length": 271.0,
"completions/min_terminated_length": 271.0,
"entropy": 0.10209884284995496,
"epoch": 0.0018,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7118793725967407,
"kl": 0.012286211093851307,
"learning_rate": 7.991003509775045e-06,
"loss": -0.2223,
"num_tokens": 12358674.0,
"reward": -0.4782068431377411,
"reward_std": 0.3393690288066864,
"rewards/rollout_reward_func/mean": -0.4782068431377411,
"rewards/rollout_reward_func/std": 0.3350028097629547,
"sampling/importance_sampling_ratio/max": 2.010491371154785,
"sampling/importance_sampling_ratio/mean": 0.998539924621582,
"sampling/importance_sampling_ratio/min": 0.4432297348976135,
"sampling/sampling_logp_difference/max": 0.8136670589447021,
"sampling/sampling_logp_difference/mean": 0.013688696548342705,
"step": 45,
"step_time": 94.50348583800042
},
{
"clip_ratio/high_max": 0.006435320246964693,
"clip_ratio/high_mean": 0.0032176601234823465,
"clip_ratio/low_mean": 0.0035371377016417682,
"clip_ratio/low_min": 0.0014286180958151817,
"clip_ratio/region_mean": 0.006754797854227945,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8981.0,
"completions/max_terminated_length": 8981.0,
"completions/mean_length": 7618.03125,
"completions/mean_terminated_length": 7618.03125,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.12036720756441355,
"epoch": 0.00184,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.262148141860962,
"kl": 0.009586318075889722,
"learning_rate": 7.988894524803824e-06,
"loss": -0.1508,
"num_tokens": 12629063.0,
"reward": -0.47280675172805786,
"reward_std": 0.33904793858528137,
"rewards/rollout_reward_func/mean": -0.47280675172805786,
"rewards/rollout_reward_func/std": 0.44742149114608765,
"sampling/importance_sampling_ratio/max": 1.7463794946670532,
"sampling/importance_sampling_ratio/mean": 0.9998955726623535,
"sampling/importance_sampling_ratio/min": 0.4740566611289978,
"sampling/sampling_logp_difference/max": 0.7464284896850586,
"sampling/sampling_logp_difference/mean": 0.013707821257412434,
"step": 46,
"step_time": 91.94492842599993
},
{
"clip_ratio/high_max": 0.010757386451587081,
"clip_ratio/high_mean": 0.006088920519687235,
"clip_ratio/low_mean": 0.0021593490964733064,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008248269616160542,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9378.0,
"completions/max_terminated_length": 9378.0,
"completions/mean_length": 7946.34375,
"completions/mean_terminated_length": 7946.34375,
"completions/min_length": 882.0,
"completions/min_terminated_length": 882.0,
"entropy": 0.10261929128319025,
"epoch": 0.00188,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4796141386032104,
"kl": 0.007273106846696464,
"learning_rate": 7.986564117243426e-06,
"loss": -0.1533,
"num_tokens": 12909851.0,
"reward": -0.4102572202682495,
"reward_std": 0.2766183316707611,
"rewards/rollout_reward_func/mean": -0.4102572202682495,
"rewards/rollout_reward_func/std": 0.3281300961971283,
"sampling/importance_sampling_ratio/max": 2.5544402599334717,
"sampling/importance_sampling_ratio/mean": 0.9995360374450684,
"sampling/importance_sampling_ratio/min": 0.5378825068473816,
"sampling/sampling_logp_difference/max": 0.9378330707550049,
"sampling/sampling_logp_difference/mean": 0.011439410038292408,
"step": 47,
"step_time": 96.26667236799949
},
{
"clip_ratio/high_max": 0.007107149169314653,
"clip_ratio/high_mean": 0.003900796815287322,
"clip_ratio/low_mean": 0.0016345784824807197,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005535375326871872,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9092.0,
"completions/max_terminated_length": 9092.0,
"completions/mean_length": 7913.40625,
"completions/mean_terminated_length": 7913.40625,
"completions/min_length": 2022.0,
"completions/min_terminated_length": 2022.0,
"entropy": 0.1097751297056675,
"epoch": 0.00192,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.754341721534729,
"kl": 0.009378650134749478,
"learning_rate": 7.984012459734564e-06,
"loss": 0.0673,
"num_tokens": 13190202.0,
"reward": -0.3472431004047394,
"reward_std": 0.5651601552963257,
"rewards/rollout_reward_func/mean": -0.3472431004047394,
"rewards/rollout_reward_func/std": 0.5896835327148438,
"sampling/importance_sampling_ratio/max": 2.230642080307007,
"sampling/importance_sampling_ratio/mean": 1.0000313520431519,
"sampling/importance_sampling_ratio/min": 0.41313982009887695,
"sampling/sampling_logp_difference/max": 0.8839691877365112,
"sampling/sampling_logp_difference/mean": 0.011971874162554741,
"step": 48,
"step_time": 93.4559626040018
},
{
"clip_ratio/high_max": 0.006600149557925761,
"clip_ratio/high_mean": 0.004353445663582534,
"clip_ratio/low_mean": 0.0038080242229625583,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0081614697992336,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9282.0,
"completions/max_terminated_length": 9282.0,
"completions/mean_length": 7946.375,
"completions/mean_terminated_length": 7946.375,
"completions/min_length": 2384.0,
"completions/min_terminated_length": 2384.0,
"entropy": 0.09944085357710719,
"epoch": 0.00196,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2742760181427,
"kl": 0.026823249965673313,
"learning_rate": 7.981239741308533e-06,
"loss": -0.1137,
"num_tokens": 13470689.0,
"reward": -0.4703686833381653,
"reward_std": 0.2635863721370697,
"rewards/rollout_reward_func/mean": -0.4703686833381653,
"rewards/rollout_reward_func/std": 0.3776912987232208,
"sampling/importance_sampling_ratio/max": 1.6952903270721436,
"sampling/importance_sampling_ratio/mean": 0.9991170167922974,
"sampling/importance_sampling_ratio/min": 0.13879217207431793,
"sampling/sampling_logp_difference/max": 1.9747775793075562,
"sampling/sampling_logp_difference/mean": 0.012732356786727905,
"step": 49,
"step_time": 99.44212867799979
},
{
"clip_ratio/high_max": 0.009582321741618216,
"clip_ratio/high_mean": 0.0051462745177559555,
"clip_ratio/low_mean": 0.0014256636786740273,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006571938167326152,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9426.0,
"completions/max_terminated_length": 9426.0,
"completions/mean_length": 8105.5625,
"completions/mean_terminated_length": 8105.5625,
"completions/min_length": 5043.0,
"completions/min_terminated_length": 5043.0,
"entropy": 0.10958141228184104,
"epoch": 0.002,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5589848756790161,
"kl": 0.010070401827761088,
"learning_rate": 7.97824616737322e-06,
"loss": 0.0316,
"num_tokens": 13757071.0,
"reward": -0.37979596853256226,
"reward_std": 0.40848225355148315,
"rewards/rollout_reward_func/mean": -0.37979596853256226,
"rewards/rollout_reward_func/std": 0.4637466371059418,
"sampling/importance_sampling_ratio/max": 1.9868782758712769,
"sampling/importance_sampling_ratio/mean": 0.9999864101409912,
"sampling/importance_sampling_ratio/min": 0.5031964778900146,
"sampling/sampling_logp_difference/max": 0.6867746114730835,
"sampling/sampling_logp_difference/mean": 0.012691027484834194,
"step": 50,
"step_time": 97.8083335750016
},
{
"clip_ratio/high_max": 0.003072801686357707,
"clip_ratio/high_mean": 0.0015364008431788534,
"clip_ratio/low_mean": 0.005726226721890271,
"clip_ratio/low_min": 0.0007102272938936949,
"clip_ratio/region_mean": 0.007262627565069124,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9171.0,
"completions/max_terminated_length": 9171.0,
"completions/mean_length": 6720.09375,
"completions/mean_terminated_length": 6720.09375,
"completions/min_length": 279.0,
"completions/min_terminated_length": 279.0,
"entropy": 0.13046934758313,
"epoch": 0.00204,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.509793758392334,
"kl": 0.007366802208707668,
"learning_rate": 7.975031959697869e-06,
"loss": -0.3095,
"num_tokens": 13998648.0,
"reward": -0.5631532073020935,
"reward_std": 0.45109108090400696,
"rewards/rollout_reward_func/mean": -0.5631532073020935,
"rewards/rollout_reward_func/std": 0.5162220001220703,
"sampling/importance_sampling_ratio/max": 2.486898422241211,
"sampling/importance_sampling_ratio/mean": 1.0022380352020264,
"sampling/importance_sampling_ratio/min": 0.483195036649704,
"sampling/sampling_logp_difference/max": 0.9110362529754639,
"sampling/sampling_logp_difference/mean": 0.013770891353487968,
"step": 51,
"step_time": 90.76845354700072
},
{
"clip_ratio/high_max": 0.006709904468152672,
"clip_ratio/high_mean": 0.003354952234076336,
"clip_ratio/low_mean": 0.002561694651376456,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005916646885452792,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9172.0,
"completions/max_terminated_length": 9172.0,
"completions/mean_length": 6978.65625,
"completions/mean_terminated_length": 6978.65625,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"entropy": 0.09509467124007642,
"epoch": 0.00208,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6503801345825195,
"kl": 0.009442919392313343,
"learning_rate": 7.971597356396667e-06,
"loss": -0.1007,
"num_tokens": 14248771.0,
"reward": -0.2623698115348816,
"reward_std": 0.4206399917602539,
"rewards/rollout_reward_func/mean": -0.2623698115348816,
"rewards/rollout_reward_func/std": 0.599825382232666,
"sampling/importance_sampling_ratio/max": 2.463897228240967,
"sampling/importance_sampling_ratio/mean": 1.0013692378997803,
"sampling/importance_sampling_ratio/min": 0.36485153436660767,
"sampling/sampling_logp_difference/max": 1.0082647800445557,
"sampling/sampling_logp_difference/mean": 0.012561045587062836,
"step": 52,
"step_time": 88.74248152199925
},
{
"clip_ratio/high_max": 0.006392595882061869,
"clip_ratio/high_mean": 0.0031962979410309345,
"clip_ratio/low_mean": 0.00391515699448064,
"clip_ratio/low_min": 0.0014124744920991361,
"clip_ratio/region_mean": 0.007111454935511574,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9027.0,
"completions/max_terminated_length": 9027.0,
"completions/mean_length": 7856.53125,
"completions/mean_terminated_length": 7856.53125,
"completions/min_length": 1737.0,
"completions/min_terminated_length": 1737.0,
"entropy": 0.12024440453387797,
"epoch": 0.00212,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.004279136657715,
"kl": 0.010564027619693661,
"learning_rate": 7.967942611911098e-06,
"loss": 0.0652,
"num_tokens": 14527107.0,
"reward": -0.3790961503982544,
"reward_std": 0.3861127495765686,
"rewards/rollout_reward_func/mean": -0.3790961503982544,
"rewards/rollout_reward_func/std": 0.5224611759185791,
"sampling/importance_sampling_ratio/max": 2.314476251602173,
"sampling/importance_sampling_ratio/mean": 0.9991632699966431,
"sampling/importance_sampling_ratio/min": 0.4916602075099945,
"sampling/sampling_logp_difference/max": 0.8391833305358887,
"sampling/sampling_logp_difference/mean": 0.014314696192741394,
"step": 53,
"step_time": 94.72645873500005
},
{
"clip_ratio/high_max": 0.0043270515743643045,
"clip_ratio/high_mean": 0.0027531484374776483,
"clip_ratio/low_mean": 0.005383253679610789,
"clip_ratio/low_min": 0.0014124744920991361,
"clip_ratio/region_mean": 0.008136402087984607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9499.0,
"completions/max_terminated_length": 9499.0,
"completions/mean_length": 7953.84375,
"completions/mean_terminated_length": 7953.84375,
"completions/min_length": 2228.0,
"completions/min_terminated_length": 2228.0,
"entropy": 0.13417695788666606,
"epoch": 0.00216,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5954021215438843,
"kl": 0.011198288608284201,
"learning_rate": 7.964067996991091e-06,
"loss": 0.0528,
"num_tokens": 14808243.0,
"reward": -0.40133580565452576,
"reward_std": 0.49466052651405334,
"rewards/rollout_reward_func/mean": -0.40133580565452576,
"rewards/rollout_reward_func/std": 0.5125959515571594,
"sampling/importance_sampling_ratio/max": 1.7431554794311523,
"sampling/importance_sampling_ratio/mean": 1.0006169080734253,
"sampling/importance_sampling_ratio/min": 0.5706778764724731,
"sampling/sampling_logp_difference/max": 0.5609303712844849,
"sampling/sampling_logp_difference/mean": 0.014247460290789604,
"step": 54,
"step_time": 94.98353923000104
},
{
"clip_ratio/high_max": 0.0049565358203835785,
"clip_ratio/high_mean": 0.003176613769028336,
"clip_ratio/low_mean": 0.0034777895780280232,
"clip_ratio/low_min": 0.0007183908019214869,
"clip_ratio/region_mean": 0.006654403347056359,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9193.0,
"completions/max_terminated_length": 9193.0,
"completions/mean_length": 7506.28125,
"completions/mean_terminated_length": 7506.28125,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"entropy": 0.10779642057605088,
"epoch": 0.0022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.364328384399414,
"kl": 0.00858347719622543,
"learning_rate": 7.95997379867497e-06,
"loss": -0.3026,
"num_tokens": 15075076.0,
"reward": -0.5677193403244019,
"reward_std": 0.3086436986923218,
"rewards/rollout_reward_func/mean": -0.5677193403244019,
"rewards/rollout_reward_func/std": 0.30036547780036926,
"sampling/importance_sampling_ratio/max": 1.7251583337783813,
"sampling/importance_sampling_ratio/mean": 0.9997882843017578,
"sampling/importance_sampling_ratio/min": 0.411167174577713,
"sampling/sampling_logp_difference/max": 0.8887554407119751,
"sampling/sampling_logp_difference/mean": 0.012287753634154797,
"step": 55,
"step_time": 92.83784514500621
},
{
"clip_ratio/high_max": 0.007216088590212166,
"clip_ratio/high_mean": 0.003608044295106083,
"clip_ratio/low_mean": 0.0014666151255369186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005074659449746832,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8876.0,
"completions/max_terminated_length": 8876.0,
"completions/mean_length": 7641.3125,
"completions/mean_terminated_length": 7641.3125,
"completions/min_length": 1432.0,
"completions/min_terminated_length": 1432.0,
"entropy": 0.10442002071067691,
"epoch": 0.00224,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2740049362182617,
"kl": 0.008950471974458196,
"learning_rate": 7.955660320268182e-06,
"loss": -0.1444,
"num_tokens": 15346367.0,
"reward": -0.4558144807815552,
"reward_std": 0.3483259677886963,
"rewards/rollout_reward_func/mean": -0.4558144807815552,
"rewards/rollout_reward_func/std": 0.4156731367111206,
"sampling/importance_sampling_ratio/max": 1.416229248046875,
"sampling/importance_sampling_ratio/mean": 0.999237060546875,
"sampling/importance_sampling_ratio/min": 0.46688735485076904,
"sampling/sampling_logp_difference/max": 0.7616672515869141,
"sampling/sampling_logp_difference/mean": 0.011178133077919483,
"step": 56,
"step_time": 94.99530417499773
},
{
"clip_ratio/high_max": 0.008344250149093568,
"clip_ratio/high_mean": 0.005249992886092514,
"clip_ratio/low_mean": 0.0018445987370796502,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007094591623172164,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9259.0,
"completions/max_terminated_length": 9259.0,
"completions/mean_length": 8172.09375,
"completions/mean_terminated_length": 8172.09375,
"completions/min_length": 1553.0,
"completions/min_terminated_length": 1553.0,
"entropy": 0.10022557340562344,
"epoch": 0.00228,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4207067489624023,
"kl": 0.008608459174865857,
"learning_rate": 7.951127881320829e-06,
"loss": 0.0944,
"num_tokens": 15634481.0,
"reward": -0.40433624386787415,
"reward_std": 0.40806153416633606,
"rewards/rollout_reward_func/mean": -0.40433624386787415,
"rewards/rollout_reward_func/std": 0.5100822448730469,
"sampling/importance_sampling_ratio/max": 2.1468729972839355,
"sampling/importance_sampling_ratio/mean": 0.9993807077407837,
"sampling/importance_sampling_ratio/min": 0.48448964953422546,
"sampling/sampling_logp_difference/max": 0.764012336730957,
"sampling/sampling_logp_difference/mean": 0.011580238118767738,
"step": 57,
"step_time": 95.68224068699783
},
{
"clip_ratio/high_max": 0.005006112158298492,
"clip_ratio/high_mean": 0.002503056079149246,
"clip_ratio/low_mean": 0.0023096000077202916,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.004812656086869538,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9625.0,
"completions/max_terminated_length": 9625.0,
"completions/mean_length": 7705.75,
"completions/mean_terminated_length": 7705.75,
"completions/min_length": 822.0,
"completions/min_terminated_length": 822.0,
"entropy": 0.10624475823715329,
"epoch": 0.00232,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3904070854187012,
"kl": 0.013951810746220872,
"learning_rate": 7.946376817604e-06,
"loss": -0.0914,
"num_tokens": 15907912.0,
"reward": -0.5049029588699341,
"reward_std": 0.3845633864402771,
"rewards/rollout_reward_func/mean": -0.5049029588699341,
"rewards/rollout_reward_func/std": 0.46199285984039307,
"sampling/importance_sampling_ratio/max": 1.798954963684082,
"sampling/importance_sampling_ratio/mean": 1.0002611875534058,
"sampling/importance_sampling_ratio/min": 0.46839648485183716,
"sampling/sampling_logp_difference/max": 0.7584401965141296,
"sampling/sampling_logp_difference/mean": 0.0127301886677742,
"step": 58,
"step_time": 93.91949066700363
},
{
"clip_ratio/high_max": 0.00436182162957266,
"clip_ratio/high_mean": 0.00218091081478633,
"clip_ratio/low_mean": 0.003825479419901967,
"clip_ratio/low_min": 0.0014204545877873898,
"clip_ratio/region_mean": 0.006006390263792127,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9536.0,
"completions/max_terminated_length": 9536.0,
"completions/mean_length": 8050.53125,
"completions/mean_terminated_length": 8050.53125,
"completions/min_length": 894.0,
"completions/min_terminated_length": 894.0,
"entropy": 0.11231644381769001,
"epoch": 0.00236,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3271838426589966,
"kl": 0.007607599673065124,
"learning_rate": 7.941407481084896e-06,
"loss": -0.1529,
"num_tokens": 16191978.0,
"reward": -0.5152639150619507,
"reward_std": 0.3767213821411133,
"rewards/rollout_reward_func/mean": -0.5152639150619507,
"rewards/rollout_reward_func/std": 0.4134299159049988,
"sampling/importance_sampling_ratio/max": 1.7963619232177734,
"sampling/importance_sampling_ratio/mean": 0.9979240894317627,
"sampling/importance_sampling_ratio/min": 0.4781153202056885,
"sampling/sampling_logp_difference/max": 0.737903356552124,
"sampling/sampling_logp_difference/mean": 0.012326521798968315,
"step": 59,
"step_time": 96.3539147010033
},
{
"clip_ratio/high_max": 0.008266003103926778,
"clip_ratio/high_mean": 0.004488115198910236,
"clip_ratio/low_mean": 0.004311547527322546,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008799662697128952,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9246.0,
"completions/max_terminated_length": 9246.0,
"completions/mean_length": 8110.9375,
"completions/mean_terminated_length": 8110.9375,
"completions/min_length": 3902.0,
"completions/min_terminated_length": 3902.0,
"entropy": 0.12428534729406238,
"epoch": 0.0024,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9092047214508057,
"kl": 0.012073749028786551,
"learning_rate": 7.936220239900746e-06,
"loss": -0.0357,
"num_tokens": 16478068.0,
"reward": -0.4510955512523651,
"reward_std": 0.31745630502700806,
"rewards/rollout_reward_func/mean": -0.4510955512523651,
"rewards/rollout_reward_func/std": 0.37336239218711853,
"sampling/importance_sampling_ratio/max": 2.0322628021240234,
"sampling/importance_sampling_ratio/mean": 0.9984027147293091,
"sampling/importance_sampling_ratio/min": 0.535262942314148,
"sampling/sampling_logp_difference/max": 0.7091498374938965,
"sampling/sampling_logp_difference/mean": 0.015058638527989388,
"step": 60,
"step_time": 95.91656595599852
},
{
"clip_ratio/high_max": 0.0013888889225199819,
"clip_ratio/high_mean": 0.0006944444612599909,
"clip_ratio/low_mean": 0.0022155048209242523,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002909949282184243,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9154.0,
"completions/max_terminated_length": 9154.0,
"completions/mean_length": 7612.8125,
"completions/mean_terminated_length": 7612.8125,
"completions/min_length": 1984.0,
"completions/min_terminated_length": 1984.0,
"entropy": 0.10300399106927216,
"epoch": 0.00244,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.391798973083496,
"kl": 0.004920201638014987,
"learning_rate": 7.930815478331545e-06,
"loss": -0.1339,
"num_tokens": 16748102.0,
"reward": -0.4280434846878052,
"reward_std": 0.4262932240962982,
"rewards/rollout_reward_func/mean": -0.4280434846878052,
"rewards/rollout_reward_func/std": 0.5089011192321777,
"sampling/importance_sampling_ratio/max": 1.7586421966552734,
"sampling/importance_sampling_ratio/mean": 1.000064730644226,
"sampling/importance_sampling_ratio/min": 0.37761256098747253,
"sampling/sampling_logp_difference/max": 0.9738866090774536,
"sampling/sampling_logp_difference/mean": 0.011382215656340122,
"step": 61,
"step_time": 92.25858828300079
},
{
"clip_ratio/high_max": 0.007856509007979184,
"clip_ratio/high_mean": 0.003928254503989592,
"clip_ratio/low_mean": 0.0018972224788740277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00582547701196745,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9187.0,
"completions/max_terminated_length": 9187.0,
"completions/mean_length": 7976.46875,
"completions/mean_terminated_length": 7976.46875,
"completions/min_length": 3602.0,
"completions/min_terminated_length": 3602.0,
"entropy": 0.11776942946016788,
"epoch": 0.00248,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3773763179779053,
"kl": 0.006828719437180553,
"learning_rate": 7.925193596771585e-06,
"loss": -0.1787,
"num_tokens": 17030139.0,
"reward": -0.4070221781730652,
"reward_std": 0.4106467366218567,
"rewards/rollout_reward_func/mean": -0.4070221781730652,
"rewards/rollout_reward_func/std": 0.44072529673576355,
"sampling/importance_sampling_ratio/max": 1.859816312789917,
"sampling/importance_sampling_ratio/mean": 0.9994407296180725,
"sampling/importance_sampling_ratio/min": 0.4592384099960327,
"sampling/sampling_logp_difference/max": 0.7781858444213867,
"sampling/sampling_logp_difference/mean": 0.011814535595476627,
"step": 62,
"step_time": 92.16294349900272
},
{
"clip_ratio/high_max": 0.0035953749902546406,
"clip_ratio/high_mean": 0.0017976874951273203,
"clip_ratio/low_mean": 0.0022025909856893122,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004000278509920463,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9030.0,
"completions/max_terminated_length": 9030.0,
"completions/mean_length": 8136.34375,
"completions/mean_terminated_length": 8136.34375,
"completions/min_length": 2270.0,
"completions/min_terminated_length": 2270.0,
"entropy": 0.09144307160750031,
"epoch": 0.00252,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1514160633087158,
"kl": 0.00797528739349218,
"learning_rate": 7.919355011699786e-06,
"loss": -0.0769,
"num_tokens": 17316894.0,
"reward": -0.4627407193183899,
"reward_std": 0.3360978960990906,
"rewards/rollout_reward_func/mean": -0.4627407193183899,
"rewards/rollout_reward_func/std": 0.35081133246421814,
"sampling/importance_sampling_ratio/max": 1.6952903270721436,
"sampling/importance_sampling_ratio/mean": 0.9994579553604126,
"sampling/importance_sampling_ratio/min": 0.4881785213947296,
"sampling/sampling_logp_difference/max": 0.7170741558074951,
"sampling/sampling_logp_difference/mean": 0.010036073625087738,
"step": 63,
"step_time": 96.17660570699991
},
{
"clip_ratio/high_max": 0.002883423527237028,
"clip_ratio/high_mean": 0.001441711763618514,
"clip_ratio/low_mean": 0.00249927889672108,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003940990660339594,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9163.0,
"completions/max_terminated_length": 9163.0,
"completions/mean_length": 7561.375,
"completions/mean_terminated_length": 7561.375,
"completions/min_length": 3059.0,
"completions/min_terminated_length": 3059.0,
"entropy": 0.09984453371725976,
"epoch": 0.00256,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3233143091201782,
"kl": 0.00887665447044128,
"learning_rate": 7.913300155648851e-06,
"loss": 0.0311,
"num_tokens": 17585042.0,
"reward": -0.3819345235824585,
"reward_std": 0.46788647770881653,
"rewards/rollout_reward_func/mean": -0.3819345235824585,
"rewards/rollout_reward_func/std": 0.5240892171859741,
"sampling/importance_sampling_ratio/max": 1.9474657773971558,
"sampling/importance_sampling_ratio/mean": 0.9996236562728882,
"sampling/importance_sampling_ratio/min": 0.537067711353302,
"sampling/sampling_logp_difference/max": 0.6665289402008057,
"sampling/sampling_logp_difference/mean": 0.010441070422530174,
"step": 64,
"step_time": 92.7075216550038
},
{
"clip_ratio/high_max": 0.009018870943691581,
"clip_ratio/high_mean": 0.004860559041844681,
"clip_ratio/low_mean": 0.001825410407036543,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006685969448881224,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9200.0,
"completions/max_terminated_length": 9200.0,
"completions/mean_length": 7343.84375,
"completions/mean_terminated_length": 7343.84375,
"completions/min_length": 1590.0,
"completions/min_terminated_length": 1590.0,
"entropy": 0.11347688268870115,
"epoch": 0.0026,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4888594150543213,
"kl": 0.01094079991162289,
"learning_rate": 7.907029477173219e-06,
"loss": 0.0067,
"num_tokens": 17846223.0,
"reward": -0.3279417157173157,
"reward_std": 0.5277228951454163,
"rewards/rollout_reward_func/mean": -0.3279417157173157,
"rewards/rollout_reward_func/std": 0.6081415414810181,
"sampling/importance_sampling_ratio/max": 1.5022884607315063,
"sampling/importance_sampling_ratio/mean": 0.9989460706710815,
"sampling/importance_sampling_ratio/min": 0.5411266088485718,
"sampling/sampling_logp_difference/max": 0.6141020059585571,
"sampling/sampling_logp_difference/mean": 0.011345047503709793,
"step": 65,
"step_time": 90.58205723999708
},
{
"clip_ratio/high_max": 0.004411768633872271,
"clip_ratio/high_mean": 0.002557007916038856,
"clip_ratio/low_mean": 0.0017878134967759252,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004344821412814781,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9217.0,
"completions/max_terminated_length": 9217.0,
"completions/mean_length": 7967.03125,
"completions/mean_terminated_length": 7967.03125,
"completions/min_length": 2804.0,
"completions/min_terminated_length": 2804.0,
"entropy": 0.1117274472489953,
"epoch": 0.00264,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3865375518798828,
"kl": 0.009410161008418072,
"learning_rate": 7.900543440815832e-06,
"loss": -0.0278,
"num_tokens": 18127517.0,
"reward": -0.3698572516441345,
"reward_std": 0.43620961904525757,
"rewards/rollout_reward_func/mean": -0.3698572516441345,
"rewards/rollout_reward_func/std": 0.48424720764160156,
"sampling/importance_sampling_ratio/max": 2.5134615898132324,
"sampling/importance_sampling_ratio/mean": 0.999183714389801,
"sampling/importance_sampling_ratio/min": 0.4420686364173889,
"sampling/sampling_logp_difference/max": 0.9216609001159668,
"sampling/sampling_logp_difference/mean": 0.01282300055027008,
"step": 66,
"step_time": 95.30812872799834
},
{
"clip_ratio/high_max": 0.0057083725114353,
"clip_ratio/high_mean": 0.00285418625571765,
"clip_ratio/low_mean": 0.002307449496584013,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005161635781405494,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9051.0,
"completions/max_terminated_length": 9051.0,
"completions/mean_length": 7223.84375,
"completions/mean_terminated_length": 7223.84375,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.10340581508353353,
"epoch": 0.00268,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1929662227630615,
"kl": 0.004904634231934324,
"learning_rate": 7.89384252707373e-06,
"loss": -0.3888,
"num_tokens": 18385190.0,
"reward": -0.48364531993865967,
"reward_std": 0.3940197825431824,
"rewards/rollout_reward_func/mean": -0.48364531993865967,
"rewards/rollout_reward_func/std": 0.428783118724823,
"sampling/importance_sampling_ratio/max": 1.5800230503082275,
"sampling/importance_sampling_ratio/mean": 0.9995721578598022,
"sampling/importance_sampling_ratio/min": 0.3921872675418854,
"sampling/sampling_logp_difference/max": 0.9360158443450928,
"sampling/sampling_logp_difference/mean": 0.010175148025155067,
"step": 67,
"step_time": 91.31019782999829
},
{
"clip_ratio/high_max": 0.009525597095489502,
"clip_ratio/high_mean": 0.005110020807478577,
"clip_ratio/low_mean": 0.002821325935656205,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00793134668492712,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9310.0,
"completions/max_terminated_length": 9310.0,
"completions/mean_length": 7807.96875,
"completions/mean_terminated_length": 7807.96875,
"completions/min_length": 1538.0,
"completions/min_terminated_length": 1538.0,
"entropy": 0.11348246154375374,
"epoch": 0.00272,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4794671535491943,
"kl": 0.009939985982782673,
"learning_rate": 7.886927232362445e-06,
"loss": 0.1917,
"num_tokens": 18661453.0,
"reward": -0.29462701082229614,
"reward_std": 0.5272692441940308,
"rewards/rollout_reward_func/mean": -0.29462701082229614,
"rewards/rollout_reward_func/std": 0.5638213753700256,
"sampling/importance_sampling_ratio/max": 2.0398762226104736,
"sampling/importance_sampling_ratio/mean": 1.0006808042526245,
"sampling/importance_sampling_ratio/min": 0.3978581726551056,
"sampling/sampling_logp_difference/max": 0.9216597080230713,
"sampling/sampling_logp_difference/mean": 0.013158271089196205,
"step": 68,
"step_time": 94.51114526800302
},
{
"clip_ratio/high_max": 0.0028011860558763146,
"clip_ratio/high_mean": 0.0017478152585681528,
"clip_ratio/low_mean": 0.0007023358775768429,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024501511361449957,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9273.0,
"completions/max_terminated_length": 9273.0,
"completions/mean_length": 8334.46875,
"completions/mean_terminated_length": 8334.46875,
"completions/min_length": 2627.0,
"completions/min_terminated_length": 2627.0,
"entropy": 0.10280115297064185,
"epoch": 0.00276,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4324398040771484,
"kl": 0.007807943955413066,
"learning_rate": 7.879798068979234e-06,
"loss": 0.0179,
"num_tokens": 18954599.0,
"reward": -0.3730016350746155,
"reward_std": 0.33994197845458984,
"rewards/rollout_reward_func/mean": -0.3730016350746155,
"rewards/rollout_reward_func/std": 0.4095904231071472,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.00093674659729,
"sampling/importance_sampling_ratio/min": 0.49454185366630554,
"sampling/sampling_logp_difference/max": 1.1368892192840576,
"sampling/sampling_logp_difference/mean": 0.010166226886212826,
"step": 69,
"step_time": 95.59550246900108
},
{
"clip_ratio/high_max": 0.004303681547753513,
"clip_ratio/high_mean": 0.0021518407738767564,
"clip_ratio/low_mean": 0.0021395922813098878,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004291433084290475,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9158.0,
"completions/max_terminated_length": 9158.0,
"completions/mean_length": 7782.03125,
"completions/mean_terminated_length": 7782.03125,
"completions/min_length": 1927.0,
"completions/min_terminated_length": 1927.0,
"entropy": 0.09547213604673743,
"epoch": 0.0028,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1895850896835327,
"kl": 0.005259023229882587,
"learning_rate": 7.872455565065118e-06,
"loss": -0.1185,
"num_tokens": 19230041.0,
"reward": -0.4593248963356018,
"reward_std": 0.37924250960350037,
"rewards/rollout_reward_func/mean": -0.4593248963356018,
"rewards/rollout_reward_func/std": 0.39817455410957336,
"sampling/importance_sampling_ratio/max": 1.7586390972137451,
"sampling/importance_sampling_ratio/mean": 0.9991085529327393,
"sampling/importance_sampling_ratio/min": 0.4861142635345459,
"sampling/sampling_logp_difference/max": 0.7213115692138672,
"sampling/sampling_logp_difference/mean": 0.01034074928611517,
"step": 70,
"step_time": 92.33433518700258
},
{
"clip_ratio/high_max": 0.00991362234344706,
"clip_ratio/high_mean": 0.0060480811225716025,
"clip_ratio/low_mean": 0.0010794433765113354,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007127524469979107,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9219.0,
"completions/max_terminated_length": 9219.0,
"completions/mean_length": 7936.96875,
"completions/mean_terminated_length": 7936.96875,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.11046935408376157,
"epoch": 0.00284,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2573769092559814,
"kl": 0.00650980025602621,
"learning_rate": 7.864900264565765e-06,
"loss": -0.0379,
"num_tokens": 19509951.0,
"reward": -0.520825982093811,
"reward_std": 0.35236403346061707,
"rewards/rollout_reward_func/mean": -0.520825982093811,
"rewards/rollout_reward_func/std": 0.42140084505081177,
"sampling/importance_sampling_ratio/max": 1.9192090034484863,
"sampling/importance_sampling_ratio/mean": 0.9996625185012817,
"sampling/importance_sampling_ratio/min": 0.4776991903781891,
"sampling/sampling_logp_difference/max": 0.7387740612030029,
"sampling/sampling_logp_difference/mean": 0.010734163224697113,
"step": 71,
"step_time": 93.32257420199858
},
{
"clip_ratio/high_max": 0.00632950384169817,
"clip_ratio/high_mean": 0.004238442983478308,
"clip_ratio/low_mean": 0.001788180525181815,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006026623508660123,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9523.0,
"completions/max_terminated_length": 9523.0,
"completions/mean_length": 8397.84375,
"completions/mean_terminated_length": 8397.84375,
"completions/min_length": 2433.0,
"completions/min_terminated_length": 2433.0,
"entropy": 0.11352175939828157,
"epoch": 0.00288,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4297624826431274,
"kl": 0.005914928762649652,
"learning_rate": 7.857132727191193e-06,
"loss": 0.0541,
"num_tokens": 19805153.0,
"reward": -0.3045397400856018,
"reward_std": 0.33476388454437256,
"rewards/rollout_reward_func/mean": -0.3045397400856018,
"rewards/rollout_reward_func/std": 0.4641619026660919,
"sampling/importance_sampling_ratio/max": 1.5166332721710205,
"sampling/importance_sampling_ratio/mean": 1.000412940979004,
"sampling/importance_sampling_ratio/min": 0.4192443788051605,
"sampling/sampling_logp_difference/max": 0.8693013191223145,
"sampling/sampling_logp_difference/mean": 0.010624254122376442,
"step": 72,
"step_time": 96.8673131069936
},
{
"clip_ratio/high_max": 0.006114824442192912,
"clip_ratio/high_mean": 0.003057412221096456,
"clip_ratio/low_mean": 0.0060244997730478644,
"clip_ratio/low_min": 0.002430555585306138,
"clip_ratio/region_mean": 0.00908191199414432,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9157.0,
"completions/max_terminated_length": 9157.0,
"completions/mean_length": 7370.25,
"completions/mean_terminated_length": 7370.25,
"completions/min_length": 1337.0,
"completions/min_terminated_length": 1337.0,
"entropy": 0.12441039457917213,
"epoch": 0.00292,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3885090351104736,
"kl": 0.014640487075666897,
"learning_rate": 7.849153528374295e-06,
"loss": -0.1697,
"num_tokens": 20067395.0,
"reward": -0.5984026789665222,
"reward_std": 0.3406347632408142,
"rewards/rollout_reward_func/mean": -0.5984026789665222,
"rewards/rollout_reward_func/std": 0.4121968448162079,
"sampling/importance_sampling_ratio/max": 1.8335965871810913,
"sampling/importance_sampling_ratio/mean": 1.000123381614685,
"sampling/importance_sampling_ratio/min": 0.560712456703186,
"sampling/sampling_logp_difference/max": 0.6062793731689453,
"sampling/sampling_logp_difference/mean": 0.013483337126672268,
"step": 73,
"step_time": 89.83431924199795
},
{
"clip_ratio/high_max": 0.007186023925896734,
"clip_ratio/high_mean": 0.00395220736390911,
"clip_ratio/low_mean": 0.0026030205481220037,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006555227912031114,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9158.0,
"completions/max_terminated_length": 9158.0,
"completions/mean_length": 7705.6875,
"completions/mean_terminated_length": 7705.6875,
"completions/min_length": 578.0,
"completions/min_terminated_length": 578.0,
"entropy": 0.10846898006275296,
"epoch": 0.00296,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3326221704483032,
"kl": 0.008529015118256211,
"learning_rate": 7.840963259228225e-06,
"loss": -0.1738,
"num_tokens": 20340257.0,
"reward": -0.4566282629966736,
"reward_std": 0.36557790637016296,
"rewards/rollout_reward_func/mean": -0.4566282629966736,
"rewards/rollout_reward_func/std": 0.39172062277793884,
"sampling/importance_sampling_ratio/max": 1.7097969055175781,
"sampling/importance_sampling_ratio/mean": 0.9991310238838196,
"sampling/importance_sampling_ratio/min": 0.39339712262153625,
"sampling/sampling_logp_difference/max": 0.9329357147216797,
"sampling/sampling_logp_difference/mean": 0.011443836614489555,
"step": 74,
"step_time": 94.07717578700249
},
{
"clip_ratio/high_max": 0.0028093435103073716,
"clip_ratio/high_mean": 0.0014046717551536858,
"clip_ratio/low_mean": 0.0007022471982054412,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002106918953359127,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9250.0,
"completions/max_terminated_length": 9250.0,
"completions/mean_length": 7632.8125,
"completions/mean_terminated_length": 7632.8125,
"completions/min_length": 414.0,
"completions/min_terminated_length": 414.0,
"entropy": 0.10531778959557414,
"epoch": 0.003,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4729112386703491,
"kl": 0.009549812879413366,
"learning_rate": 7.832562526502598e-06,
"loss": -0.1487,
"num_tokens": 20610590.0,
"reward": -0.5081942677497864,
"reward_std": 0.3885437250137329,
"rewards/rollout_reward_func/mean": -0.5081942677497864,
"rewards/rollout_reward_func/std": 0.4038866460323334,
"sampling/importance_sampling_ratio/max": 2.1431355476379395,
"sampling/importance_sampling_ratio/mean": 1.0006523132324219,
"sampling/importance_sampling_ratio/min": 0.5124464631080627,
"sampling/sampling_logp_difference/max": 0.7622699737548828,
"sampling/sampling_logp_difference/mean": 0.010758567601442337,
"step": 75,
"step_time": 91.45560667300197
},
{
"clip_ratio/high_max": 0.004320964915677905,
"clip_ratio/high_mean": 0.0025281294947490096,
"clip_ratio/low_mean": 0.002119072509231046,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0046472020039800555,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9478.0,
"completions/max_terminated_length": 9478.0,
"completions/mean_length": 7664.84375,
"completions/mean_terminated_length": 7664.84375,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.1210014394018799,
"epoch": 0.00304,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.333920955657959,
"kl": 0.008828506610370823,
"learning_rate": 7.82395195253854e-06,
"loss": -0.0473,
"num_tokens": 20882278.0,
"reward": -0.33376026153564453,
"reward_std": 0.5433810949325562,
"rewards/rollout_reward_func/mean": -0.33376026153564453,
"rewards/rollout_reward_func/std": 0.5932837724685669,
"sampling/importance_sampling_ratio/max": 1.7989699840545654,
"sampling/importance_sampling_ratio/mean": 0.9988389015197754,
"sampling/importance_sampling_ratio/min": 0.5031105279922485,
"sampling/sampling_logp_difference/max": 0.686945378780365,
"sampling/sampling_logp_difference/mean": 0.01223824918270111,
"step": 76,
"step_time": 93.80719368800237
},
{
"clip_ratio/high_max": 0.004671525151934475,
"clip_ratio/high_mean": 0.0023357625759672374,
"clip_ratio/low_mean": 0.0031846001220401376,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.0055203626689035445,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8957.0,
"completions/max_terminated_length": 8957.0,
"completions/mean_length": 7233.0625,
"completions/mean_terminated_length": 7233.0625,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.11074003390967846,
"epoch": 0.00308,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5347291231155396,
"kl": 0.00865966931451112,
"learning_rate": 7.815132175222592e-06,
"loss": -0.1913,
"num_tokens": 21139607.0,
"reward": -0.4299502968788147,
"reward_std": 0.5717419385910034,
"rewards/rollout_reward_func/mean": -0.4299502968788147,
"rewards/rollout_reward_func/std": 0.5658487677574158,
"sampling/importance_sampling_ratio/max": 3.0,
"sampling/importance_sampling_ratio/mean": 1.000859260559082,
"sampling/importance_sampling_ratio/min": 0.5086528062820435,
"sampling/sampling_logp_difference/max": 1.1856822967529297,
"sampling/sampling_logp_difference/mean": 0.011567970737814903,
"step": 77,
"step_time": 88.42593535700325
},
{
"clip_ratio/high_max": 0.0021392186754383147,
"clip_ratio/high_mean": 0.0010696093377191573,
"clip_ratio/low_mean": 0.00273930755793117,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.0038089169247541577,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9201.0,
"completions/max_terminated_length": 9201.0,
"completions/mean_length": 7827.09375,
"completions/mean_terminated_length": 7827.09375,
"completions/min_length": 1987.0,
"completions/min_terminated_length": 1987.0,
"entropy": 0.10664277244359255,
"epoch": 0.00312,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2181073427200317,
"kl": 0.006914699915796518,
"learning_rate": 7.806103847939445e-06,
"loss": -0.1403,
"num_tokens": 21416098.0,
"reward": -0.44013938307762146,
"reward_std": 0.407479465007782,
"rewards/rollout_reward_func/mean": -0.44013938307762146,
"rewards/rollout_reward_func/std": 0.44036611914634705,
"sampling/importance_sampling_ratio/max": 2.069561004638672,
"sampling/importance_sampling_ratio/mean": 1.0012619495391846,
"sampling/importance_sampling_ratio/min": 0.6060473918914795,
"sampling/sampling_logp_difference/max": 0.7273365259170532,
"sampling/sampling_logp_difference/mean": 0.010566653683781624,
"step": 78,
"step_time": 92.46403468800418
},
{
"clip_ratio/high_max": 0.004304705187678337,
"clip_ratio/high_mean": 0.0021523525938391685,
"clip_ratio/low_mean": 0.00035511364694684744,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002507466240786016,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9566.0,
"completions/max_terminated_length": 9566.0,
"completions/mean_length": 6899.25,
"completions/mean_terminated_length": 6899.25,
"completions/min_length": 950.0,
"completions/min_terminated_length": 950.0,
"entropy": 0.1280936081893742,
"epoch": 0.00316,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.282666802406311,
"kl": 0.01210544134664815,
"learning_rate": 7.79686763952354e-06,
"loss": -0.3126,
"num_tokens": 21663152.0,
"reward": -0.4885612428188324,
"reward_std": 0.42654168605804443,
"rewards/rollout_reward_func/mean": -0.4885612428188324,
"rewards/rollout_reward_func/std": 0.4574175775051117,
"sampling/importance_sampling_ratio/max": 1.6807042360305786,
"sampling/importance_sampling_ratio/mean": 0.9999642372131348,
"sampling/importance_sampling_ratio/min": 0.6889766454696655,
"sampling/sampling_logp_difference/max": 0.5192129611968994,
"sampling/sampling_logp_difference/mean": 0.012327692471444607,
"step": 79,
"step_time": 86.57649834300173
},
{
"clip_ratio/high_max": 0.005642829288262874,
"clip_ratio/high_mean": 0.002821414644131437,
"clip_ratio/low_mean": 0.0033451107738073915,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006166525447042659,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9229.0,
"completions/max_terminated_length": 9229.0,
"completions/mean_length": 7748.8125,
"completions/mean_terminated_length": 7748.8125,
"completions/min_length": 959.0,
"completions/min_terminated_length": 959.0,
"entropy": 0.11266007972881198,
"epoch": 0.0032,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2385865449905396,
"kl": 0.009715210646390915,
"learning_rate": 7.787424234209523e-06,
"loss": -0.2138,
"num_tokens": 21937291.0,
"reward": -0.5135730504989624,
"reward_std": 0.28290629386901855,
"rewards/rollout_reward_func/mean": -0.5135730504989624,
"rewards/rollout_reward_func/std": 0.29492101073265076,
"sampling/importance_sampling_ratio/max": 1.8985683917999268,
"sampling/importance_sampling_ratio/mean": 1.0005488395690918,
"sampling/importance_sampling_ratio/min": 0.21555323898792267,
"sampling/sampling_logp_difference/max": 1.5345473289489746,
"sampling/sampling_logp_difference/mean": 0.012439057230949402,
"step": 80,
"step_time": 94.79033007999533
},
{
"clip_ratio/high_max": 0.004234770254697651,
"clip_ratio/high_mean": 0.002476580470101908,
"clip_ratio/low_mean": 0.0045510306663345546,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007027611136436462,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9216.0,
"completions/max_terminated_length": 9216.0,
"completions/mean_length": 7755.90625,
"completions/mean_terminated_length": 7755.90625,
"completions/min_length": 1219.0,
"completions/min_terminated_length": 1219.0,
"entropy": 0.10599858709610999,
"epoch": 0.00324,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1525481939315796,
"kl": 0.0123361352016218,
"learning_rate": 7.77777433158155e-06,
"loss": -0.1473,
"num_tokens": 22211636.0,
"reward": -0.4603438377380371,
"reward_std": 0.3456616699695587,
"rewards/rollout_reward_func/mean": -0.4603438377380371,
"rewards/rollout_reward_func/std": 0.43396344780921936,
"sampling/importance_sampling_ratio/max": 2.0103213787078857,
"sampling/importance_sampling_ratio/mean": 0.999226450920105,
"sampling/importance_sampling_ratio/min": 0.6042999029159546,
"sampling/sampling_logp_difference/max": 0.6982946395874023,
"sampling/sampling_logp_difference/mean": 0.01090179942548275,
"step": 81,
"step_time": 91.66288167299899
},
{
"clip_ratio/high_max": 0.006423490063752979,
"clip_ratio/high_mean": 0.0035628686600830406,
"clip_ratio/low_mean": 0.002858268067939207,
"clip_ratio/low_min": 0.0014044943964108825,
"clip_ratio/region_mean": 0.0064211367862299085,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9370.0,
"completions/max_terminated_length": 9370.0,
"completions/mean_length": 7690.96875,
"completions/mean_terminated_length": 7690.96875,
"completions/min_length": 655.0,
"completions/min_terminated_length": 655.0,
"entropy": 0.12091189902275801,
"epoch": 0.00328,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2017008066177368,
"kl": 0.011158690598676912,
"learning_rate": 7.767918646521461e-06,
"loss": -0.0613,
"num_tokens": 22483765.0,
"reward": -0.3411584496498108,
"reward_std": 0.49997615814208984,
"rewards/rollout_reward_func/mean": -0.3411584496498108,
"rewards/rollout_reward_func/std": 0.5335732698440552,
"sampling/importance_sampling_ratio/max": 1.4986999034881592,
"sampling/importance_sampling_ratio/mean": 0.9999135732650757,
"sampling/importance_sampling_ratio/min": 0.37155619263648987,
"sampling/sampling_logp_difference/max": 0.9900552034378052,
"sampling/sampling_logp_difference/mean": 0.010972358286380768,
"step": 82,
"step_time": 95.45822799199959
},
{
"clip_ratio/high_max": 0.005522000603377819,
"clip_ratio/high_mean": 0.0027610003016889095,
"clip_ratio/low_mean": 0.0028991997824050486,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005660199996782467,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9288.0,
"completions/max_terminated_length": 9288.0,
"completions/mean_length": 7861.5,
"completions/mean_terminated_length": 7861.5,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"entropy": 0.12187603581696749,
"epoch": 0.00332,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2622228860855103,
"kl": 0.014604453899664804,
"learning_rate": 7.75785790915583e-06,
"loss": 0.0668,
"num_tokens": 22761866.0,
"reward": -0.3003544807434082,
"reward_std": 0.6722631454467773,
"rewards/rollout_reward_func/mean": -0.3003544807434082,
"rewards/rollout_reward_func/std": 0.6533814668655396,
"sampling/importance_sampling_ratio/max": 1.6991353034973145,
"sampling/importance_sampling_ratio/mean": 0.9996775984764099,
"sampling/importance_sampling_ratio/min": 0.07604925334453583,
"sampling/sampling_logp_difference/max": 2.576374053955078,
"sampling/sampling_logp_difference/mean": 0.01398524735122919,
"step": 83,
"step_time": 93.20210950300316
},
{
"clip_ratio/high_max": 0.00501747039379552,
"clip_ratio/high_mean": 0.00250873519689776,
"clip_ratio/low_mean": 0.0032708367507439107,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00577957188943401,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9430.0,
"completions/max_terminated_length": 9430.0,
"completions/mean_length": 7677.46875,
"completions/mean_terminated_length": 7677.46875,
"completions/min_length": 1991.0,
"completions/min_terminated_length": 1991.0,
"entropy": 0.11121218441985548,
"epoch": 0.00336,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3994040489196777,
"kl": 0.027663854853017256,
"learning_rate": 7.74759286480186e-06,
"loss": -0.098,
"num_tokens": 23033870.0,
"reward": -0.3911195993423462,
"reward_std": 0.40892452001571655,
"rewards/rollout_reward_func/mean": -0.3911195993423462,
"rewards/rollout_reward_func/std": 0.49502092599868774,
"sampling/importance_sampling_ratio/max": 1.9817404747009277,
"sampling/importance_sampling_ratio/mean": 0.9991714954376221,
"sampling/importance_sampling_ratio/min": 0.32034245133399963,
"sampling/sampling_logp_difference/max": 1.1383646726608276,
"sampling/sampling_logp_difference/mean": 0.013728287070989609,
"step": 84,
"step_time": 93.50239179400523
},
{
"clip_ratio/high_max": 0.006439179298467934,
"clip_ratio/high_mean": 0.003566811908967793,
"clip_ratio/low_mean": 0.0006127451197244227,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004179557028692216,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9303.0,
"completions/max_terminated_length": 9303.0,
"completions/mean_length": 7925.1875,
"completions/mean_terminated_length": 7925.1875,
"completions/min_length": 1447.0,
"completions/min_terminated_length": 1447.0,
"entropy": 0.1154963904991746,
"epoch": 0.0034,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1012736558914185,
"kl": 0.01787613780470565,
"learning_rate": 7.737124273912181e-06,
"loss": -0.2728,
"num_tokens": 23313399.0,
"reward": -0.43042296171188354,
"reward_std": 0.3645261526107788,
"rewards/rollout_reward_func/mean": -0.43042296171188354,
"rewards/rollout_reward_func/std": 0.39467355608940125,
"sampling/importance_sampling_ratio/max": 1.962335228919983,
"sampling/importance_sampling_ratio/mean": 0.999538779258728,
"sampling/importance_sampling_ratio/min": 0.41583091020584106,
"sampling/sampling_logp_difference/max": 0.8774765729904175,
"sampling/sampling_logp_difference/mean": 0.011602518148720264,
"step": 85,
"step_time": 91.32576291800251
},
{
"clip_ratio/high_max": 0.0035516806528903544,
"clip_ratio/high_mean": 0.0021309539733920246,
"clip_ratio/low_mean": 0.0014086648297961801,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035396188031882048,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9175.0,
"completions/max_terminated_length": 9175.0,
"completions/mean_length": 7601.90625,
"completions/mean_terminated_length": 7601.90625,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"entropy": 0.13196291960775852,
"epoch": 0.00344,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.298827052116394,
"kl": 0.013925580627983436,
"learning_rate": 7.72645291201851e-06,
"loss": -0.2696,
"num_tokens": 23582639.0,
"reward": -0.40399467945098877,
"reward_std": 0.350833535194397,
"rewards/rollout_reward_func/mean": -0.40399467945098877,
"rewards/rollout_reward_func/std": 0.40605658292770386,
"sampling/importance_sampling_ratio/max": 1.568974256515503,
"sampling/importance_sampling_ratio/mean": 0.9993267059326172,
"sampling/importance_sampling_ratio/min": 0.6157581806182861,
"sampling/sampling_logp_difference/max": 0.48490095138549805,
"sampling/sampling_logp_difference/mean": 0.011755777522921562,
"step": 86,
"step_time": 94.93100565299937
},
{
"clip_ratio/high_max": 0.005048143619205803,
"clip_ratio/high_mean": 0.0025240718096029013,
"clip_ratio/low_mean": 0.003231386741390452,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005755458580097184,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9284.0,
"completions/max_terminated_length": 9284.0,
"completions/mean_length": 7635.3125,
"completions/mean_terminated_length": 7635.3125,
"completions/min_length": 2500.0,
"completions/min_terminated_length": 2500.0,
"entropy": 0.11647467198781669,
"epoch": 0.00348,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1801884174346924,
"kl": 0.013889241206925362,
"learning_rate": 7.715579569674193e-06,
"loss": -0.1101,
"num_tokens": 23852759.0,
"reward": -0.4265971779823303,
"reward_std": 0.4005134105682373,
"rewards/rollout_reward_func/mean": -0.4265971779823303,
"rewards/rollout_reward_func/std": 0.46251291036605835,
"sampling/importance_sampling_ratio/max": 2.021784782409668,
"sampling/importance_sampling_ratio/mean": 1.0004267692565918,
"sampling/importance_sampling_ratio/min": 0.4357590675354004,
"sampling/sampling_logp_difference/max": 0.8306658864021301,
"sampling/sampling_logp_difference/mean": 0.011593816801905632,
"step": 87,
"step_time": 91.54987620599968
},
{
"clip_ratio/high_max": 0.0033402342814952135,
"clip_ratio/high_mean": 0.0016701171407476068,
"clip_ratio/low_mean": 0.0017763022624421865,
"clip_ratio/low_min": 0.0006944444612599909,
"clip_ratio/region_mean": 0.0034464194031897932,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9331.0,
"completions/max_terminated_length": 9331.0,
"completions/mean_length": 8468.625,
"completions/mean_terminated_length": 8468.625,
"completions/min_length": 2484.0,
"completions/min_terminated_length": 2484.0,
"entropy": 0.119684575824067,
"epoch": 0.00352,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3990496397018433,
"kl": 0.012226378137711436,
"learning_rate": 7.704505052395651e-06,
"loss": 0.1027,
"num_tokens": 24149881.0,
"reward": -0.1723448634147644,
"reward_std": 0.3936545252799988,
"rewards/rollout_reward_func/mean": -0.1723448634147644,
"rewards/rollout_reward_func/std": 0.48704293370246887,
"sampling/importance_sampling_ratio/max": 2.3026111125946045,
"sampling/importance_sampling_ratio/mean": 1.0012261867523193,
"sampling/importance_sampling_ratio/min": 0.3968614935874939,
"sampling/sampling_logp_difference/max": 0.9241679906845093,
"sampling/sampling_logp_difference/mean": 0.012234903872013092,
"step": 88,
"step_time": 95.13692331799757
},
{
"clip_ratio/high_max": 0.0035114133497700095,
"clip_ratio/high_mean": 0.002106830303091556,
"clip_ratio/low_mean": 0.0025690634502097964,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004675893753301352,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9274.0,
"completions/max_terminated_length": 9274.0,
"completions/mean_length": 7818.875,
"completions/mean_terminated_length": 7818.875,
"completions/min_length": 730.0,
"completions/min_terminated_length": 730.0,
"entropy": 0.13537772255949676,
"epoch": 0.00356,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3247586488723755,
"kl": 0.01799499534536153,
"learning_rate": 7.693230180602701e-06,
"loss": -0.1045,
"num_tokens": 24426080.0,
"reward": -0.4246547818183899,
"reward_std": 0.2930266261100769,
"rewards/rollout_reward_func/mean": -0.4246547818183899,
"rewards/rollout_reward_func/std": 0.40061336755752563,
"sampling/importance_sampling_ratio/max": 2.4912331104278564,
"sampling/importance_sampling_ratio/mean": 0.99860680103302,
"sampling/importance_sampling_ratio/min": 0.4037307798862457,
"sampling/sampling_logp_difference/max": 0.9127777814865112,
"sampling/sampling_logp_difference/mean": 0.014037063345313072,
"step": 89,
"step_time": 94.62805927399677
},
{
"clip_ratio/high_max": 0.007079427479766309,
"clip_ratio/high_mean": 0.0035397137398831546,
"clip_ratio/low_mean": 0.003276572242612019,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006816285924287513,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9564.0,
"completions/max_terminated_length": 9564.0,
"completions/mean_length": 8033.125,
"completions/mean_terminated_length": 8033.125,
"completions/min_length": 743.0,
"completions/min_terminated_length": 743.0,
"entropy": 0.111784094478935,
"epoch": 0.0036,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1404390335083008,
"kl": 0.0197194812935777,
"learning_rate": 7.68175578955777e-06,
"loss": -0.1376,
"num_tokens": 24709065.0,
"reward": -0.4281678795814514,
"reward_std": 0.3626987934112549,
"rewards/rollout_reward_func/mean": -0.4281678795814514,
"rewards/rollout_reward_func/std": 0.40464091300964355,
"sampling/importance_sampling_ratio/max": 2.2442386150360107,
"sampling/importance_sampling_ratio/mean": 0.9990909099578857,
"sampling/importance_sampling_ratio/min": 0.4673902690410614,
"sampling/sampling_logp_difference/max": 0.8083662986755371,
"sampling/sampling_logp_difference/mean": 0.011810792610049248,
"step": 90,
"step_time": 94.34466740900098
},
{
"clip_ratio/high_max": 0.007889641856309026,
"clip_ratio/high_mean": 0.004651058145100251,
"clip_ratio/low_mean": 0.0014329880068544298,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006084046239266172,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9008.0,
"completions/max_terminated_length": 9008.0,
"completions/mean_length": 7546.65625,
"completions/mean_terminated_length": 7546.65625,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"entropy": 0.10288556944578886,
"epoch": 0.00364,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.186630368232727,
"kl": 0.018865237798308954,
"learning_rate": 7.67008272930403e-06,
"loss": -0.2213,
"num_tokens": 24976474.0,
"reward": -0.4113379716873169,
"reward_std": 0.34501367807388306,
"rewards/rollout_reward_func/mean": -0.4113379716873169,
"rewards/rollout_reward_func/std": 0.375704288482666,
"sampling/importance_sampling_ratio/max": 2.0333220958709717,
"sampling/importance_sampling_ratio/mean": 1.000919222831726,
"sampling/importance_sampling_ratio/min": 0.3369899392127991,
"sampling/sampling_logp_difference/max": 1.0877022743225098,
"sampling/sampling_logp_difference/mean": 0.010551965795457363,
"step": 91,
"step_time": 93.05898203200377
},
{
"clip_ratio/high_max": 0.007170227414462715,
"clip_ratio/high_mean": 0.004291531280614436,
"clip_ratio/low_mean": 0.0003511235991027206,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004642654879717156,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9088.0,
"completions/max_terminated_length": 9088.0,
"completions/mean_length": 7282.0625,
"completions/mean_terminated_length": 7282.0625,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"entropy": 0.1037353971041739,
"epoch": 0.00368,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3811126947402954,
"kl": 0.01996720782153716,
"learning_rate": 7.658211864602414e-06,
"loss": -0.1225,
"num_tokens": 25235357.0,
"reward": -0.21767657995224,
"reward_std": 0.556893527507782,
"rewards/rollout_reward_func/mean": -0.21767657995224,
"rewards/rollout_reward_func/std": 0.6004104614257812,
"sampling/importance_sampling_ratio/max": 2.3581435680389404,
"sampling/importance_sampling_ratio/mean": 1.0017621517181396,
"sampling/importance_sampling_ratio/min": 0.07947485893964767,
"sampling/sampling_logp_difference/max": 2.5323145389556885,
"sampling/sampling_logp_difference/mean": 0.012833436019718647,
"step": 92,
"step_time": 91.07534395600123
},
{
"clip_ratio/high_max": 0.0027935606776736677,
"clip_ratio/high_mean": 0.0013967803388368338,
"clip_ratio/low_mean": 0.002134947048034519,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035317273577675223,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9362.0,
"completions/max_terminated_length": 9362.0,
"completions/mean_length": 8238.3125,
"completions/mean_terminated_length": 8238.3125,
"completions/min_length": 614.0,
"completions/min_terminated_length": 614.0,
"entropy": 0.0987412256654352,
"epoch": 0.00372,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2942229509353638,
"kl": 0.014109234602074139,
"learning_rate": 7.64614407486756e-06,
"loss": -0.1174,
"num_tokens": 25524816.0,
"reward": -0.4631078839302063,
"reward_std": 0.36089861392974854,
"rewards/rollout_reward_func/mean": -0.4631078839302063,
"rewards/rollout_reward_func/std": 0.421775758266449,
"sampling/importance_sampling_ratio/max": 2.2098324298858643,
"sampling/importance_sampling_ratio/mean": 1.001816749572754,
"sampling/importance_sampling_ratio/min": 0.5984537601470947,
"sampling/sampling_logp_difference/max": 0.7929167747497559,
"sampling/sampling_logp_difference/mean": 0.01035095565021038,
"step": 93,
"step_time": 95.76501873699817
},
{
"clip_ratio/high_max": 0.004278437350876629,
"clip_ratio/high_mean": 0.0021392186754383147,
"clip_ratio/low_mean": 0.0011272609990555793,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003266479674493894,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9301.0,
"completions/max_terminated_length": 9301.0,
"completions/mean_length": 7563.09375,
"completions/mean_terminated_length": 7563.09375,
"completions/min_length": 277.0,
"completions/min_terminated_length": 277.0,
"entropy": 0.1352407243102789,
"epoch": 0.00376,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2077804803848267,
"kl": 0.033299635659204796,
"learning_rate": 7.633880254102664e-06,
"loss": -0.1037,
"num_tokens": 25792762.0,
"reward": -0.487395703792572,
"reward_std": 0.4409431219100952,
"rewards/rollout_reward_func/mean": -0.487395703792572,
"rewards/rollout_reward_func/std": 0.45345383882522583,
"sampling/importance_sampling_ratio/max": 2.2063913345336914,
"sampling/importance_sampling_ratio/mean": 1.001375436782837,
"sampling/importance_sampling_ratio/min": 0.2881685495376587,
"sampling/sampling_logp_difference/max": 1.2442097663879395,
"sampling/sampling_logp_difference/mean": 0.01331554725766182,
"step": 94,
"step_time": 91.56602428999759
},
{
"clip_ratio/high_max": 0.005700222682207823,
"clip_ratio/high_mean": 0.0032012349111028016,
"clip_ratio/low_mean": 0.0008800287614576519,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004081263672560453,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9186.0,
"completions/max_terminated_length": 9186.0,
"completions/mean_length": 7445.5,
"completions/mean_terminated_length": 7445.5,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"entropy": 0.12523270677775145,
"epoch": 0.0038,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1024125814437866,
"kl": 0.015170263191976119,
"learning_rate": 7.621421310833242e-06,
"loss": -0.2471,
"num_tokens": 26056916.0,
"reward": -0.47263962030410767,
"reward_std": 0.4123695492744446,
"rewards/rollout_reward_func/mean": -0.47263962030410767,
"rewards/rollout_reward_func/std": 0.42915722727775574,
"sampling/importance_sampling_ratio/max": 1.9018123149871826,
"sampling/importance_sampling_ratio/mean": 0.9993163347244263,
"sampling/importance_sampling_ratio/min": 0.37924882769584656,
"sampling/sampling_logp_difference/max": 0.9695627689361572,
"sampling/sampling_logp_difference/mean": 0.013316385447978973,
"step": 95,
"step_time": 91.25094345499565
},
{
"clip_ratio/high_max": 0.00731422781245783,
"clip_ratio/high_mean": 0.004351901414338499,
"clip_ratio/low_mean": 0.0017361111822538078,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006088012654799968,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9454.0,
"completions/max_terminated_length": 9454.0,
"completions/mean_length": 7747.3125,
"completions/mean_terminated_length": 7747.3125,
"completions/min_length": 3419.0,
"completions/min_terminated_length": 3419.0,
"entropy": 0.12114885123446584,
"epoch": 0.00384,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.179085373878479,
"kl": 0.02324743496137671,
"learning_rate": 7.608768168039832e-06,
"loss": -0.1271,
"num_tokens": 26330673.0,
"reward": -0.37598949670791626,
"reward_std": 0.43450477719306946,
"rewards/rollout_reward_func/mean": -0.37598949670791626,
"rewards/rollout_reward_func/std": 0.5974510908126831,
"sampling/importance_sampling_ratio/max": 1.950088381767273,
"sampling/importance_sampling_ratio/mean": 0.9992377161979675,
"sampling/importance_sampling_ratio/min": 0.35387420654296875,
"sampling/sampling_logp_difference/max": 1.038813829421997,
"sampling/sampling_logp_difference/mean": 0.01316380687057972,
"step": 96,
"step_time": 92.53011242599314
},
{
"clip_ratio/high_max": 0.004979305085726082,
"clip_ratio/high_mean": 0.002489652542863041,
"clip_ratio/low_mean": 0.002151657157810405,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004641309700673446,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9315.0,
"completions/max_terminated_length": 9315.0,
"completions/mean_length": 8012.21875,
"completions/mean_terminated_length": 8012.21875,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"entropy": 0.13530251197516918,
"epoch": 0.00388,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1584080457687378,
"kl": 0.02002575868391432,
"learning_rate": 7.5959217630896185e-06,
"loss": -0.1398,
"num_tokens": 26612879.0,
"reward": -0.3609606921672821,
"reward_std": 0.31441646814346313,
"rewards/rollout_reward_func/mean": -0.3609606921672821,
"rewards/rollout_reward_func/std": 0.3982548713684082,
"sampling/importance_sampling_ratio/max": 1.962359070777893,
"sampling/importance_sampling_ratio/mean": 1.0006780624389648,
"sampling/importance_sampling_ratio/min": 0.5123910307884216,
"sampling/sampling_logp_difference/max": 0.674147367477417,
"sampling/sampling_logp_difference/mean": 0.013763591647148132,
"step": 97,
"step_time": 93.49124078800196
},
{
"clip_ratio/high_max": 0.00703950843308121,
"clip_ratio/high_mean": 0.0038748678634874523,
"clip_ratio/low_mean": 0.0023320690961554646,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.006206936959642917,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9290.0,
"completions/max_terminated_length": 9290.0,
"completions/mean_length": 7702.40625,
"completions/mean_terminated_length": 7702.40625,
"completions/min_length": 754.0,
"completions/min_terminated_length": 754.0,
"entropy": 0.13701792433857918,
"epoch": 0.00392,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3185031414031982,
"kl": 0.026673564861994237,
"learning_rate": 7.5828830476669816e-06,
"loss": -0.2084,
"num_tokens": 26885204.0,
"reward": -0.4948327839374542,
"reward_std": 0.31638196110725403,
"rewards/rollout_reward_func/mean": -0.4948327839374542,
"rewards/rollout_reward_func/std": 0.42699867486953735,
"sampling/importance_sampling_ratio/max": 1.5802547931671143,
"sampling/importance_sampling_ratio/mean": 0.9976853132247925,
"sampling/importance_sampling_ratio/min": 0.4126710891723633,
"sampling/sampling_logp_difference/max": 0.8851044178009033,
"sampling/sampling_logp_difference/mean": 0.015071339905261993,
"step": 98,
"step_time": 91.84920660200441
},
{
"clip_ratio/high_max": 0.003422708949074149,
"clip_ratio/high_mean": 0.0017113544745370746,
"clip_ratio/low_mean": 0.0013361009187065065,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003047455393243581,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9112.0,
"completions/max_terminated_length": 9112.0,
"completions/mean_length": 7679.53125,
"completions/mean_terminated_length": 7679.53125,
"completions/min_length": 894.0,
"completions/min_terminated_length": 894.0,
"entropy": 0.10829294635914266,
"epoch": 0.00396,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2493257522583008,
"kl": 0.015406116601297981,
"learning_rate": 7.569652987703011e-06,
"loss": -0.0216,
"num_tokens": 27156852.0,
"reward": -0.3716369867324829,
"reward_std": 0.49375930428504944,
"rewards/rollout_reward_func/mean": -0.3716369867324829,
"rewards/rollout_reward_func/std": 0.5273211598396301,
"sampling/importance_sampling_ratio/max": 1.4128153324127197,
"sampling/importance_sampling_ratio/mean": 0.9987783432006836,
"sampling/importance_sampling_ratio/min": 0.35299214720726013,
"sampling/sampling_logp_difference/max": 1.0413094758987427,
"sampling/sampling_logp_difference/mean": 0.010902078822255135,
"step": 99,
"step_time": 90.7731563160014
},
{
"clip_ratio/high_max": 0.007757734332699329,
"clip_ratio/high_mean": 0.0038788671663496643,
"clip_ratio/low_mean": 0.00035511364694684744,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004233980813296512,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9181.0,
"completions/max_terminated_length": 9181.0,
"completions/mean_length": 8015.375,
"completions/mean_terminated_length": 8015.375,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.1193245961330831,
"epoch": 0.004,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2326804399490356,
"kl": 0.016626078002445865,
"learning_rate": 7.5562325633039275e-06,
"loss": -0.167,
"num_tokens": 27439174.0,
"reward": -0.42884576320648193,
"reward_std": 0.32799845933914185,
"rewards/rollout_reward_func/mean": -0.42884576320648193,
"rewards/rollout_reward_func/std": 0.3549758791923523,
"sampling/importance_sampling_ratio/max": 1.9659780263900757,
"sampling/importance_sampling_ratio/mean": 0.9990392923355103,
"sampling/importance_sampling_ratio/min": 0.2670063078403473,
"sampling/sampling_logp_difference/max": 1.3204829692840576,
"sampling/sampling_logp_difference/mean": 0.013362506404519081,
"step": 100,
"step_time": 94.30247499899633
},
{
"clip_ratio/high_max": 0.01159447367535904,
"clip_ratio/high_mean": 0.00579723683767952,
"clip_ratio/low_mean": 0.0007812500116415322,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006578486849321052,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9356.0,
"completions/max_terminated_length": 9356.0,
"completions/mean_length": 8226.75,
"completions/mean_terminated_length": 8226.75,
"completions/min_length": 3396.0,
"completions/min_terminated_length": 3396.0,
"entropy": 0.12249888107180595,
"epoch": 0.00404,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0833367109298706,
"kl": 0.01735936271143146,
"learning_rate": 7.542622768678494e-06,
"loss": 0.0171,
"num_tokens": 27728261.0,
"reward": -0.3308451175689697,
"reward_std": 0.5605739951133728,
"rewards/rollout_reward_func/mean": -0.3308451175689697,
"rewards/rollout_reward_func/std": 0.5798808932304382,
"sampling/importance_sampling_ratio/max": 1.5370640754699707,
"sampling/importance_sampling_ratio/mean": 1.0012454986572266,
"sampling/importance_sampling_ratio/min": 0.5971825122833252,
"sampling/sampling_logp_difference/max": 0.5155324935913086,
"sampling/sampling_logp_difference/mean": 0.011695077642798424,
"step": 101,
"step_time": 95.23633311199774
},
{
"clip_ratio/high_max": 0.002826430252753198,
"clip_ratio/high_mean": 0.001413215126376599,
"clip_ratio/low_mean": 0.001412654877640307,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002825870004016906,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9459.0,
"completions/max_terminated_length": 9459.0,
"completions/mean_length": 8216.125,
"completions/mean_terminated_length": 8216.125,
"completions/min_length": 1191.0,
"completions/min_terminated_length": 1191.0,
"entropy": 0.12594126863405108,
"epoch": 0.00408,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3509782552719116,
"kl": 0.02238677909190301,
"learning_rate": 7.528824612064348e-06,
"loss": -0.0679,
"num_tokens": 28017012.0,
"reward": -0.3804386854171753,
"reward_std": 0.3933500647544861,
"rewards/rollout_reward_func/mean": -0.3804386854171753,
"rewards/rollout_reward_func/std": 0.4109920263290405,
"sampling/importance_sampling_ratio/max": 2.3366055488586426,
"sampling/importance_sampling_ratio/mean": 1.0009369850158691,
"sampling/importance_sampling_ratio/min": 0.46159544587135315,
"sampling/sampling_logp_difference/max": 0.8486993312835693,
"sampling/sampling_logp_difference/mean": 0.014044486917555332,
"step": 102,
"step_time": 95.00174767700264
},
{
"clip_ratio/high_max": 0.0028175070183351636,
"clip_ratio/high_mean": 0.0024504201719537377,
"clip_ratio/low_mean": 0.0021656582539435476,
"clip_ratio/low_min": 0.0006944444612599909,
"clip_ratio/region_mean": 0.004616078425897285,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9223.0,
"completions/max_terminated_length": 9223.0,
"completions/mean_length": 7621.125,
"completions/mean_terminated_length": 7621.125,
"completions/min_length": 1625.0,
"completions/min_terminated_length": 1625.0,
"entropy": 0.11429717438295484,
"epoch": 0.00412,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.284745216369629,
"kl": 0.04166489985072985,
"learning_rate": 7.5148391156533234e-06,
"loss": -0.0986,
"num_tokens": 28286718.0,
"reward": -0.3517298400402069,
"reward_std": 0.44646015763282776,
"rewards/rollout_reward_func/mean": -0.3517298400402069,
"rewards/rollout_reward_func/std": 0.5219200253486633,
"sampling/importance_sampling_ratio/max": 2.0663206577301025,
"sampling/importance_sampling_ratio/mean": 1.0006217956542969,
"sampling/importance_sampling_ratio/min": 0.3164287209510803,
"sampling/sampling_logp_difference/max": 1.150657296180725,
"sampling/sampling_logp_difference/mean": 0.016996370628476143,
"step": 103,
"step_time": 89.48946891300147
},
{
"clip_ratio/high_max": 0.006022654706612229,
"clip_ratio/high_mean": 0.003717744955793023,
"clip_ratio/low_mean": 0.0033087176270782948,
"clip_ratio/low_min": 0.0007022471982054412,
"clip_ratio/region_mean": 0.007026462582871318,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9336.0,
"completions/max_terminated_length": 9336.0,
"completions/mean_length": 7775.3125,
"completions/mean_terminated_length": 7775.3125,
"completions/min_length": 288.0,
"completions/min_terminated_length": 288.0,
"entropy": 0.11511457245796919,
"epoch": 0.00416,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1231155395507812,
"kl": 0.02951129319262691,
"learning_rate": 7.500667315515709e-06,
"loss": -0.1068,
"num_tokens": 28561357.0,
"reward": -0.39338138699531555,
"reward_std": 0.42901262640953064,
"rewards/rollout_reward_func/mean": -0.39338138699531555,
"rewards/rollout_reward_func/std": 0.48031023144721985,
"sampling/importance_sampling_ratio/max": 2.4638149738311768,
"sampling/importance_sampling_ratio/mean": 0.9997503757476807,
"sampling/importance_sampling_ratio/min": 0.5230858325958252,
"sampling/sampling_logp_difference/max": 0.9017109870910645,
"sampling/sampling_logp_difference/mean": 0.012721032835543156,
"step": 104,
"step_time": 93.24399638200339
},
{
"clip_ratio/high_max": 0.0027933833189308643,
"clip_ratio/high_mean": 0.0013966916594654322,
"clip_ratio/low_mean": 0.0025065841618925333,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003903275792254135,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9330.0,
"completions/max_terminated_length": 9330.0,
"completions/mean_length": 8119.09375,
"completions/mean_terminated_length": 8119.09375,
"completions/min_length": 3061.0,
"completions/min_terminated_length": 3061.0,
"entropy": 0.11791149899363518,
"epoch": 0.0042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2128372192382812,
"kl": 0.027731850394047797,
"learning_rate": 7.486310261523511e-06,
"loss": 0.0399,
"num_tokens": 28847024.0,
"reward": -0.3779003918170929,
"reward_std": 0.5017719268798828,
"rewards/rollout_reward_func/mean": -0.3779003918170929,
"rewards/rollout_reward_func/std": 0.5323984622955322,
"sampling/importance_sampling_ratio/max": 1.925367832183838,
"sampling/importance_sampling_ratio/mean": 1.0002028942108154,
"sampling/importance_sampling_ratio/min": 0.3569715917110443,
"sampling/sampling_logp_difference/max": 1.0300991535186768,
"sampling/sampling_logp_difference/mean": 0.01532074436545372,
"step": 105,
"step_time": 93.38768337700094
},
{
"clip_ratio/high_max": 0.005748430092353374,
"clip_ratio/high_mean": 0.002874215046176687,
"clip_ratio/low_mean": 0.002773832093225792,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005648046964779496,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9337.0,
"completions/max_terminated_length": 9337.0,
"completions/mean_length": 7877.40625,
"completions/mean_terminated_length": 7877.40625,
"completions/min_length": 1862.0,
"completions/min_terminated_length": 1862.0,
"entropy": 0.10635959357023239,
"epoch": 0.00424,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1976176500320435,
"kl": 0.030401008145418018,
"learning_rate": 7.471769017272662e-06,
"loss": 0.0169,
"num_tokens": 29124980.0,
"reward": -0.40934568643569946,
"reward_std": 0.4898146986961365,
"rewards/rollout_reward_func/mean": -0.40934568643569946,
"rewards/rollout_reward_func/std": 0.5182675719261169,
"sampling/importance_sampling_ratio/max": 1.8533921241760254,
"sampling/importance_sampling_ratio/mean": 0.9990241527557373,
"sampling/importance_sampling_ratio/min": 0.43001580238342285,
"sampling/sampling_logp_difference/max": 0.8439333438873291,
"sampling/sampling_logp_difference/mean": 0.01345333456993103,
"step": 106,
"step_time": 92.02846917199531
}
],
"logging_steps": 1.0,
"max_steps": 400,
"num_input_tokens_seen": 29124980,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}