environment-zay / trainer_state.json
Gege24's picture
Upload task output 1
4de4ab9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9609.0,
"completions/max_terminated_length": 9609.0,
"completions/mean_length": 7527.34375,
"completions/mean_terminated_length": 7527.34375,
"completions/min_length": 2464.0,
"completions/min_terminated_length": 2464.0,
"entropy": 0.31753343041054904,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4123356342315674,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.305,
"num_tokens": 263947.0,
"reward": 0.031437501311302185,
"reward_std": 0.22218962013721466,
"rewards/alfworld_rollout_reward_func/mean": 0.031437501311302185,
"rewards/alfworld_rollout_reward_func/std": 0.3469863533973694,
"sampling/importance_sampling_ratio/max": 1.4741449356079102,
"sampling/importance_sampling_ratio/mean": 0.29983627796173096,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5447624921798706,
"sampling/sampling_logp_difference/mean": 0.018553823232650757,
"step": 1,
"step_time": 335.8077390380013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9708.0,
"completions/max_terminated_length": 9708.0,
"completions/mean_length": 7452.15625,
"completions/mean_terminated_length": 7452.15625,
"completions/min_length": 1958.0,
"completions/min_terminated_length": 1958.0,
"entropy": 0.26579508977010846,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6812145113945007,
"kl": 0.0,
"learning_rate": 6.25e-06,
"loss": 0.3284,
"num_tokens": 528400.0,
"reward": 0.23356249928474426,
"reward_std": 0.47549036145210266,
"rewards/alfworld_rollout_reward_func/mean": 0.23356249928474426,
"rewards/alfworld_rollout_reward_func/std": 0.5023998618125916,
"sampling/importance_sampling_ratio/max": 2.4382495880126953,
"sampling/importance_sampling_ratio/mean": 0.5181977152824402,
"sampling/importance_sampling_ratio/min": 0.00792621448636055,
"sampling/sampling_logp_difference/max": 2.3046507835388184,
"sampling/sampling_logp_difference/mean": 0.017045794054865837,
"step": 2,
"step_time": 322.64169866899965
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7940.0,
"completions/max_terminated_length": 7940.0,
"completions/mean_length": 5720.75,
"completions/mean_terminated_length": 5720.75,
"completions/min_length": 1041.0,
"completions/min_terminated_length": 1041.0,
"entropy": 0.39676310354843736,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5915209054946899,
"kl": 0.0016266869151877472,
"learning_rate": 1.25e-05,
"loss": -0.5029,
"num_tokens": 730920.0,
"reward": 0.14431250095367432,
"reward_std": 0.4770090878009796,
"rewards/alfworld_rollout_reward_func/mean": 0.14431250095367432,
"rewards/alfworld_rollout_reward_func/std": 0.4686758518218994,
"sampling/importance_sampling_ratio/max": 2.4560086727142334,
"sampling/importance_sampling_ratio/mean": 0.37546274065971375,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5112149715423584,
"sampling/sampling_logp_difference/mean": 0.021917540580034256,
"step": 3,
"step_time": 280.4968342440011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10155.0,
"completions/max_terminated_length": 10155.0,
"completions/mean_length": 8253.03125,
"completions/mean_terminated_length": 8253.03125,
"completions/min_length": 4463.0,
"completions/min_terminated_length": 4463.0,
"entropy": 0.3865774553269148,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6168197989463806,
"kl": 0.0021592163539025933,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.5303,
"num_tokens": 1018057.0,
"reward": -0.05943749472498894,
"reward_std": 0.17190764844417572,
"rewards/alfworld_rollout_reward_func/mean": -0.05943749472498894,
"rewards/alfworld_rollout_reward_func/std": 0.2674463391304016,
"sampling/importance_sampling_ratio/max": 2.158449172973633,
"sampling/importance_sampling_ratio/mean": 0.3775303363800049,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5581846237182617,
"sampling/sampling_logp_difference/mean": 0.02310691960155964,
"step": 4,
"step_time": 360.68988642800105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5661.0,
"completions/max_terminated_length": 5661.0,
"completions/mean_length": 4748.75,
"completions/mean_terminated_length": 4748.75,
"completions/min_length": 3574.0,
"completions/min_terminated_length": 3574.0,
"entropy": 0.3531342991627753,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4972003400325775,
"kl": 0.0017817301231843885,
"learning_rate": 2.5e-05,
"loss": -0.1008,
"num_tokens": 1186209.0,
"reward": -0.03831250220537186,
"reward_std": 0.16514073312282562,
"rewards/alfworld_rollout_reward_func/mean": -0.03831250220537186,
"rewards/alfworld_rollout_reward_func/std": 0.263570100069046,
"sampling/importance_sampling_ratio/max": 1.5651471614837646,
"sampling/importance_sampling_ratio/mean": 0.23428680002689362,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 10.689125061035156,
"sampling/sampling_logp_difference/mean": 0.02276257984340191,
"step": 5,
"step_time": 235.26227801500136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11864.0,
"completions/max_terminated_length": 11864.0,
"completions/mean_length": 9544.53125,
"completions/mean_terminated_length": 9544.53125,
"completions/min_length": 2217.0,
"completions/min_terminated_length": 2217.0,
"entropy": 0.3936329837888479,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4895797669887543,
"kl": 0.0018409217773296405,
"learning_rate": 2.4995787066293908e-05,
"loss": 0.0504,
"num_tokens": 1519058.0,
"reward": 0.08924999833106995,
"reward_std": 0.3558388352394104,
"rewards/alfworld_rollout_reward_func/mean": 0.08924999833106995,
"rewards/alfworld_rollout_reward_func/std": 0.4207598865032196,
"sampling/importance_sampling_ratio/max": 2.6714699268341064,
"sampling/importance_sampling_ratio/mean": 0.5432157516479492,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0583038330078125,
"sampling/sampling_logp_difference/mean": 0.022378364577889442,
"step": 6,
"step_time": 440.67857990999437
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9557.0,
"completions/max_terminated_length": 9557.0,
"completions/mean_length": 8216.96875,
"completions/mean_terminated_length": 8216.96875,
"completions/min_length": 7643.0,
"completions/min_terminated_length": 7643.0,
"entropy": 0.3345654481090605,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36864349246025085,
"kl": 0.001815770137909567,
"learning_rate": 2.498315110498529e-05,
"loss": -0.6279,
"num_tokens": 1805009.0,
"reward": -0.07706249505281448,
"reward_std": 0.10743739455938339,
"rewards/alfworld_rollout_reward_func/mean": -0.07706249505281448,
"rewards/alfworld_rollout_reward_func/std": 0.18897344172000885,
"sampling/importance_sampling_ratio/max": 2.8883540630340576,
"sampling/importance_sampling_ratio/mean": 0.3507537841796875,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.560537338256836,
"sampling/sampling_logp_difference/mean": 0.019598914310336113,
"step": 7,
"step_time": 352.1544552209989
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5112.0,
"completions/max_terminated_length": 5112.0,
"completions/mean_length": 4309.6875,
"completions/mean_terminated_length": 4309.6875,
"completions/min_length": 1531.0,
"completions/min_terminated_length": 1531.0,
"entropy": 0.3713793349452317,
"epoch": 0.064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6283824443817139,
"kl": 0.0014209353576006833,
"learning_rate": 2.496210063358892e-05,
"loss": 0.6368,
"num_tokens": 1958983.0,
"reward": 0.05275000259280205,
"reward_std": 0.3398139476776123,
"rewards/alfworld_rollout_reward_func/mean": 0.05275000259280205,
"rewards/alfworld_rollout_reward_func/std": 0.38452139496803284,
"sampling/importance_sampling_ratio/max": 2.513087272644043,
"sampling/importance_sampling_ratio/mean": 0.4636477828025818,
"sampling/importance_sampling_ratio/min": 0.007982950657606125,
"sampling/sampling_logp_difference/max": 1.4883854389190674,
"sampling/sampling_logp_difference/mean": 0.019389096647500992,
"step": 8,
"step_time": 223.46994931800418
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9707.0,
"completions/max_terminated_length": 9707.0,
"completions/mean_length": 6891.5625,
"completions/mean_terminated_length": 6891.5625,
"completions/min_length": 1846.0,
"completions/min_terminated_length": 1846.0,
"entropy": 0.3352759047411382,
"epoch": 0.072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5288219451904297,
"kl": 0.0022124643182905857,
"learning_rate": 2.4932649841583266e-05,
"loss": -0.19,
"num_tokens": 2205465.0,
"reward": 0.26093751192092896,
"reward_std": 0.539341151714325,
"rewards/alfworld_rollout_reward_func/mean": 0.26093751192092896,
"rewards/alfworld_rollout_reward_func/std": 0.5594924092292786,
"sampling/importance_sampling_ratio/max": 2.794914960861206,
"sampling/importance_sampling_ratio/mean": 0.49680012464523315,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 46.763362884521484,
"sampling/sampling_logp_difference/mean": 0.023306839168071747,
"step": 9,
"step_time": 329.5470013760005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10376.0,
"completions/max_terminated_length": 10376.0,
"completions/mean_length": 8182.25,
"completions/mean_terminated_length": 8182.25,
"completions/min_length": 5768.0,
"completions/min_terminated_length": 5768.0,
"entropy": 0.3423183555714786,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 500.11370849609375,
"kl": 1.0201642424835882,
"learning_rate": 2.489481858084583e-05,
"loss": 0.1234,
"num_tokens": 2489921.0,
"reward": -0.07881250232458115,
"reward_std": 0.10148754715919495,
"rewards/alfworld_rollout_reward_func/mean": -0.07881250232458115,
"rewards/alfworld_rollout_reward_func/std": 0.18484507501125336,
"sampling/importance_sampling_ratio/max": 2.4246673583984375,
"sampling/importance_sampling_ratio/mean": 0.40079888701438904,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 49.023101806640625,
"sampling/sampling_logp_difference/mean": 0.03266483545303345,
"step": 10,
"step_time": 350.9630481339973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5637.0,
"completions/max_terminated_length": 5637.0,
"completions/mean_length": 5241.5625,
"completions/mean_terminated_length": 5241.5625,
"completions/min_length": 4864.0,
"completions/min_terminated_length": 4864.0,
"entropy": 0.36981488950550556,
"epoch": 0.088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5718499422073364,
"kl": 0.0016583001779508777,
"learning_rate": 2.4848632352271566e-05,
"loss": -1.1019,
"num_tokens": 2674899.0,
"reward": -0.11343749612569809,
"reward_std": 0.04205840826034546,
"rewards/alfworld_rollout_reward_func/mean": -0.11343749612569809,
"rewards/alfworld_rollout_reward_func/std": 0.04254859685897827,
"sampling/importance_sampling_ratio/max": 2.287071704864502,
"sampling/importance_sampling_ratio/mean": 0.5078843832015991,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.9416933059692383,
"sampling/sampling_logp_difference/mean": 0.019637946039438248,
"step": 11,
"step_time": 247.4129799650018
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9350.0,
"completions/max_terminated_length": 9350.0,
"completions/mean_length": 8717.375,
"completions/mean_terminated_length": 8717.375,
"completions/min_length": 8325.0,
"completions/min_terminated_length": 8325.0,
"entropy": 0.3274609283544123,
"epoch": 0.096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6321223974227905,
"kl": 0.0023520230570284184,
"learning_rate": 2.4794122288583533e-05,
"loss": 0.8207,
"num_tokens": 2978463.0,
"reward": -0.10750000178813934,
"reward_std": 0.034280747175216675,
"rewards/alfworld_rollout_reward_func/mean": -0.10750000178813934,
"rewards/alfworld_rollout_reward_func/std": 0.03943144157528877,
"sampling/importance_sampling_ratio/max": 2.627840042114258,
"sampling/importance_sampling_ratio/mean": 0.43182629346847534,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2869181632995605,
"sampling/sampling_logp_difference/mean": 0.019458087161183357,
"step": 12,
"step_time": 362.32430394600306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11219.0,
"completions/max_terminated_length": 11219.0,
"completions/mean_length": 8425.9375,
"completions/mean_terminated_length": 8425.9375,
"completions/min_length": 6101.0,
"completions/min_terminated_length": 6101.0,
"entropy": 0.27532787807285786,
"epoch": 0.104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5922659635543823,
"kl": 0.0038648816698696464,
"learning_rate": 2.4731325133347272e-05,
"loss": -0.0541,
"num_tokens": 3272573.0,
"reward": 0.062562495470047,
"reward_std": 0.22287124395370483,
"rewards/alfworld_rollout_reward_func/mean": 0.062562495470047,
"rewards/alfworld_rollout_reward_func/std": 0.3846460282802582,
"sampling/importance_sampling_ratio/max": 2.459491729736328,
"sampling/importance_sampling_ratio/mean": 0.7038756608963013,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.8825843334198,
"sampling/sampling_logp_difference/mean": 0.017496587708592415,
"step": 13,
"step_time": 377.7220775159976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5013.0,
"completions/max_terminated_length": 5013.0,
"completions/mean_length": 3482.0625,
"completions/mean_terminated_length": 3482.0625,
"completions/min_length": 524.0,
"completions/min_terminated_length": 524.0,
"entropy": 0.35823827097192407,
"epoch": 0.112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7376836538314819,
"kl": 0.003011322856764309,
"learning_rate": 2.466028321620309e-05,
"loss": 0.6336,
"num_tokens": 3399487.0,
"reward": 0.39381250739097595,
"reward_std": 0.507357120513916,
"rewards/alfworld_rollout_reward_func/mean": 0.39381250739097595,
"rewards/alfworld_rollout_reward_func/std": 0.5500279068946838,
"sampling/importance_sampling_ratio/max": 2.5899088382720947,
"sampling/importance_sampling_ratio/mean": 0.6255139708518982,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7760124206542969,
"sampling/sampling_logp_difference/mean": 0.021579492837190628,
"step": 14,
"step_time": 182.0329440820051
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9315.0,
"completions/max_terminated_length": 9315.0,
"completions/mean_length": 8675.8125,
"completions/mean_terminated_length": 8675.8125,
"completions/min_length": 7983.0,
"completions/min_terminated_length": 7983.0,
"entropy": 0.29767332202754915,
"epoch": 0.12,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6271937489509583,
"kl": 0.002263592203235021,
"learning_rate": 2.4581044424332964e-05,
"loss": 0.0795,
"num_tokens": 3701529.0,
"reward": -0.08531250059604645,
"reward_std": 0.024405591189861298,
"rewards/alfworld_rollout_reward_func/mean": -0.08531250059604645,
"rewards/alfworld_rollout_reward_func/std": 0.024884814396500587,
"sampling/importance_sampling_ratio/max": 2.4757065773010254,
"sampling/importance_sampling_ratio/mean": 0.5632457137107849,
"sampling/importance_sampling_ratio/min": 0.021398449316620827,
"sampling/sampling_logp_difference/max": 1.3662091493606567,
"sampling/sampling_logp_difference/mean": 0.018054665997624397,
"step": 15,
"step_time": 357.9417388269985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5504.0,
"completions/max_terminated_length": 5504.0,
"completions/mean_length": 2116.59375,
"completions/mean_terminated_length": 2116.59375,
"completions/min_length": 705.0,
"completions/min_terminated_length": 705.0,
"entropy": 0.3064890103414655,
"epoch": 0.128,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3754582107067108,
"kl": 0.0037364878353400854,
"learning_rate": 2.449366217018122e-05,
"loss": 0.067,
"num_tokens": 3784812.0,
"reward": 0.7191874980926514,
"reward_std": 0.4386064410209656,
"rewards/alfworld_rollout_reward_func/mean": 0.7191874980926514,
"rewards/alfworld_rollout_reward_func/std": 0.4786539673805237,
"sampling/importance_sampling_ratio/max": 2.0642874240875244,
"sampling/importance_sampling_ratio/mean": 0.6617263555526733,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3662114143371582,
"sampling/sampling_logp_difference/mean": 0.021964222192764282,
"step": 16,
"step_time": 139.03199604900237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5829.0,
"completions/max_terminated_length": 5829.0,
"completions/mean_length": 4871.875,
"completions/mean_terminated_length": 4871.875,
"completions/min_length": 1450.0,
"completions/min_terminated_length": 1450.0,
"entropy": 0.3853617487475276,
"epoch": 0.136,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6372714042663574,
"kl": 0.002946533808426466,
"learning_rate": 2.439819535545087e-05,
"loss": 0.6343,
"num_tokens": 3958504.0,
"reward": -0.002124996855854988,
"reward_std": 0.22313132882118225,
"rewards/alfworld_rollout_reward_func/mean": -0.002124996855854988,
"rewards/alfworld_rollout_reward_func/std": 0.3138851225376129,
"sampling/importance_sampling_ratio/max": 2.760993719100952,
"sampling/importance_sampling_ratio/mean": 0.6022263169288635,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7812860012054443,
"sampling/sampling_logp_difference/mean": 0.02008485235273838,
"step": 17,
"step_time": 219.52375941698938
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12497.0,
"completions/max_terminated_length": 12497.0,
"completions/mean_length": 10824.96875,
"completions/mean_terminated_length": 10824.96875,
"completions/min_length": 2927.0,
"completions/min_terminated_length": 2927.0,
"entropy": 0.3916443451307714,
"epoch": 0.144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5299886465072632,
"kl": 0.004217182104184758,
"learning_rate": 2.4294708331399775e-05,
"loss": -0.6161,
"num_tokens": 4332871.0,
"reward": -0.01993749663233757,
"reward_std": 0.21947234869003296,
"rewards/alfworld_rollout_reward_func/mean": -0.01993749663233757,
"rewards/alfworld_rollout_reward_func/std": 0.30789878964424133,
"sampling/importance_sampling_ratio/max": 2.8617172241210938,
"sampling/importance_sampling_ratio/mean": 0.43398600816726685,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.6485512256622314,
"sampling/sampling_logp_difference/mean": 0.023952314630150795,
"step": 18,
"step_time": 453.0025306710031
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6249.0,
"completions/max_terminated_length": 6249.0,
"completions/mean_length": 5000.625,
"completions/mean_terminated_length": 5000.625,
"completions/min_length": 3696.0,
"completions/min_terminated_length": 3696.0,
"entropy": 0.43349673599004745,
"epoch": 0.152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5063394904136658,
"kl": 0.003751923381059896,
"learning_rate": 2.4183270855463413e-05,
"loss": -0.0025,
"num_tokens": 4508859.0,
"reward": -0.05518750101327896,
"reward_std": 0.09173674881458282,
"rewards/alfworld_rollout_reward_func/mean": -0.05518750101327896,
"rewards/alfworld_rollout_reward_func/std": 0.18693235516548157,
"sampling/importance_sampling_ratio/max": 2.7608871459960938,
"sampling/importance_sampling_ratio/mean": 0.3959204852581024,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.0717275142669678,
"sampling/sampling_logp_difference/mean": 0.02457268536090851,
"step": 19,
"step_time": 232.94531282299977
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11351.0,
"completions/max_terminated_length": 11351.0,
"completions/mean_length": 10259.65625,
"completions/mean_terminated_length": 10259.65625,
"completions/min_length": 6872.0,
"completions/min_terminated_length": 6872.0,
"entropy": 0.3433692827820778,
"epoch": 0.16,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2678588032722473,
"kl": 0.00476062010784517,
"learning_rate": 2.406395804423355e-05,
"loss": -0.1127,
"num_tokens": 4865168.0,
"reward": -0.010062501765787601,
"reward_std": 0.20959903299808502,
"rewards/alfworld_rollout_reward_func/mean": -0.010062501765787601,
"rewards/alfworld_rollout_reward_func/std": 0.29610002040863037,
"sampling/importance_sampling_ratio/max": 1.9335780143737793,
"sampling/importance_sampling_ratio/mean": 0.2613844871520996,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.60330331325531,
"sampling/sampling_logp_difference/mean": 0.02182953618466854,
"step": 20,
"step_time": 411.72310698399815
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5627.0,
"completions/max_terminated_length": 5627.0,
"completions/mean_length": 3213.1875,
"completions/mean_terminated_length": 3213.1875,
"completions/min_length": 616.0,
"completions/min_terminated_length": 616.0,
"entropy": 0.3132283384911716,
"epoch": 0.168,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28126800060272217,
"kl": 0.006387614266714081,
"learning_rate": 2.3936850322824417e-05,
"loss": 0.0117,
"num_tokens": 4985238.0,
"reward": 0.6055624485015869,
"reward_std": 0.48004958033561707,
"rewards/alfworld_rollout_reward_func/mean": 0.6055624485015869,
"rewards/alfworld_rollout_reward_func/std": 0.5068143606185913,
"sampling/importance_sampling_ratio/max": 2.4571378231048584,
"sampling/importance_sampling_ratio/mean": 0.5615379810333252,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6348450183868408,
"sampling/sampling_logp_difference/mean": 0.020369766280055046,
"step": 21,
"step_time": 166.97806567500083
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8290.0,
"completions/max_terminated_length": 8290.0,
"completions/mean_length": 6657.03125,
"completions/mean_terminated_length": 6657.03125,
"completions/min_length": 1775.0,
"completions/min_terminated_length": 1775.0,
"entropy": 0.32329121232032776,
"epoch": 0.176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.294430673122406,
"kl": 0.0053747415950056165,
"learning_rate": 2.380203337066063e-05,
"loss": 0.0999,
"num_tokens": 5218679.0,
"reward": 0.0065000057220458984,
"reward_std": 0.23578479886054993,
"rewards/alfworld_rollout_reward_func/mean": 0.0065000057220458984,
"rewards/alfworld_rollout_reward_func/std": 0.3685416877269745,
"sampling/importance_sampling_ratio/max": 2.0549917221069336,
"sampling/importance_sampling_ratio/mean": 0.4458431601524353,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.611385464668274,
"sampling/sampling_logp_difference/mean": 0.019326191395521164,
"step": 22,
"step_time": 299.1344787260059
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5289.0,
"completions/max_terminated_length": 5289.0,
"completions/mean_length": 4233.375,
"completions/mean_terminated_length": 4233.375,
"completions/min_length": 535.0,
"completions/min_terminated_length": 535.0,
"entropy": 0.38607919216156006,
"epoch": 0.184,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5484526753425598,
"kl": 0.004964196581568103,
"learning_rate": 2.36595980637233e-05,
"loss": 0.1743,
"num_tokens": 5371395.0,
"reward": 0.25599998235702515,
"reward_std": 0.5028968453407288,
"rewards/alfworld_rollout_reward_func/mean": 0.25599998235702515,
"rewards/alfworld_rollout_reward_func/std": 0.5062468647956848,
"sampling/importance_sampling_ratio/max": 1.7829545736312866,
"sampling/importance_sampling_ratio/mean": 0.5887770652770996,
"sampling/importance_sampling_ratio/min": 0.008336997590959072,
"sampling/sampling_logp_difference/max": 2.665585517883301,
"sampling/sampling_logp_difference/mean": 0.02209375984966755,
"step": 23,
"step_time": 205.04334290899897
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 5275.0,
"completions/max_terminated_length": 4975.0,
"completions/mean_length": 3625.5,
"completions/mean_terminated_length": 3572.290283203125,
"completions/min_length": 503.0,
"completions/min_terminated_length": 503.0,
"entropy": 0.3871590462513268,
"epoch": 0.192,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6740729212760925,
"kl": 0.005397088676545536,
"learning_rate": 2.3509640413293303e-05,
"loss": 1.3846,
"num_tokens": 5503731.0,
"reward": 0.45381247997283936,
"reward_std": 0.5777146220207214,
"rewards/alfworld_rollout_reward_func/mean": 0.45381247997283936,
"rewards/alfworld_rollout_reward_func/std": 0.538960337638855,
"sampling/importance_sampling_ratio/max": 2.8123936653137207,
"sampling/importance_sampling_ratio/mean": 0.6568740606307983,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7232227325439453,
"sampling/sampling_logp_difference/mean": 0.019891591742634773,
"step": 24,
"step_time": 187.47785765799927
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8448.0,
"completions/max_terminated_length": 8448.0,
"completions/mean_length": 7377.09375,
"completions/mean_terminated_length": 7377.09375,
"completions/min_length": 2312.0,
"completions/min_terminated_length": 2312.0,
"entropy": 0.35573973087593913,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6313058733940125,
"kl": 0.006160886805446353,
"learning_rate": 2.335226150123305e-05,
"loss": 0.5233,
"num_tokens": 5762006.0,
"reward": -0.02968749776482582,
"reward_std": 0.1530226618051529,
"rewards/alfworld_rollout_reward_func/mean": -0.02968749776482582,
"rewards/alfworld_rollout_reward_func/std": 0.25761842727661133,
"sampling/importance_sampling_ratio/max": 2.5971615314483643,
"sampling/importance_sampling_ratio/mean": 0.6694207191467285,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.766273856163025,
"sampling/sampling_logp_difference/mean": 0.022114556282758713,
"step": 25,
"step_time": 308.79034867000337
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10749.0,
"completions/max_terminated_length": 10749.0,
"completions/mean_length": 9741.78125,
"completions/mean_terminated_length": 9741.78125,
"completions/min_length": 7678.0,
"completions/min_terminated_length": 7678.0,
"entropy": 0.3445580299012363,
"epoch": 0.208,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6585941314697266,
"kl": 0.006797554538934492,
"learning_rate": 2.3187567411850253e-05,
"loss": -0.128,
"num_tokens": 6100879.0,
"reward": -0.049687497317790985,
"reward_std": 0.15042373538017273,
"rewards/alfworld_rollout_reward_func/mean": -0.049687497317790985,
"rewards/alfworld_rollout_reward_func/std": 0.24326865375041962,
"sampling/importance_sampling_ratio/max": 2.6634747982025146,
"sampling/importance_sampling_ratio/mean": 0.7100951075553894,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.047456741333008,
"sampling/sampling_logp_difference/mean": 0.020113738253712654,
"step": 26,
"step_time": 400.11353040200447
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6953.0,
"completions/max_terminated_length": 6953.0,
"completions/mean_length": 6171.40625,
"completions/mean_terminated_length": 6171.40625,
"completions/min_length": 3641.0,
"completions/min_terminated_length": 3641.0,
"entropy": 0.31149382051080465,
"epoch": 0.216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6737591028213501,
"kl": 0.005847332264238503,
"learning_rate": 2.3015669160389767e-05,
"loss": -0.2064,
"num_tokens": 6319100.0,
"reward": 0.09456250071525574,
"reward_std": 0.29294320940971375,
"rewards/alfworld_rollout_reward_func/mean": 0.09456250071525574,
"rewards/alfworld_rollout_reward_func/std": 0.41390296816825867,
"sampling/importance_sampling_ratio/max": 2.599041223526001,
"sampling/importance_sampling_ratio/mean": 0.5393513441085815,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.398754596710205,
"sampling/sampling_logp_difference/mean": 0.020149247720837593,
"step": 27,
"step_time": 238.93764413800636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4903.0,
"completions/max_terminated_length": 4903.0,
"completions/mean_length": 2781.09375,
"completions/mean_terminated_length": 2781.09375,
"completions/min_length": 481.0,
"completions/min_terminated_length": 481.0,
"entropy": 0.3561269377823919,
"epoch": 0.224,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3367806077003479,
"kl": 0.006373407544742804,
"learning_rate": 2.283668261820161e-05,
"loss": -0.4613,
"num_tokens": 6423071.0,
"reward": 0.6257500052452087,
"reward_std": 0.43230414390563965,
"rewards/alfworld_rollout_reward_func/mean": 0.6257500052452087,
"rewards/alfworld_rollout_reward_func/std": 0.5019095540046692,
"sampling/importance_sampling_ratio/max": 2.960655927658081,
"sampling/importance_sampling_ratio/mean": 0.511298656463623,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.4176197052001953,
"sampling/sampling_logp_difference/mean": 0.020958153530955315,
"step": 28,
"step_time": 165.59535311200307
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9912.0,
"completions/max_terminated_length": 9912.0,
"completions/mean_length": 8714.34375,
"completions/mean_terminated_length": 8714.34375,
"completions/min_length": 3333.0,
"completions/min_terminated_length": 3333.0,
"entropy": 0.3266249899752438,
"epoch": 0.232,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4753160774707794,
"kl": 0.004642268795578275,
"learning_rate": 2.2650728434635627e-05,
"loss": -0.2244,
"num_tokens": 6727242.0,
"reward": -0.024625001475214958,
"reward_std": 0.15632086992263794,
"rewards/alfworld_rollout_reward_func/mean": -0.024625001475214958,
"rewards/alfworld_rollout_reward_func/std": 0.2652129530906677,
"sampling/importance_sampling_ratio/max": 2.4731619358062744,
"sampling/importance_sampling_ratio/mean": 0.6834967732429504,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7130157947540283,
"sampling/sampling_logp_difference/mean": 0.018951794132590294,
"step": 29,
"step_time": 380.26207640899884
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9703.0,
"completions/max_terminated_length": 9703.0,
"completions/mean_length": 5901.9375,
"completions/mean_terminated_length": 5901.9375,
"completions/min_length": 1826.0,
"completions/min_terminated_length": 1826.0,
"entropy": 0.3159148655831814,
"epoch": 0.24,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30811336636543274,
"kl": 0.01128493210853776,
"learning_rate": 2.245793195571545e-05,
"loss": 0.2382,
"num_tokens": 6941480.0,
"reward": 0.5379999876022339,
"reward_std": 0.529421329498291,
"rewards/alfworld_rollout_reward_func/mean": 0.5379999876022339,
"rewards/alfworld_rollout_reward_func/std": 0.5327406525611877,
"sampling/importance_sampling_ratio/max": 2.1434435844421387,
"sampling/importance_sampling_ratio/mean": 0.42453715205192566,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 55.80344009399414,
"sampling/sampling_logp_difference/mean": 0.030401449650526047,
"step": 30,
"step_time": 299.59452940400297
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8485.0,
"completions/max_terminated_length": 8485.0,
"completions/mean_length": 6588.875,
"completions/mean_terminated_length": 6588.875,
"completions/min_length": 1445.0,
"completions/min_terminated_length": 1445.0,
"entropy": 0.24451400735415518,
"epoch": 0.248,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3296390771865845,
"kl": 0.005532389182917541,
"learning_rate": 2.22584231396466e-05,
"loss": -0.1623,
"num_tokens": 7173284.0,
"reward": 0.10481249541044235,
"reward_std": 0.294689416885376,
"rewards/alfworld_rollout_reward_func/mean": 0.10481249541044235,
"rewards/alfworld_rollout_reward_func/std": 0.41431817412376404,
"sampling/importance_sampling_ratio/max": 2.418684959411621,
"sampling/importance_sampling_ratio/mean": 0.5402201414108276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.55106782913208,
"sampling/sampling_logp_difference/mean": 0.014965346083045006,
"step": 31,
"step_time": 294.43344296000396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13133.0,
"completions/max_terminated_length": 13133.0,
"completions/mean_length": 7847.59375,
"completions/mean_terminated_length": 7847.59375,
"completions/min_length": 2243.0,
"completions/min_terminated_length": 2243.0,
"entropy": 0.2889693870674819,
"epoch": 0.256,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5536743402481079,
"kl": 0.007658715800062055,
"learning_rate": 2.2052336469215616e-05,
"loss": 0.6745,
"num_tokens": 7456151.0,
"reward": 0.47606247663497925,
"reward_std": 0.5644147396087646,
"rewards/alfworld_rollout_reward_func/mean": 0.47606247663497925,
"rewards/alfworld_rollout_reward_func/std": 0.5439455509185791,
"sampling/importance_sampling_ratio/max": 2.5001707077026367,
"sampling/importance_sampling_ratio/mean": 0.7739982008934021,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.0981035232543945,
"sampling/sampling_logp_difference/mean": 0.020820828154683113,
"step": 32,
"step_time": 411.82160600200405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5437.0,
"completions/max_terminated_length": 5437.0,
"completions/mean_length": 4673.25,
"completions/mean_terminated_length": 4673.25,
"completions/min_length": 2119.0,
"completions/min_terminated_length": 2119.0,
"entropy": 0.2834106246009469,
"epoch": 0.264,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4467843174934387,
"kl": 0.010782435245346278,
"learning_rate": 2.183981086113933e-05,
"loss": 0.5872,
"num_tokens": 7623039.0,
"reward": 0.0091249980032444,
"reward_std": 0.2119598239660263,
"rewards/alfworld_rollout_reward_func/mean": 0.0091249980032444,
"rewards/alfworld_rollout_reward_func/std": 0.31305113434791565,
"sampling/importance_sampling_ratio/max": 2.066004514694214,
"sampling/importance_sampling_ratio/mean": 0.5565071105957031,
"sampling/importance_sampling_ratio/min": 3.582804466278037e-30,
"sampling/sampling_logp_difference/max": 58.839195251464844,
"sampling/sampling_logp_difference/mean": 0.025124380365014076,
"step": 33,
"step_time": 180.57811164100167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9775.0,
"completions/max_terminated_length": 9775.0,
"completions/mean_length": 7425.125,
"completions/mean_terminated_length": 7425.125,
"completions/min_length": 2879.0,
"completions/min_terminated_length": 2879.0,
"entropy": 0.34114605700597167,
"epoch": 0.272,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4392341375350952,
"kl": 0.012745806110615376,
"learning_rate": 2.1620989572425376e-05,
"loss": 0.0367,
"num_tokens": 7886531.0,
"reward": 0.29856249690055847,
"reward_std": 0.5038343667984009,
"rewards/alfworld_rollout_reward_func/mean": 0.29856249690055847,
"rewards/alfworld_rollout_reward_func/std": 0.5456701517105103,
"sampling/importance_sampling_ratio/max": 2.859746217727661,
"sampling/importance_sampling_ratio/mean": 0.40094250440597534,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6913090944290161,
"sampling/sampling_logp_difference/mean": 0.02110915444791317,
"step": 34,
"step_time": 348.9884387079983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9581.0,
"completions/max_terminated_length": 9581.0,
"completions/mean_length": 6927.1875,
"completions/mean_terminated_length": 6927.1875,
"completions/min_length": 2349.0,
"completions/min_terminated_length": 2349.0,
"entropy": 0.3079565931111574,
"epoch": 0.28,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7179110050201416,
"kl": 0.0068448090823949315,
"learning_rate": 2.1396020103807003e-05,
"loss": 1.2054,
"num_tokens": 8133961.0,
"reward": 0.30643752217292786,
"reward_std": 0.5345775485038757,
"rewards/alfworld_rollout_reward_func/mean": 0.30643752217292786,
"rewards/alfworld_rollout_reward_func/std": 0.5518923997879028,
"sampling/importance_sampling_ratio/max": 2.6090545654296875,
"sampling/importance_sampling_ratio/mean": 0.5262423157691956,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2512085437774658,
"sampling/sampling_logp_difference/mean": 0.019684435799717903,
"step": 35,
"step_time": 332.1077152680009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6221.0,
"completions/max_terminated_length": 6221.0,
"completions/mean_length": 5477.1875,
"completions/mean_terminated_length": 5477.1875,
"completions/min_length": 3469.0,
"completions/min_terminated_length": 3469.0,
"entropy": 0.28742302511818707,
"epoch": 0.288,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3029775023460388,
"kl": 0.007389042439172044,
"learning_rate": 2.1165054100317364e-05,
"loss": -0.0729,
"num_tokens": 8328591.0,
"reward": -0.043937504291534424,
"reward_std": 0.08157414942979813,
"rewards/alfworld_rollout_reward_func/mean": -0.043937504291534424,
"rewards/alfworld_rollout_reward_func/std": 0.184916689991951,
"sampling/importance_sampling_ratio/max": 2.357729911804199,
"sampling/importance_sampling_ratio/mean": 0.4288977086544037,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.2753419876098633,
"sampling/sampling_logp_difference/mean": 0.017678607255220413,
"step": 36,
"step_time": 228.19555244999538
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10279.0,
"completions/max_terminated_length": 10279.0,
"completions/mean_length": 8721.09375,
"completions/mean_terminated_length": 8721.09375,
"completions/min_length": 2359.0,
"completions/min_terminated_length": 2359.0,
"entropy": 0.3117677140980959,
"epoch": 0.296,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4540128707885742,
"kl": 0.006687425287964288,
"learning_rate": 2.0928247249070227e-05,
"loss": 0.323,
"num_tokens": 8633362.0,
"reward": -0.011625003069639206,
"reward_std": 0.16723808646202087,
"rewards/alfworld_rollout_reward_func/mean": -0.011625003069639206,
"rewards/alfworld_rollout_reward_func/std": 0.32382330298423767,
"sampling/importance_sampling_ratio/max": 2.514087438583374,
"sampling/importance_sampling_ratio/mean": 0.3541397750377655,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.362422466278076,
"sampling/sampling_logp_difference/mean": 0.020557893440127373,
"step": 37,
"step_time": 371.70526020600664
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5569.0,
"completions/max_terminated_length": 5569.0,
"completions/mean_length": 4266.03125,
"completions/mean_terminated_length": 4266.03125,
"completions/min_length": 1452.0,
"completions/min_terminated_length": 1452.0,
"entropy": 0.39905168302357197,
"epoch": 0.304,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6425251960754395,
"kl": 0.007170766235503834,
"learning_rate": 2.0685759174316066e-05,
"loss": 0.3563,
"num_tokens": 8786803.0,
"reward": 0.31306248903274536,
"reward_std": 0.5542499423027039,
"rewards/alfworld_rollout_reward_func/mean": 0.31306248903274536,
"rewards/alfworld_rollout_reward_func/std": 0.528755784034729,
"sampling/importance_sampling_ratio/max": 2.680379629135132,
"sampling/importance_sampling_ratio/mean": 0.4211962819099426,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3480374813079834,
"sampling/sampling_logp_difference/mean": 0.02187519334256649,
"step": 38,
"step_time": 221.66309013099817
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6825.0,
"completions/max_terminated_length": 6825.0,
"completions/mean_length": 4017.1875,
"completions/mean_terminated_length": 4017.1875,
"completions/min_length": 1882.0,
"completions/min_terminated_length": 1882.0,
"entropy": 0.27494299272075295,
"epoch": 0.312,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4104083776473999,
"kl": 0.008522690397512633,
"learning_rate": 2.0437753329844232e-05,
"loss": 0.3644,
"num_tokens": 8935129.0,
"reward": 0.5581250190734863,
"reward_std": 0.5189322829246521,
"rewards/alfworld_rollout_reward_func/mean": 0.5581250190734863,
"rewards/alfworld_rollout_reward_func/std": 0.5356204509735107,
"sampling/importance_sampling_ratio/max": 1.939102292060852,
"sampling/importance_sampling_ratio/mean": 0.6145648956298828,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6772491931915283,
"sampling/sampling_logp_difference/mean": 0.01894262433052063,
"step": 39,
"step_time": 210.97975667200626
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5856.0,
"completions/max_terminated_length": 5856.0,
"completions/mean_length": 5156.53125,
"completions/mean_terminated_length": 5156.53125,
"completions/min_length": 4694.0,
"completions/min_terminated_length": 4694.0,
"entropy": 0.29539695545099676,
"epoch": 0.32,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5827170014381409,
"kl": 0.01248152791231405,
"learning_rate": 2.0184396888803762e-05,
"loss": -0.077,
"num_tokens": 9118090.0,
"reward": -0.09750000387430191,
"reward_std": 0.0399014875292778,
"rewards/alfworld_rollout_reward_func/mean": -0.09750000387430191,
"rewards/alfworld_rollout_reward_func/std": 0.042804885655641556,
"sampling/importance_sampling_ratio/max": 1.8288264274597168,
"sampling/importance_sampling_ratio/mean": 0.4271472096443176,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 53.8599967956543,
"sampling/sampling_logp_difference/mean": 0.03358942270278931,
"step": 40,
"step_time": 203.72415082099906
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5133.0,
"completions/max_terminated_length": 5133.0,
"completions/mean_length": 2081.125,
"completions/mean_terminated_length": 2081.125,
"completions/min_length": 857.0,
"completions/min_terminated_length": 857.0,
"entropy": 0.3698639366775751,
"epoch": 0.328,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39960983395576477,
"kl": 0.01080034705955768,
"learning_rate": 1.9925860631017078e-05,
"loss": -0.0291,
"num_tokens": 9202190.0,
"reward": 0.8761249780654907,
"reward_std": 0.17561057209968567,
"rewards/alfworld_rollout_reward_func/mean": 0.8761249780654907,
"rewards/alfworld_rollout_reward_func/std": 0.2839300036430359,
"sampling/importance_sampling_ratio/max": 2.4484190940856934,
"sampling/importance_sampling_ratio/mean": 0.6725524663925171,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6942663192749023,
"sampling/sampling_logp_difference/mean": 0.02107181027531624,
"step": 41,
"step_time": 132.85459781399732
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10435.0,
"completions/max_terminated_length": 10435.0,
"completions/mean_length": 9668.15625,
"completions/mean_terminated_length": 9668.15625,
"completions/min_length": 7131.0,
"completions/min_terminated_length": 7131.0,
"entropy": 0.28246624674648046,
"epoch": 0.336,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.488521009683609,
"kl": 0.005378451882279478,
"learning_rate": 1.9662318827862527e-05,
"loss": -0.6846,
"num_tokens": 9538611.0,
"reward": 0.004812499508261681,
"reward_std": 0.16284173727035522,
"rewards/alfworld_rollout_reward_func/mean": 0.004812499508261681,
"rewards/alfworld_rollout_reward_func/std": 0.3033957779407501,
"sampling/importance_sampling_ratio/max": 1.696621298789978,
"sampling/importance_sampling_ratio/mean": 0.5417707562446594,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.047061562538147,
"sampling/sampling_logp_difference/mean": 0.01622496359050274,
"step": 42,
"step_time": 423.1995726519999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5896.0,
"completions/max_terminated_length": 5896.0,
"completions/mean_length": 3412.84375,
"completions/mean_terminated_length": 3412.84375,
"completions/min_length": 777.0,
"completions/min_terminated_length": 777.0,
"entropy": 0.45027640648186207,
"epoch": 0.344,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7471272945404053,
"kl": 0.010089495350257494,
"learning_rate": 1.9393949124803384e-05,
"loss": 0.2412,
"num_tokens": 9663502.0,
"reward": 0.44075000286102295,
"reward_std": 0.5147863626480103,
"rewards/alfworld_rollout_reward_func/mean": 0.44075000286102295,
"rewards/alfworld_rollout_reward_func/std": 0.5600635409355164,
"sampling/importance_sampling_ratio/max": 2.6056442260742188,
"sampling/importance_sampling_ratio/mean": 0.4545601010322571,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2315409183502197,
"sampling/sampling_logp_difference/mean": 0.023922625929117203,
"step": 43,
"step_time": 203.09121891599898
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8859.0,
"completions/max_terminated_length": 8859.0,
"completions/mean_length": 6845.78125,
"completions/mean_terminated_length": 6845.78125,
"completions/min_length": 1717.0,
"completions/min_terminated_length": 1717.0,
"entropy": 0.32285092724487185,
"epoch": 0.352,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4024052023887634,
"kl": 0.008475492584693711,
"learning_rate": 1.9120932421642484e-05,
"loss": 0.5293,
"num_tokens": 9905671.0,
"reward": 0.13581249117851257,
"reward_std": 0.2240503877401352,
"rewards/alfworld_rollout_reward_func/mean": 0.13581249117851257,
"rewards/alfworld_rollout_reward_func/std": 0.4513090252876282,
"sampling/importance_sampling_ratio/max": 2.646895170211792,
"sampling/importance_sampling_ratio/mean": 0.4667580723762512,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5646392107009888,
"sampling/sampling_logp_difference/mean": 0.019861401990056038,
"step": 44,
"step_time": 315.9145935930028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9315.0,
"completions/max_terminated_length": 9315.0,
"completions/mean_length": 8032.53125,
"completions/mean_terminated_length": 8032.53125,
"completions/min_length": 4993.0,
"completions/min_terminated_length": 4993.0,
"entropy": 0.29978093737736344,
"epoch": 0.36,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3842543065547943,
"kl": 0.006619712650717702,
"learning_rate": 1.8843452750583195e-05,
"loss": -0.05,
"num_tokens": 10185592.0,
"reward": 0.03681249916553497,
"reward_std": 0.26903754472732544,
"rewards/alfworld_rollout_reward_func/mean": 0.03681249916553497,
"rewards/alfworld_rollout_reward_func/std": 0.3464287221431732,
"sampling/importance_sampling_ratio/max": 2.4647469520568848,
"sampling/importance_sampling_ratio/mean": 0.3931768536567688,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.668698310852051,
"sampling/sampling_logp_difference/mean": 0.019018925726413727,
"step": 45,
"step_time": 344.13413975400545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6213.0,
"completions/max_terminated_length": 6213.0,
"completions/mean_length": 5737.6875,
"completions/mean_terminated_length": 5737.6875,
"completions/min_length": 3759.0,
"completions/min_terminated_length": 3759.0,
"entropy": 0.3290684539824724,
"epoch": 0.368,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6180854439735413,
"kl": 0.024709066594368778,
"learning_rate": 1.856169715217896e-05,
"loss": 0.3411,
"num_tokens": 10387950.0,
"reward": -0.06937500089406967,
"reward_std": 0.16165806353092194,
"rewards/alfworld_rollout_reward_func/mean": -0.06937500089406967,
"rewards/alfworld_rollout_reward_func/std": 0.2620733380317688,
"sampling/importance_sampling_ratio/max": 1.737358808517456,
"sampling/importance_sampling_ratio/mean": 0.3259963095188141,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.5090951919555664,
"sampling/sampling_logp_difference/mean": 0.019301004707813263,
"step": 46,
"step_time": 242.2902739599922
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10361.0,
"completions/max_terminated_length": 10361.0,
"completions/mean_length": 8327.21875,
"completions/mean_terminated_length": 8327.21875,
"completions/min_length": 2893.0,
"completions/min_terminated_length": 2893.0,
"entropy": 0.3274059356190264,
"epoch": 0.376,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4274188280105591,
"kl": 0.010049835465906654,
"learning_rate": 1.8275855549254953e-05,
"loss": -0.2629,
"num_tokens": 10679093.0,
"reward": 0.08993750810623169,
"reward_std": 0.35606837272644043,
"rewards/alfworld_rollout_reward_func/mean": 0.08993750810623169,
"rewards/alfworld_rollout_reward_func/std": 0.42536285519599915,
"sampling/importance_sampling_ratio/max": 2.3043253421783447,
"sampling/importance_sampling_ratio/mean": 0.4068600535392761,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6962127685546875,
"sampling/sampling_logp_difference/mean": 0.020758148282766342,
"step": 47,
"step_time": 370.7235914090161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5495.0,
"completions/max_terminated_length": 5495.0,
"completions/mean_length": 3065.90625,
"completions/mean_terminated_length": 3065.90625,
"completions/min_length": 579.0,
"completions/min_terminated_length": 579.0,
"entropy": 0.3348121759481728,
"epoch": 0.384,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2784886062145233,
"kl": 0.009557744706398807,
"learning_rate": 1.798612061888695e-05,
"loss": -0.0538,
"num_tokens": 10794834.0,
"reward": 0.6332499980926514,
"reward_std": 0.5017424821853638,
"rewards/alfworld_rollout_reward_func/mean": 0.6332499980926514,
"rewards/alfworld_rollout_reward_func/std": 0.49993231892585754,
"sampling/importance_sampling_ratio/max": 1.7117670774459839,
"sampling/importance_sampling_ratio/mean": 0.546383261680603,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1129664182662964,
"sampling/sampling_logp_difference/mean": 0.02091594971716404,
"step": 48,
"step_time": 173.00113746399438
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4629.0,
"completions/max_terminated_length": 4629.0,
"completions/mean_length": 1015.0,
"completions/mean_terminated_length": 1015.0,
"completions/min_length": 505.0,
"completions/min_terminated_length": 505.0,
"entropy": 0.3150682970881462,
"epoch": 0.392,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39293837547302246,
"kl": 0.01074478572991211,
"learning_rate": 1.7692687662523583e-05,
"loss": 0.3488,
"num_tokens": 10841970.0,
"reward": 0.9421250224113464,
"reward_std": 0.08922727406024933,
"rewards/alfworld_rollout_reward_func/mean": 0.9421250224113464,
"rewards/alfworld_rollout_reward_func/std": 0.20758825540542603,
"sampling/importance_sampling_ratio/max": 2.490204095840454,
"sampling/importance_sampling_ratio/mean": 0.959223210811615,
"sampling/importance_sampling_ratio/min": 0.02937433123588562,
"sampling/sampling_logp_difference/max": 1.6584293842315674,
"sampling/sampling_logp_difference/mean": 0.020461907610297203,
"step": 49,
"step_time": 92.39997174798918
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7372.0,
"completions/max_terminated_length": 7372.0,
"completions/mean_length": 6640.0,
"completions/mean_terminated_length": 6640.0,
"completions/min_length": 6025.0,
"completions/min_terminated_length": 6025.0,
"entropy": 0.3124278010800481,
"epoch": 0.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25588029623031616,
"kl": 0.010321544992621057,
"learning_rate": 1.739575447433963e-05,
"loss": -0.0403,
"num_tokens": 11075154.0,
"reward": -0.08562500029802322,
"reward_std": 0.03361169621348381,
"rewards/alfworld_rollout_reward_func/mean": -0.08562500029802322,
"rewards/alfworld_rollout_reward_func/std": 0.04079354181885719,
"sampling/importance_sampling_ratio/max": 2.596904754638672,
"sampling/importance_sampling_ratio/mean": 0.33153465390205383,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 57.544639587402344,
"sampling/sampling_logp_difference/mean": 0.02437865547835827,
"step": 50,
"step_time": 265.4482773190066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10133.0,
"completions/max_terminated_length": 10133.0,
"completions/mean_length": 7551.125,
"completions/mean_terminated_length": 7551.125,
"completions/min_length": 1574.0,
"completions/min_terminated_length": 1574.0,
"entropy": 0.3586070057936013,
"epoch": 0.408,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5439701676368713,
"kl": 0.009389235550770536,
"learning_rate": 1.7095521207909e-05,
"loss": 0.1081,
"num_tokens": 11339414.0,
"reward": 0.03581250086426735,
"reward_std": 0.29458412528038025,
"rewards/alfworld_rollout_reward_func/mean": 0.03581250086426735,
"rewards/alfworld_rollout_reward_func/std": 0.39730069041252136,
"sampling/importance_sampling_ratio/max": 2.3939778804779053,
"sampling/importance_sampling_ratio/mean": 0.46745312213897705,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.9706189632415771,
"sampling/sampling_logp_difference/mean": 0.02153097279369831,
"step": 51,
"step_time": 354.3591809840109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9795.0,
"completions/max_terminated_length": 9795.0,
"completions/mean_length": 8446.46875,
"completions/mean_terminated_length": 8446.46875,
"completions/min_length": 3995.0,
"completions/min_terminated_length": 3995.0,
"entropy": 0.36232828767970204,
"epoch": 0.416,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5382326245307922,
"kl": 0.011482410060125403,
"learning_rate": 1.6792190241287358e-05,
"loss": 0.7703,
"num_tokens": 11634213.0,
"reward": 0.05806249752640724,
"reward_std": 0.2869499921798706,
"rewards/alfworld_rollout_reward_func/mean": 0.05806249752640724,
"rewards/alfworld_rollout_reward_func/std": 0.3851432800292969,
"sampling/importance_sampling_ratio/max": 1.6283434629440308,
"sampling/importance_sampling_ratio/mean": 0.35767966508865356,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 50.320884704589844,
"sampling/sampling_logp_difference/mean": 0.028291059657931328,
"step": 52,
"step_time": 381.02555549900353
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5540.0,
"completions/max_terminated_length": 5540.0,
"completions/mean_length": 4920.09375,
"completions/mean_terminated_length": 4920.09375,
"completions/min_length": 3916.0,
"completions/min_terminated_length": 3916.0,
"entropy": 0.33526887465268373,
"epoch": 0.424,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49836915731430054,
"kl": 0.006760991345799994,
"learning_rate": 1.6485966040595234e-05,
"loss": 0.2884,
"num_tokens": 11809064.0,
"reward": 0.041937507688999176,
"reward_std": 0.2676677107810974,
"rewards/alfworld_rollout_reward_func/mean": 0.041937507688999176,
"rewards/alfworld_rollout_reward_func/std": 0.3419354557991028,
"sampling/importance_sampling_ratio/max": 2.4798102378845215,
"sampling/importance_sampling_ratio/mean": 0.39536625146865845,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4147335290908813,
"sampling/sampling_logp_difference/mean": 0.020684119313955307,
"step": 53,
"step_time": 234.1235085829867
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5491.0,
"completions/max_terminated_length": 5491.0,
"completions/mean_length": 4343.40625,
"completions/mean_terminated_length": 4343.40625,
"completions/min_length": 2950.0,
"completions/min_terminated_length": 2950.0,
"entropy": 0.3150371379451826,
"epoch": 0.432,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3623775541782379,
"kl": 0.016882637079106644,
"learning_rate": 1.6177055022193705e-05,
"loss": -0.0492,
"num_tokens": 11964309.0,
"reward": 0.09349999576807022,
"reward_std": 0.2926024794578552,
"rewards/alfworld_rollout_reward_func/mean": 0.09349999576807022,
"rewards/alfworld_rollout_reward_func/std": 0.41188251972198486,
"sampling/importance_sampling_ratio/max": 2.349982261657715,
"sampling/importance_sampling_ratio/mean": 0.3969458341598511,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.997692108154297,
"sampling/sampling_logp_difference/mean": 0.02303631231188774,
"step": 54,
"step_time": 198.36046067398638
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9719.0,
"completions/max_terminated_length": 9719.0,
"completions/mean_length": 7502.34375,
"completions/mean_terminated_length": 7502.34375,
"completions/min_length": 1859.0,
"completions/min_terminated_length": 1859.0,
"entropy": 0.30430709826759994,
"epoch": 0.44,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44352656602859497,
"kl": 0.0072795703235897236,
"learning_rate": 1.5865665413545433e-05,
"loss": 0.3331,
"num_tokens": 12230304.0,
"reward": 0.2774375081062317,
"reward_std": 0.5174306035041809,
"rewards/alfworld_rollout_reward_func/mean": 0.2774375081062317,
"rewards/alfworld_rollout_reward_func/std": 0.537744402885437,
"sampling/importance_sampling_ratio/max": 2.195627212524414,
"sampling/importance_sampling_ratio/mean": 0.3545913100242615,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.711559534072876,
"sampling/sampling_logp_difference/mean": 0.01915351301431656,
"step": 55,
"step_time": 336.9539677490029
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5623.0,
"completions/max_terminated_length": 5623.0,
"completions/mean_length": 3929.40625,
"completions/mean_terminated_length": 3929.40625,
"completions/min_length": 696.0,
"completions/min_terminated_length": 696.0,
"entropy": 0.3456273628398776,
"epoch": 0.448,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2932792007923126,
"kl": 0.011585585234570317,
"learning_rate": 1.5552007112854894e-05,
"loss": 0.2115,
"num_tokens": 12373933.0,
"reward": 0.5378749966621399,
"reward_std": 0.5084604024887085,
"rewards/alfworld_rollout_reward_func/mean": 0.5378749966621399,
"rewards/alfworld_rollout_reward_func/std": 0.535780131816864,
"sampling/importance_sampling_ratio/max": 2.6537976264953613,
"sampling/importance_sampling_ratio/mean": 0.4917897582054138,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6209819316864014,
"sampling/sampling_logp_difference/mean": 0.02049492858350277,
"step": 56,
"step_time": 196.3063545789919
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7242.0,
"completions/max_terminated_length": 7242.0,
"completions/mean_length": 6137.5,
"completions/mean_terminated_length": 6137.5,
"completions/min_length": 1471.0,
"completions/min_terminated_length": 1471.0,
"entropy": 0.2984403392765671,
"epoch": 0.456,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34220486879348755,
"kl": 0.006747693880242878,
"learning_rate": 1.5236291547582437e-05,
"loss": 0.3575,
"num_tokens": 12591325.0,
"reward": 0.14399999380111694,
"reward_std": 0.29427629709243774,
"rewards/alfworld_rollout_reward_func/mean": 0.14399999380111694,
"rewards/alfworld_rollout_reward_func/std": 0.4284597933292389,
"sampling/importance_sampling_ratio/max": 2.0604751110076904,
"sampling/importance_sampling_ratio/mean": 0.42357033491134644,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4621469974517822,
"sampling/sampling_logp_difference/mean": 0.01661795936524868,
"step": 57,
"step_time": 275.63650079599756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9986.0,
"completions/max_terminated_length": 9986.0,
"completions/mean_length": 8814.6875,
"completions/mean_terminated_length": 8814.6875,
"completions/min_length": 2337.0,
"completions/min_terminated_length": 2337.0,
"entropy": 0.3586568986065686,
"epoch": 0.464,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47871214151382446,
"kl": 0.007779015490086749,
"learning_rate": 1.4918731531927497e-05,
"loss": -0.5934,
"num_tokens": 12899251.0,
"reward": 0.044374994933605194,
"reward_std": 0.3030627369880676,
"rewards/alfworld_rollout_reward_func/mean": 0.044374994933605194,
"rewards/alfworld_rollout_reward_func/std": 0.39909231662750244,
"sampling/importance_sampling_ratio/max": 2.446885347366333,
"sampling/importance_sampling_ratio/mean": 0.42126935720443726,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7695283889770508,
"sampling/sampling_logp_difference/mean": 0.02191038988530636,
"step": 58,
"step_time": 390.69952411300983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7052.0,
"completions/max_terminated_length": 7052.0,
"completions/mean_length": 5947.40625,
"completions/mean_terminated_length": 5947.40625,
"completions/min_length": 2393.0,
"completions/min_terminated_length": 2393.0,
"entropy": 0.3142848704010248,
"epoch": 0.472,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5331009030342102,
"kl": 0.012195366049127188,
"learning_rate": 1.4599541123377061e-05,
"loss": -0.4684,
"num_tokens": 13110208.0,
"reward": 0.07756249606609344,
"reward_std": 0.3419586420059204,
"rewards/alfworld_rollout_reward_func/mean": 0.07756249606609344,
"rewards/alfworld_rollout_reward_func/std": 0.3887258768081665,
"sampling/importance_sampling_ratio/max": 2.6775338649749756,
"sampling/importance_sampling_ratio/mean": 0.4351937770843506,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.95979118347168,
"sampling/sampling_logp_difference/mean": 0.020216144621372223,
"step": 59,
"step_time": 243.03599319599743
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9655.0,
"completions/max_terminated_length": 9655.0,
"completions/mean_length": 5772.5625,
"completions/mean_terminated_length": 5772.5625,
"completions/min_length": 1919.0,
"completions/min_terminated_length": 1919.0,
"entropy": 0.30983406328596175,
"epoch": 0.48,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.53172367811203,
"kl": 0.010136082419194281,
"learning_rate": 1.4278935478416066e-05,
"loss": 0.0168,
"num_tokens": 13317810.0,
"reward": 0.5102499723434448,
"reward_std": 0.5633312463760376,
"rewards/alfworld_rollout_reward_func/mean": 0.5102499723434448,
"rewards/alfworld_rollout_reward_func/std": 0.5470424294471741,
"sampling/importance_sampling_ratio/max": 2.9595675468444824,
"sampling/importance_sampling_ratio/mean": 0.6123539805412292,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.608839988708496,
"sampling/sampling_logp_difference/mean": 0.020261507481336594,
"step": 60,
"step_time": 298.4086459939899
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10755.0,
"completions/max_terminated_length": 10755.0,
"completions/mean_length": 6594.71875,
"completions/mean_terminated_length": 6594.71875,
"completions/min_length": 3297.0,
"completions/min_terminated_length": 3297.0,
"entropy": 0.253853059373796,
"epoch": 0.488,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6812533140182495,
"kl": 0.006578363383596297,
"learning_rate": 1.3957130707496991e-05,
"loss": 0.6806,
"num_tokens": 13549897.0,
"reward": 0.29243749380111694,
"reward_std": 0.47611552476882935,
"rewards/alfworld_rollout_reward_func/mean": 0.29243749380111694,
"rewards/alfworld_rollout_reward_func/std": 0.5179726481437683,
"sampling/importance_sampling_ratio/max": 2.8079097270965576,
"sampling/importance_sampling_ratio/mean": 0.6100484132766724,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.366208553314209,
"sampling/sampling_logp_difference/mean": 0.01589721068739891,
"step": 61,
"step_time": 345.328052976005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6121.0,
"completions/max_terminated_length": 6121.0,
"completions/mean_length": 5639.4375,
"completions/mean_terminated_length": 5639.4375,
"completions/min_length": 1575.0,
"completions/min_terminated_length": 1575.0,
"entropy": 0.3537342040799558,
"epoch": 0.496,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4658977687358856,
"kl": 0.008820160976029001,
"learning_rate": 1.363434372936643e-05,
"loss": 0.0373,
"num_tokens": 13749239.0,
"reward": -0.04087500274181366,
"reward_std": 0.07875211536884308,
"rewards/alfworld_rollout_reward_func/mean": -0.04087500274181366,
"rewards/alfworld_rollout_reward_func/std": 0.18735504150390625,
"sampling/importance_sampling_ratio/max": 2.8549065589904785,
"sampling/importance_sampling_ratio/mean": 0.4742495119571686,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.172121286392212,
"sampling/sampling_logp_difference/mean": 0.019000394269824028,
"step": 62,
"step_time": 261.6466159019983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4729.0,
"completions/max_terminated_length": 4729.0,
"completions/mean_length": 3591.78125,
"completions/mean_terminated_length": 3591.78125,
"completions/min_length": 980.0,
"completions/min_terminated_length": 980.0,
"entropy": 0.39169206377118826,
"epoch": 0.504,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.247410386800766,
"kl": 0.008050664589973167,
"learning_rate": 1.3310792124846788e-05,
"loss": -0.2754,
"num_tokens": 13878928.0,
"reward": 0.16506250202655792,
"reward_std": 0.45310160517692566,
"rewards/alfworld_rollout_reward_func/mean": 0.16506250202655792,
"rewards/alfworld_rollout_reward_func/std": 0.5017751455307007,
"sampling/importance_sampling_ratio/max": 1.3678574562072754,
"sampling/importance_sampling_ratio/mean": 0.2883392870426178,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1917786598205566,
"sampling/sampling_logp_difference/mean": 0.021689990535378456,
"step": 63,
"step_time": 211.52569950699763
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5346.0,
"completions/max_terminated_length": 5346.0,
"completions/mean_length": 4886.34375,
"completions/mean_terminated_length": 4886.34375,
"completions/min_length": 2880.0,
"completions/min_terminated_length": 2880.0,
"entropy": 0.3370038694702089,
"epoch": 0.512,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6040080189704895,
"kl": 0.007575266048661433,
"learning_rate": 1.2986693990171722e-05,
"loss": 1.0625,
"num_tokens": 14052507.0,
"reward": -0.017375001683831215,
"reward_std": 0.17574599385261536,
"rewards/alfworld_rollout_reward_func/mean": -0.017375001683831215,
"rewards/alfworld_rollout_reward_func/std": 0.3123514950275421,
"sampling/importance_sampling_ratio/max": 2.4749438762664795,
"sampling/importance_sampling_ratio/mean": 0.4803212881088257,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2786328792572021,
"sampling/sampling_logp_difference/mean": 0.018290970474481583,
"step": 64,
"step_time": 227.5868356469873
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5429.0,
"completions/max_terminated_length": 5429.0,
"completions/mean_length": 2972.3125,
"completions/mean_terminated_length": 2972.3125,
"completions/min_length": 539.0,
"completions/min_terminated_length": 539.0,
"entropy": 0.3263692925684154,
"epoch": 0.52,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17109909653663635,
"kl": 0.007112714993127156,
"learning_rate": 1.2662267789974137e-05,
"loss": -0.1672,
"num_tokens": 14165157.0,
"reward": 0.6520624756813049,
"reward_std": 0.3671799898147583,
"rewards/alfworld_rollout_reward_func/mean": 0.6520624756813049,
"rewards/alfworld_rollout_reward_func/std": 0.49012401700019836,
"sampling/importance_sampling_ratio/max": 1.6191086769104004,
"sampling/importance_sampling_ratio/mean": 0.40486860275268555,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.247586965560913,
"sampling/sampling_logp_difference/mean": 0.019062954932451248,
"step": 65,
"step_time": 169.92830283098374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10669.0,
"completions/max_terminated_length": 10669.0,
"completions/mean_length": 9819.1875,
"completions/mean_terminated_length": 9819.1875,
"completions/min_length": 5938.0,
"completions/min_terminated_length": 5938.0,
"entropy": 0.28255544137209654,
"epoch": 0.528,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7858016490936279,
"kl": 0.016726812566048466,
"learning_rate": 1.2337732210025866e-05,
"loss": 0.8719,
"num_tokens": 14506315.0,
"reward": -0.07387499511241913,
"reward_std": 0.10663385689258575,
"rewards/alfworld_rollout_reward_func/mean": -0.07387499511241913,
"rewards/alfworld_rollout_reward_func/std": 0.19619078934192657,
"sampling/importance_sampling_ratio/max": 2.002222776412964,
"sampling/importance_sampling_ratio/mean": 0.39136213064193726,
"sampling/importance_sampling_ratio/min": 1.4739535799890291e-05,
"sampling/sampling_logp_difference/max": 5.542607307434082,
"sampling/sampling_logp_difference/mean": 0.01821236126124859,
"step": 66,
"step_time": 419.0694372189937
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10873.0,
"completions/max_terminated_length": 10873.0,
"completions/mean_length": 9183.96875,
"completions/mean_terminated_length": 9183.96875,
"completions/min_length": 3183.0,
"completions/min_terminated_length": 3183.0,
"entropy": 0.38195388251915574,
"epoch": 0.536,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4079846143722534,
"kl": 0.0140747545810882,
"learning_rate": 1.2013306009828281e-05,
"loss": 0.3006,
"num_tokens": 14827306.0,
"reward": 0.018437502905726433,
"reward_std": 0.22722002863883972,
"rewards/alfworld_rollout_reward_func/mean": 0.018437502905726433,
"rewards/alfworld_rollout_reward_func/std": 0.3590969741344452,
"sampling/importance_sampling_ratio/max": 2.1235499382019043,
"sampling/importance_sampling_ratio/mean": 0.36683690547943115,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5769914388656616,
"sampling/sampling_logp_difference/mean": 0.024590400978922844,
"step": 67,
"step_time": 386.19285881999167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5260.0,
"completions/max_terminated_length": 5260.0,
"completions/mean_length": 4081.625,
"completions/mean_terminated_length": 4081.625,
"completions/min_length": 1937.0,
"completions/min_terminated_length": 1937.0,
"entropy": 0.35961280949413776,
"epoch": 0.544,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5163018107414246,
"kl": 0.011940339973079972,
"learning_rate": 1.1689207875153212e-05,
"loss": 0.474,
"num_tokens": 14974590.0,
"reward": 0.37187498807907104,
"reward_std": 0.46666616201400757,
"rewards/alfworld_rollout_reward_func/mean": 0.37187498807907104,
"rewards/alfworld_rollout_reward_func/std": 0.5451789498329163,
"sampling/importance_sampling_ratio/max": 2.092132568359375,
"sampling/importance_sampling_ratio/mean": 0.5891081094741821,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.025409460067749,
"sampling/sampling_logp_difference/mean": 0.022658195346593857,
"step": 68,
"step_time": 189.4174261460139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6060.0,
"completions/max_terminated_length": 6060.0,
"completions/mean_length": 1962.71875,
"completions/mean_terminated_length": 1962.71875,
"completions/min_length": 780.0,
"completions/min_terminated_length": 780.0,
"entropy": 0.24381280411034822,
"epoch": 0.552,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.283759206533432,
"kl": 0.028047901825630106,
"learning_rate": 1.1365656270633572e-05,
"loss": 0.1314,
"num_tokens": 15056149.0,
"reward": 0.8946250081062317,
"reward_std": 0.16643308103084564,
"rewards/alfworld_rollout_reward_func/mean": 0.8946250081062317,
"rewards/alfworld_rollout_reward_func/std": 0.28478240966796875,
"sampling/importance_sampling_ratio/max": 1.3010145425796509,
"sampling/importance_sampling_ratio/mean": 0.6231130361557007,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1777536869049072,
"sampling/sampling_logp_difference/mean": 0.01496939081698656,
"step": 69,
"step_time": 136.05892026298898
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8510.0,
"completions/max_terminated_length": 8510.0,
"completions/mean_length": 7672.125,
"completions/mean_terminated_length": 7672.125,
"completions/min_length": 7299.0,
"completions/min_terminated_length": 7299.0,
"entropy": 0.3060699696652591,
"epoch": 0.56,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5419595837593079,
"kl": 0.0077493647404480726,
"learning_rate": 1.1042869292503012e-05,
"loss": 0.5471,
"num_tokens": 15324345.0,
"reward": -0.08312499523162842,
"reward_std": 0.02605646476149559,
"rewards/alfworld_rollout_reward_func/mean": -0.08312499523162842,
"rewards/alfworld_rollout_reward_func/std": 0.0315653458237648,
"sampling/importance_sampling_ratio/max": 1.9645739793777466,
"sampling/importance_sampling_ratio/mean": 0.5267171859741211,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3863511085510254,
"sampling/sampling_logp_difference/mean": 0.017259342595934868,
"step": 70,
"step_time": 321.4900386369991
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10572.0,
"completions/max_terminated_length": 10572.0,
"completions/mean_length": 9703.9375,
"completions/mean_terminated_length": 9703.9375,
"completions/min_length": 4990.0,
"completions/min_terminated_length": 4990.0,
"entropy": 0.3014857741072774,
"epoch": 0.568,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6044378280639648,
"kl": 0.010106177673151251,
"learning_rate": 1.0721064521583937e-05,
"loss": 0.0864,
"num_tokens": 15662295.0,
"reward": -0.03849999979138374,
"reward_std": 0.14729847013950348,
"rewards/alfworld_rollout_reward_func/mean": -0.03849999979138374,
"rewards/alfworld_rollout_reward_func/std": 0.26008138060569763,
"sampling/importance_sampling_ratio/max": 2.1879913806915283,
"sampling/importance_sampling_ratio/mean": 0.5329915285110474,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.4026594161987305,
"sampling/sampling_logp_difference/mean": 0.017540952190756798,
"step": 71,
"step_time": 421.89684666199173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6152.0,
"completions/max_terminated_length": 6152.0,
"completions/mean_length": 5449.53125,
"completions/mean_terminated_length": 5449.53125,
"completions/min_length": 4948.0,
"completions/min_terminated_length": 4948.0,
"entropy": 0.3671146659180522,
"epoch": 0.576,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5124484896659851,
"kl": 0.009964851851691492,
"learning_rate": 1.0400458876622939e-05,
"loss": 0.4011,
"num_tokens": 15854952.0,
"reward": -0.0793749988079071,
"reward_std": 0.020163455978035927,
"rewards/alfworld_rollout_reward_func/mean": -0.0793749988079071,
"rewards/alfworld_rollout_reward_func/std": 0.021987900137901306,
"sampling/importance_sampling_ratio/max": 1.6604212522506714,
"sampling/importance_sampling_ratio/mean": 0.45693981647491455,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 50.06657791137695,
"sampling/sampling_logp_difference/mean": 0.022638168185949326,
"step": 72,
"step_time": 250.9183966080127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8627.0,
"completions/max_terminated_length": 8627.0,
"completions/mean_length": 7516.8125,
"completions/mean_terminated_length": 7516.8125,
"completions/min_length": 2413.0,
"completions/min_terminated_length": 2413.0,
"entropy": 0.30488129099830985,
"epoch": 0.584,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9326367378234863,
"kl": 0.009634538357204292,
"learning_rate": 1.0081268468072504e-05,
"loss": -0.7301,
"num_tokens": 16118786.0,
"reward": 0.13762499392032623,
"reward_std": 0.4009949862957001,
"rewards/alfworld_rollout_reward_func/mean": 0.13762499392032623,
"rewards/alfworld_rollout_reward_func/std": 0.43177902698516846,
"sampling/importance_sampling_ratio/max": 2.628365993499756,
"sampling/importance_sampling_ratio/mean": 0.6898477077484131,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0378785133361816,
"sampling/sampling_logp_difference/mean": 0.01770058088004589,
"step": 73,
"step_time": 324.3881754160029
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8399.0,
"completions/max_terminated_length": 8399.0,
"completions/mean_length": 4676.71875,
"completions/mean_terminated_length": 4676.71875,
"completions/min_length": 1857.0,
"completions/min_terminated_length": 1857.0,
"entropy": 0.3089015153236687,
"epoch": 0.592,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3285577893257141,
"kl": 0.013235979298769962,
"learning_rate": 9.763708452417566e-06,
"loss": 0.3012,
"num_tokens": 16291545.0,
"reward": 0.6901249885559082,
"reward_std": 0.35391491651535034,
"rewards/alfworld_rollout_reward_func/mean": 0.6901249885559082,
"rewards/alfworld_rollout_reward_func/std": 0.4766641855239868,
"sampling/importance_sampling_ratio/max": 1.9804767370224,
"sampling/importance_sampling_ratio/mean": 0.5676183104515076,
"sampling/importance_sampling_ratio/min": 1.8734867390329565e-35,
"sampling/sampling_logp_difference/max": 52.56696319580078,
"sampling/sampling_logp_difference/mean": 0.02483256347477436,
"step": 74,
"step_time": 270.81252180201045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5990.0,
"completions/max_terminated_length": 5990.0,
"completions/mean_length": 2401.125,
"completions/mean_terminated_length": 2401.125,
"completions/min_length": 692.0,
"completions/min_terminated_length": 692.0,
"entropy": 0.2977066827006638,
"epoch": 0.6,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.7037971615791321,
"kl": 0.019976254399807658,
"learning_rate": 9.44799288714511e-06,
"loss": 0.9671,
"num_tokens": 16387805.0,
"reward": 0.8665000200271606,
"reward_std": 0.16209571063518524,
"rewards/alfworld_rollout_reward_func/mean": 0.8665000200271606,
"rewards/alfworld_rollout_reward_func/std": 0.31956785917282104,
"sampling/importance_sampling_ratio/max": 2.9782536029815674,
"sampling/importance_sampling_ratio/mean": 0.8318759202957153,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.887710452079773,
"sampling/sampling_logp_difference/mean": 0.018892308697104454,
"step": 75,
"step_time": 166.59206653699948
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5156.0,
"completions/max_terminated_length": 5156.0,
"completions/mean_length": 3333.34375,
"completions/mean_terminated_length": 3333.34375,
"completions/min_length": 1359.0,
"completions/min_terminated_length": 1359.0,
"entropy": 0.3274269704706967,
"epoch": 0.608,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4839501678943634,
"kl": 0.01072185774683021,
"learning_rate": 9.134334586454569e-06,
"loss": 0.3904,
"num_tokens": 16510632.0,
"reward": 0.5139999985694885,
"reward_std": 0.49413302540779114,
"rewards/alfworld_rollout_reward_func/mean": 0.5139999985694885,
"rewards/alfworld_rollout_reward_func/std": 0.5383288264274597,
"sampling/importance_sampling_ratio/max": 1.796036958694458,
"sampling/importance_sampling_ratio/mean": 0.42258331179618835,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0271837711334229,
"sampling/sampling_logp_difference/mean": 0.019824257120490074,
"step": 76,
"step_time": 179.70215874598216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9625.0,
"completions/max_terminated_length": 9625.0,
"completions/mean_length": 8155.625,
"completions/mean_terminated_length": 8155.625,
"completions/min_length": 1912.0,
"completions/min_terminated_length": 1912.0,
"entropy": 0.3397471741773188,
"epoch": 0.616,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26549482345581055,
"kl": 0.01535673845501151,
"learning_rate": 8.822944977806296e-06,
"loss": -0.0306,
"num_tokens": 16796252.0,
"reward": 0.06531249731779099,
"reward_std": 0.2820407748222351,
"rewards/alfworld_rollout_reward_func/mean": 0.06531249731779099,
"rewards/alfworld_rollout_reward_func/std": 0.3899073898792267,
"sampling/importance_sampling_ratio/max": 2.449633836746216,
"sampling/importance_sampling_ratio/mean": 0.22956383228302002,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 49.174781799316406,
"sampling/sampling_logp_difference/mean": 0.025772254914045334,
"step": 77,
"step_time": 386.00914664800075
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9988.0,
"completions/max_terminated_length": 9988.0,
"completions/mean_length": 7138.4375,
"completions/mean_terminated_length": 7138.4375,
"completions/min_length": 1521.0,
"completions/min_terminated_length": 1521.0,
"entropy": 0.4407801004126668,
"epoch": 0.624,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3279080390930176,
"kl": 0.010239457304123789,
"learning_rate": 8.514033959404768e-06,
"loss": 0.3758,
"num_tokens": 17044234.0,
"reward": 0.0768750011920929,
"reward_std": 0.36151063442230225,
"rewards/alfworld_rollout_reward_func/mean": 0.0768750011920929,
"rewards/alfworld_rollout_reward_func/std": 0.4254858195781708,
"sampling/importance_sampling_ratio/max": 2.027116298675537,
"sampling/importance_sampling_ratio/mean": 0.24855472147464752,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5956943035125732,
"sampling/sampling_logp_difference/mean": 0.025417163968086243,
"step": 78,
"step_time": 341.2563546900019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 6459.0,
"completions/max_terminated_length": 5516.0,
"completions/mean_length": 4264.03125,
"completions/mean_terminated_length": 4193.2255859375,
"completions/min_length": 1240.0,
"completions/min_terminated_length": 1240.0,
"entropy": 0.35723056783899665,
"epoch": 0.632,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5109708905220032,
"kl": 0.008868186720064841,
"learning_rate": 8.207809758712648e-06,
"loss": -0.6946,
"num_tokens": 17197227.0,
"reward": 0.23375000059604645,
"reward_std": 0.4359171390533447,
"rewards/alfworld_rollout_reward_func/mean": 0.23375000059604645,
"rewards/alfworld_rollout_reward_func/std": 0.5250879526138306,
"sampling/importance_sampling_ratio/max": 2.4404332637786865,
"sampling/importance_sampling_ratio/mean": 0.4912329316139221,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 19.107128143310547,
"sampling/sampling_logp_difference/mean": 0.019834455102682114,
"step": 79,
"step_time": 239.72380435900413
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6371.0,
"completions/max_terminated_length": 6371.0,
"completions/mean_length": 5661.84375,
"completions/mean_terminated_length": 5661.84375,
"completions/min_length": 1258.0,
"completions/min_terminated_length": 1258.0,
"entropy": 0.276084772311151,
"epoch": 0.64,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6164595484733582,
"kl": 0.009095613342651632,
"learning_rate": 7.904478792090999e-06,
"loss": -1.3354,
"num_tokens": 17397702.0,
"reward": 0.052375003695487976,
"reward_std": 0.23210693895816803,
"rewards/alfworld_rollout_reward_func/mean": 0.052375003695487976,
"rewards/alfworld_rollout_reward_func/std": 0.38171443343162537,
"sampling/importance_sampling_ratio/max": 2.6272292137145996,
"sampling/importance_sampling_ratio/mean": 0.7341758608818054,
"sampling/importance_sampling_ratio/min": 0.03513141721487045,
"sampling/sampling_logp_difference/max": 1.0201144218444824,
"sampling/sampling_logp_difference/mean": 0.01651364006102085,
"step": 80,
"step_time": 256.61792850801066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4864.0,
"completions/max_terminated_length": 4864.0,
"completions/mean_length": 4409.25,
"completions/mean_terminated_length": 4409.25,
"completions/min_length": 1956.0,
"completions/min_terminated_length": 1956.0,
"entropy": 0.4008057755418122,
"epoch": 0.648,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48246461153030396,
"kl": 0.006945399109099526,
"learning_rate": 7.604245525660372e-06,
"loss": -0.4693,
"num_tokens": 17554830.0,
"reward": 0.05581250041723251,
"reward_std": 0.0946279764175415,
"rewards/alfworld_rollout_reward_func/mean": 0.05581250041723251,
"rewards/alfworld_rollout_reward_func/std": 0.3801369369029999,
"sampling/importance_sampling_ratio/max": 2.2769978046417236,
"sampling/importance_sampling_ratio/mean": 0.3646348714828491,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9721014499664307,
"sampling/sampling_logp_difference/mean": 0.02023322880268097,
"step": 81,
"step_time": 234.1297659530137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5459.0,
"completions/max_terminated_length": 5459.0,
"completions/mean_length": 2836.8125,
"completions/mean_terminated_length": 2836.8125,
"completions/min_length": 746.0,
"completions/min_terminated_length": 746.0,
"entropy": 0.3316339133307338,
"epoch": 0.656,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.617982029914856,
"kl": 0.01853690284769982,
"learning_rate": 7.307312337476421e-06,
"loss": 0.9743,
"num_tokens": 17663112.0,
"reward": 0.5675625205039978,
"reward_std": 0.4371797442436218,
"rewards/alfworld_rollout_reward_func/mean": 0.5675625205039978,
"rewards/alfworld_rollout_reward_func/std": 0.5398529171943665,
"sampling/importance_sampling_ratio/max": 2.917351007461548,
"sampling/importance_sampling_ratio/mean": 0.7250344753265381,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 49.39155197143555,
"sampling/sampling_logp_difference/mean": 0.026660067960619926,
"step": 82,
"step_time": 163.41237712898874
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8902.0,
"completions/max_terminated_length": 8902.0,
"completions/mean_length": 2005.8125,
"completions/mean_terminated_length": 2005.8125,
"completions/min_length": 872.0,
"completions/min_terminated_length": 872.0,
"entropy": 0.31633000262081623,
"epoch": 0.664,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.2705594599246979,
"kl": 0.0168228562688455,
"learning_rate": 7.013879381113055e-06,
"loss": 0.0649,
"num_tokens": 17751938.0,
"reward": 0.9136250019073486,
"reward_std": 0.14719459414482117,
"rewards/alfworld_rollout_reward_func/mean": 0.9136250019073486,
"rewards/alfworld_rollout_reward_func/std": 0.26975658535957336,
"sampling/importance_sampling_ratio/max": 2.27459454536438,
"sampling/importance_sampling_ratio/mean": 0.6756496429443359,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0981965065002441,
"sampling/sampling_logp_difference/mean": 0.022044427692890167,
"step": 83,
"step_time": 200.4008226580081
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6071.0,
"completions/max_terminated_length": 6071.0,
"completions/mean_length": 2893.40625,
"completions/mean_terminated_length": 2893.40625,
"completions/min_length": 822.0,
"completions/min_terminated_length": 822.0,
"entropy": 0.26535007590427995,
"epoch": 0.672,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24309861660003662,
"kl": 0.013757865759544075,
"learning_rate": 6.7241444507450474e-06,
"loss": -0.0118,
"num_tokens": 17863375.0,
"reward": 0.7944375276565552,
"reward_std": 0.3526097536087036,
"rewards/alfworld_rollout_reward_func/mean": 0.7944375276565552,
"rewards/alfworld_rollout_reward_func/std": 0.4054880440235138,
"sampling/importance_sampling_ratio/max": 2.7985546588897705,
"sampling/importance_sampling_ratio/mean": 0.6256030797958374,
"sampling/importance_sampling_ratio/min": 0.02893834561109543,
"sampling/sampling_logp_difference/max": 1.398754596710205,
"sampling/sampling_logp_difference/mean": 0.017740968614816666,
"step": 84,
"step_time": 165.64675680000073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4573.0,
"completions/max_terminated_length": 4573.0,
"completions/mean_length": 3059.5625,
"completions/mean_terminated_length": 3059.5625,
"completions/min_length": 682.0,
"completions/min_terminated_length": 682.0,
"entropy": 0.4013794925995171,
"epoch": 0.68,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32808718085289,
"kl": 0.01802923761715647,
"learning_rate": 6.438302847821043e-06,
"loss": 0.1523,
"num_tokens": 17977121.0,
"reward": 0.33393749594688416,
"reward_std": 0.5907015204429626,
"rewards/alfworld_rollout_reward_func/mean": 0.33393749594688416,
"rewards/alfworld_rollout_reward_func/std": 0.5358816385269165,
"sampling/importance_sampling_ratio/max": 1.924164891242981,
"sampling/importance_sampling_ratio/mean": 0.3141007125377655,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 51.21648025512695,
"sampling/sampling_logp_difference/mean": 0.03382871672511101,
"step": 85,
"step_time": 164.81202543501058
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10292.0,
"completions/max_terminated_length": 10292.0,
"completions/mean_length": 8584.96875,
"completions/mean_terminated_length": 8584.96875,
"completions/min_length": 2859.0,
"completions/min_terminated_length": 2859.0,
"entropy": 0.3392215413041413,
"epoch": 0.688,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6542012095451355,
"kl": 0.011928978288779035,
"learning_rate": 6.1565472494168055e-06,
"loss": 0.2016,
"num_tokens": 18277600.0,
"reward": 0.0807500034570694,
"reward_std": 0.30580368638038635,
"rewards/alfworld_rollout_reward_func/mean": 0.0807500034570694,
"rewards/alfworld_rollout_reward_func/std": 0.4279302656650543,
"sampling/importance_sampling_ratio/max": 2.9803245067596436,
"sampling/importance_sampling_ratio/mean": 0.6060307025909424,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7128372192382812,
"sampling/sampling_logp_difference/mean": 0.022582147270441055,
"step": 86,
"step_time": 378.8619575729863
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9014.0,
"completions/max_terminated_length": 9014.0,
"completions/mean_length": 5255.03125,
"completions/mean_terminated_length": 5255.03125,
"completions/min_length": 2081.0,
"completions/min_terminated_length": 2081.0,
"entropy": 0.27021760190837085,
"epoch": 0.696,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5374975800514221,
"kl": 0.009437592703761766,
"learning_rate": 5.879067578357521e-06,
"loss": 0.851,
"num_tokens": 18470369.0,
"reward": 0.5600000023841858,
"reward_std": 0.4250272810459137,
"rewards/alfworld_rollout_reward_func/mean": 0.5600000023841858,
"rewards/alfworld_rollout_reward_func/std": 0.5282309055328369,
"sampling/importance_sampling_ratio/max": 2.8345797061920166,
"sampling/importance_sampling_ratio/mean": 0.8371989130973816,
"sampling/importance_sampling_ratio/min": 0.00144236593041569,
"sampling/sampling_logp_difference/max": 1.072211742401123,
"sampling/sampling_logp_difference/mean": 0.017154190689325333,
"step": 87,
"step_time": 283.20546714899683
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6547.0,
"completions/max_terminated_length": 6547.0,
"completions/mean_length": 5855.5,
"completions/mean_terminated_length": 5855.5,
"completions/min_length": 3651.0,
"completions/min_terminated_length": 3651.0,
"entropy": 0.3743594288825989,
"epoch": 0.704,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3856624364852905,
"kl": 0.0597444533923408,
"learning_rate": 5.6060508751966186e-06,
"loss": 0.8742,
"num_tokens": 18676593.0,
"reward": -0.04043750464916229,
"reward_std": 0.1648387759923935,
"rewards/alfworld_rollout_reward_func/mean": -0.04043750464916229,
"rewards/alfworld_rollout_reward_func/std": 0.31141039729118347,
"sampling/importance_sampling_ratio/max": 2.986142635345459,
"sampling/importance_sampling_ratio/mean": 0.5198343992233276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3955113887786865,
"sampling/sampling_logp_difference/mean": 0.02004336006939411,
"step": 88,
"step_time": 254.65699046499503
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6107.0,
"completions/max_terminated_length": 6107.0,
"completions/mean_length": 2628.4375,
"completions/mean_terminated_length": 2628.4375,
"completions/min_length": 1046.0,
"completions/min_terminated_length": 1046.0,
"entropy": 0.2884806345682591,
"epoch": 0.712,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4651772379875183,
"kl": 0.012305404627113603,
"learning_rate": 5.3376811721374765e-06,
"loss": 0.1412,
"num_tokens": 18777311.0,
"reward": 0.6704374551773071,
"reward_std": 0.28652966022491455,
"rewards/alfworld_rollout_reward_func/mean": 0.6704374551773071,
"rewards/alfworld_rollout_reward_func/std": 0.486878365278244,
"sampling/importance_sampling_ratio/max": 2.315708875656128,
"sampling/importance_sampling_ratio/mean": 0.7435892820358276,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3662152290344238,
"sampling/sampling_logp_difference/mean": 0.02060602232813835,
"step": 89,
"step_time": 160.6579905329927
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5816.0,
"completions/max_terminated_length": 5816.0,
"completions/mean_length": 4033.40625,
"completions/mean_terminated_length": 4033.40625,
"completions/min_length": 1956.0,
"completions/min_terminated_length": 1956.0,
"entropy": 0.39709911681711674,
"epoch": 0.72,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5232568383216858,
"kl": 0.008380314080568496,
"learning_rate": 5.074139368982922e-06,
"loss": -0.0378,
"num_tokens": 18921996.0,
"reward": 0.36518752574920654,
"reward_std": 0.5018154978752136,
"rewards/alfworld_rollout_reward_func/mean": 0.36518752574920654,
"rewards/alfworld_rollout_reward_func/std": 0.5176995992660522,
"sampling/importance_sampling_ratio/max": 2.772024393081665,
"sampling/importance_sampling_ratio/mean": 0.5188237428665161,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2695866823196411,
"sampling/sampling_logp_difference/mean": 0.021091943606734276,
"step": 90,
"step_time": 223.63505906601858
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 8433.0,
"completions/max_terminated_length": 8433.0,
"completions/mean_length": 5452.28125,
"completions/mean_terminated_length": 5452.28125,
"completions/min_length": 1688.0,
"completions/min_terminated_length": 1688.0,
"entropy": 0.31468175584450364,
"epoch": 0.728,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6314539909362793,
"kl": 0.010496957955183461,
"learning_rate": 4.81560311119624e-06,
"loss": 0.7913,
"num_tokens": 19119445.0,
"reward": 0.515999972820282,
"reward_std": 0.49320709705352783,
"rewards/alfworld_rollout_reward_func/mean": 0.515999972820282,
"rewards/alfworld_rollout_reward_func/std": 0.5465896725654602,
"sampling/importance_sampling_ratio/max": 2.785933017730713,
"sampling/importance_sampling_ratio/mean": 0.6670159697532654,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1129628419876099,
"sampling/sampling_logp_difference/mean": 0.019474081695079803,
"step": 91,
"step_time": 287.403252778
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4995.0,
"completions/max_terminated_length": 4995.0,
"completions/mean_length": 3509.4375,
"completions/mean_terminated_length": 3509.4375,
"completions/min_length": 1214.0,
"completions/min_terminated_length": 1214.0,
"entropy": 0.40855799289420247,
"epoch": 0.736,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7601175308227539,
"kl": 0.010548980098974425,
"learning_rate": 4.562246670155769e-06,
"loss": -0.6266,
"num_tokens": 19245923.0,
"reward": 0.2528125047683716,
"reward_std": 0.4320943057537079,
"rewards/alfworld_rollout_reward_func/mean": 0.2528125047683716,
"rewards/alfworld_rollout_reward_func/std": 0.5050470232963562,
"sampling/importance_sampling_ratio/max": 1.8565300703048706,
"sampling/importance_sampling_ratio/mean": 0.5377044081687927,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7181758880615234,
"sampling/sampling_logp_difference/mean": 0.022393332794308662,
"step": 92,
"step_time": 197.56333248000374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7078.0,
"completions/max_terminated_length": 7078.0,
"completions/mean_length": 3108.125,
"completions/mean_terminated_length": 3108.125,
"completions/min_length": 727.0,
"completions/min_terminated_length": 727.0,
"entropy": 0.27243693731725216,
"epoch": 0.744,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20756395161151886,
"kl": 0.012253020591742825,
"learning_rate": 4.314240825683938e-06,
"loss": 0.1539,
"num_tokens": 19366375.0,
"reward": 0.6510000228881836,
"reward_std": 0.4009626507759094,
"rewards/alfworld_rollout_reward_func/mean": 0.6510000228881836,
"rewards/alfworld_rollout_reward_func/std": 0.5017029047012329,
"sampling/importance_sampling_ratio/max": 1.8515329360961914,
"sampling/importance_sampling_ratio/mean": 0.6361359357833862,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 51.044925689697266,
"sampling/sampling_logp_difference/mean": 0.023394249379634857,
"step": 93,
"step_time": 195.112117294997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 5563.0,
"completions/max_terminated_length": 5298.0,
"completions/mean_length": 3876.15625,
"completions/mean_terminated_length": 3821.741943359375,
"completions/min_length": 1038.0,
"completions/min_terminated_length": 1038.0,
"entropy": 0.3827931974083185,
"epoch": 0.752,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22425270080566406,
"kl": 0.009305694955401123,
"learning_rate": 4.071752750929776e-06,
"loss": -0.2032,
"num_tokens": 19505132.0,
"reward": 0.09706249833106995,
"reward_std": 0.36825689673423767,
"rewards/alfworld_rollout_reward_func/mean": 0.09706249833106995,
"rewards/alfworld_rollout_reward_func/std": 0.45590633153915405,
"sampling/importance_sampling_ratio/max": 1.0314832925796509,
"sampling/importance_sampling_ratio/mean": 0.26068440079689026,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.0274133682250977,
"sampling/sampling_logp_difference/mean": 0.021985212340950966,
"step": 94,
"step_time": 213.6653461980095
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10037.0,
"completions/max_terminated_length": 10037.0,
"completions/mean_length": 7284.71875,
"completions/mean_terminated_length": 7284.71875,
"completions/min_length": 1864.0,
"completions/min_terminated_length": 1864.0,
"entropy": 0.32221671054139733,
"epoch": 0.76,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28681808710098267,
"kl": 0.009068961640878115,
"learning_rate": 3.834945899682642e-06,
"loss": 0.2213,
"num_tokens": 19762819.0,
"reward": 0.211062490940094,
"reward_std": 0.41743093729019165,
"rewards/alfworld_rollout_reward_func/mean": 0.211062490940094,
"rewards/alfworld_rollout_reward_func/std": 0.4870281219482422,
"sampling/importance_sampling_ratio/max": 2.7327306270599365,
"sampling/importance_sampling_ratio/mean": 0.4521108865737915,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1790721416473389,
"sampling/sampling_logp_difference/mean": 0.020200295373797417,
"step": 95,
"step_time": 362.24372679200314
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6032.0,
"completions/max_terminated_length": 6032.0,
"completions/mean_length": 3912.6875,
"completions/mean_terminated_length": 3912.6875,
"completions/min_length": 1226.0,
"completions/min_terminated_length": 1226.0,
"entropy": 0.3016722968313843,
"epoch": 0.768,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7907325625419617,
"kl": 0.007123322902771179,
"learning_rate": 3.6039798961929995e-06,
"loss": 2.2501,
"num_tokens": 19905657.0,
"reward": 0.29868751764297485,
"reward_std": 0.4651522636413574,
"rewards/alfworld_rollout_reward_func/mean": 0.29868751764297485,
"rewards/alfworld_rollout_reward_func/std": 0.5572153329849243,
"sampling/importance_sampling_ratio/max": 2.9006409645080566,
"sampling/importance_sampling_ratio/mean": 0.6974336504936218,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1841108798980713,
"sampling/sampling_logp_difference/mean": 0.01824130117893219,
"step": 96,
"step_time": 207.6763199779889
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13391.0,
"completions/max_terminated_length": 13391.0,
"completions/mean_length": 9185.15625,
"completions/mean_terminated_length": 9185.15625,
"completions/min_length": 1412.0,
"completions/min_terminated_length": 1412.0,
"entropy": 0.3281913371756673,
"epoch": 0.776,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5702968835830688,
"kl": 0.01132094238710124,
"learning_rate": 3.379010427574625e-06,
"loss": 1.5224,
"num_tokens": 20232670.0,
"reward": 0.33243751525878906,
"reward_std": 0.5196202993392944,
"rewards/alfworld_rollout_reward_func/mean": 0.33243751525878906,
"rewards/alfworld_rollout_reward_func/std": 0.5371140837669373,
"sampling/importance_sampling_ratio/max": 2.960601568222046,
"sampling/importance_sampling_ratio/mean": 0.5209211111068726,
"sampling/importance_sampling_ratio/min": 1.0663071004331643e-30,
"sampling/sampling_logp_difference/max": 53.7265510559082,
"sampling/sampling_logp_difference/mean": 0.023852139711380005,
"step": 97,
"step_time": 476.51617424899814
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9493.0,
"completions/max_terminated_length": 9493.0,
"completions/mean_length": 8688.34375,
"completions/mean_terminated_length": 8688.34375,
"completions/min_length": 7963.0,
"completions/min_terminated_length": 7963.0,
"entropy": 0.2913727913983166,
"epoch": 0.784,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47073104977607727,
"kl": 0.016036324086599052,
"learning_rate": 3.160189138860671e-06,
"loss": 0.1501,
"num_tokens": 20535273.0,
"reward": -0.08656249940395355,
"reward_std": 0.028624679893255234,
"rewards/alfworld_rollout_reward_func/mean": -0.08656249940395355,
"rewards/alfworld_rollout_reward_func/std": 0.03789709135890007,
"sampling/importance_sampling_ratio/max": 1.8403443098068237,
"sampling/importance_sampling_ratio/mean": 0.3093043565750122,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.646960735321045,
"sampling/sampling_logp_difference/mean": 0.020046139135956764,
"step": 98,
"step_time": 365.8523392380048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5668.0,
"completions/max_terminated_length": 5668.0,
"completions/mean_length": 4537.65625,
"completions/mean_terminated_length": 4537.65625,
"completions/min_length": 1378.0,
"completions/min_terminated_length": 1378.0,
"entropy": 0.3762055607512593,
"epoch": 0.792,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27318060398101807,
"kl": 0.00881329235562589,
"learning_rate": 2.947663530784388e-06,
"loss": 0.4014,
"num_tokens": 20697662.0,
"reward": 0.2487500011920929,
"reward_std": 0.48654043674468994,
"rewards/alfworld_rollout_reward_func/mean": 0.2487500011920929,
"rewards/alfworld_rollout_reward_func/std": 0.5086416006088257,
"sampling/importance_sampling_ratio/max": 1.298455834388733,
"sampling/importance_sampling_ratio/mean": 0.33954960107803345,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 57.009605407714844,
"sampling/sampling_logp_difference/mean": 0.023928016424179077,
"step": 99,
"step_time": 227.85088505800013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7124.0,
"completions/max_terminated_length": 7124.0,
"completions/mean_length": 3760.96875,
"completions/mean_terminated_length": 3760.96875,
"completions/min_length": 1125.0,
"completions/min_terminated_length": 1125.0,
"entropy": 0.3388551725074649,
"epoch": 0.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3301438093185425,
"kl": 0.014853068540105596,
"learning_rate": 2.7415768603533996e-06,
"loss": 0.0864,
"num_tokens": 20837821.0,
"reward": 0.6660000085830688,
"reward_std": 0.4354756474494934,
"rewards/alfworld_rollout_reward_func/mean": 0.6660000085830688,
"rewards/alfworld_rollout_reward_func/std": 0.486807644367218,
"sampling/importance_sampling_ratio/max": 2.7073638439178467,
"sampling/importance_sampling_ratio/mean": 0.5111602544784546,
"sampling/importance_sampling_ratio/min": 0.004309141077101231,
"sampling/sampling_logp_difference/max": 1.547012209892273,
"sampling/sampling_logp_difference/mean": 0.021454855799674988,
"step": 100,
"step_time": 210.89971194700775
}
],
"logging_steps": 1,
"max_steps": 125,
"num_input_tokens_seen": 20837821,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}