example-repo / trainer_state.json
trenden's picture
Upload task output 6
3bd08d8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.04631578947368421,
"eval_steps": 500,
"global_step": 44,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 39.0,
"completions/max_terminated_length": 39.0,
"completions/mean_length": 38.0,
"completions/mean_terminated_length": 38.0,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.4614375829696655,
"epoch": 0.0010526315789473684,
"frac_reward_zero_std": 0.0,
"grad_norm": 23.0,
"kl": 0.002436438575387001,
"learning_rate": 0.0,
"loss": 0.1571,
"num_tokens": 8138.0,
"reward": -0.10500000417232513,
"reward_std": 0.021213199943304062,
"rewards/alfworld_rollout_reward_func/mean": -0.10500000417232513,
"rewards/alfworld_rollout_reward_func/std": 0.021213199943304062,
"sampling/importance_sampling_ratio/max": 0.9498974680900574,
"sampling/importance_sampling_ratio/mean": 0.7463880777359009,
"sampling/importance_sampling_ratio/min": 0.5428786873817444,
"sampling/sampling_logp_difference/max": 0.2791634798049927,
"sampling/sampling_logp_difference/mean": 0.02213391289114952,
"step": 1,
"step_time": 21.471763861
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 54.0,
"completions/max_terminated_length": 54.0,
"completions/mean_length": 49.5,
"completions/mean_terminated_length": 49.5,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"entropy": 0.6891850829124451,
"epoch": 0.002105263157894737,
"frac_reward_zero_std": 0.0,
"grad_norm": 13.4375,
"kl": 0.0023963379207998514,
"learning_rate": 2.0000000000000002e-07,
"loss": -0.0442,
"num_tokens": 16245.0,
"reward": -0.08500000089406967,
"reward_std": 0.007071071770042181,
"rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967,
"rewards/alfworld_rollout_reward_func/std": 0.007071071770042181,
"sampling/importance_sampling_ratio/max": 0.6407822966575623,
"sampling/importance_sampling_ratio/mean": 0.5291908383369446,
"sampling/importance_sampling_ratio/min": 0.4175994098186493,
"sampling/sampling_logp_difference/max": 0.26871776580810547,
"sampling/sampling_logp_difference/mean": 0.0313858687877655,
"step": 2,
"step_time": 19.41586231600013
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 35.0,
"completions/max_terminated_length": 35.0,
"completions/mean_length": 32.5,
"completions/mean_terminated_length": 32.5,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.2943471372127533,
"epoch": 0.003157894736842105,
"frac_reward_zero_std": 0.0,
"grad_norm": 9.0625,
"kl": 0.00032700574956834316,
"learning_rate": 4.0000000000000003e-07,
"loss": -0.0682,
"num_tokens": 24330.0,
"reward": -0.07000000029802322,
"reward_std": 0.05656854063272476,
"rewards/alfworld_rollout_reward_func/mean": -0.07000000029802322,
"rewards/alfworld_rollout_reward_func/std": 0.05656854063272476,
"sampling/importance_sampling_ratio/max": 1.0007613897323608,
"sampling/importance_sampling_ratio/mean": 0.8396192193031311,
"sampling/importance_sampling_ratio/min": 0.6784770488739014,
"sampling/sampling_logp_difference/max": 0.22780990600585938,
"sampling/sampling_logp_difference/mean": 0.013574070297181606,
"step": 3,
"step_time": 17.228165518000196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 45.0,
"completions/max_terminated_length": 45.0,
"completions/mean_length": 37.5,
"completions/mean_terminated_length": 37.5,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.4852295517921448,
"epoch": 0.004210526315789474,
"frac_reward_zero_std": 0.0,
"grad_norm": 22.75,
"kl": 0.0009841235587373376,
"learning_rate": 6.000000000000001e-07,
"loss": 0.0262,
"num_tokens": 32171.0,
"reward": -0.04500000178813934,
"reward_std": 0.0353553406894207,
"rewards/alfworld_rollout_reward_func/mean": -0.04500000178813934,
"rewards/alfworld_rollout_reward_func/std": 0.0353553369641304,
"sampling/importance_sampling_ratio/max": 0.9375013709068298,
"sampling/importance_sampling_ratio/mean": 0.7503011226654053,
"sampling/importance_sampling_ratio/min": 0.5631008148193359,
"sampling/sampling_logp_difference/max": 0.28363165259361267,
"sampling/sampling_logp_difference/mean": 0.02271696925163269,
"step": 4,
"step_time": 17.49860281700012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 51.0,
"completions/max_terminated_length": 51.0,
"completions/mean_length": 27.5,
"completions/mean_terminated_length": 27.5,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"entropy": 0.5301113724708557,
"epoch": 0.005263157894736842,
"frac_reward_zero_std": 0.0,
"grad_norm": 28.125,
"kl": 0.0012831644853577018,
"learning_rate": 8.000000000000001e-07,
"loss": 0.4587,
"num_tokens": 40277.0,
"reward": -0.004999999888241291,
"reward_std": 0.007071067579090595,
"rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291,
"rewards/alfworld_rollout_reward_func/std": 0.007071067579090595,
"sampling/importance_sampling_ratio/max": 1.0000011920928955,
"sampling/importance_sampling_ratio/mean": 0.893904447555542,
"sampling/importance_sampling_ratio/min": 0.7878076434135437,
"sampling/sampling_logp_difference/max": 0.21004503965377808,
"sampling/sampling_logp_difference/mean": 0.02911142073571682,
"step": 5,
"step_time": 12.491938435000066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 26.0,
"completions/max_terminated_length": 26.0,
"completions/mean_length": 16.5,
"completions/mean_terminated_length": 16.5,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"entropy": 0.1027420163154602,
"epoch": 0.00631578947368421,
"frac_reward_zero_std": 0.0,
"grad_norm": 18.875,
"kl": 4.723216625279747e-05,
"learning_rate": 1.0000000000000002e-06,
"loss": -0.397,
"num_tokens": 48155.0,
"reward": -0.029999999329447746,
"reward_std": 0.01414213515818119,
"rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746,
"rewards/alfworld_rollout_reward_func/std": 0.01414213515818119,
"sampling/importance_sampling_ratio/max": 1.0009899139404297,
"sampling/importance_sampling_ratio/mean": 0.9940400719642639,
"sampling/importance_sampling_ratio/min": 0.9870902299880981,
"sampling/sampling_logp_difference/max": 0.15823769569396973,
"sampling/sampling_logp_difference/mean": 0.009266156703233719,
"step": 6,
"step_time": 23.18857599299986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 37.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 24.0,
"completions/mean_terminated_length": 24.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"entropy": 0.45579978823661804,
"epoch": 0.007368421052631579,
"frac_reward_zero_std": 0.0,
"grad_norm": 46.75,
"kl": 0.0014351233839988708,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.607,
"num_tokens": 55983.0,
"reward": -0.019999999552965164,
"reward_std": 0.02828427031636238,
"rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164,
"rewards/alfworld_rollout_reward_func/std": 0.02828427031636238,
"sampling/importance_sampling_ratio/max": 1.4146320819854736,
"sampling/importance_sampling_ratio/mean": 1.2070283889770508,
"sampling/importance_sampling_ratio/min": 0.9994246959686279,
"sampling/sampling_logp_difference/max": 0.1907503604888916,
"sampling/sampling_logp_difference/mean": 0.028974320739507675,
"step": 7,
"step_time": 18.246992883999837
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 47.0,
"completions/max_terminated_length": 47.0,
"completions/mean_length": 28.0,
"completions/mean_terminated_length": 28.0,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"entropy": 0.5107156038284302,
"epoch": 0.008421052631578947,
"frac_reward_zero_std": 0.0,
"grad_norm": 58.75,
"kl": 0.0012022192822769284,
"learning_rate": 1.4000000000000001e-06,
"loss": -0.6365,
"num_tokens": 64025.0,
"reward": -0.08500000089406967,
"reward_std": 0.007071071770042181,
"rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967,
"rewards/alfworld_rollout_reward_func/std": 0.007071071770042181,
"sampling/importance_sampling_ratio/max": 1.279162883758545,
"sampling/importance_sampling_ratio/mean": 1.1397144794464111,
"sampling/importance_sampling_ratio/min": 1.0002660751342773,
"sampling/sampling_logp_difference/max": 0.5383334159851074,
"sampling/sampling_logp_difference/mean": 0.034098681062459946,
"step": 8,
"step_time": 14.293913408000208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 56.0,
"completions/max_terminated_length": 56.0,
"completions/mean_length": 44.5,
"completions/mean_terminated_length": 44.5,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.4052967131137848,
"epoch": 0.009473684210526316,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.625,
"kl": 0.0016700377454981208,
"learning_rate": 1.6000000000000001e-06,
"loss": -0.2022,
"num_tokens": 71652.0,
"reward": 0.4650000035762787,
"reward_std": 0.6576092839241028,
"rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787,
"rewards/alfworld_rollout_reward_func/std": 0.6576092839241028,
"sampling/importance_sampling_ratio/max": 0.7853229641914368,
"sampling/importance_sampling_ratio/mean": 0.6733799576759338,
"sampling/importance_sampling_ratio/min": 0.5614369511604309,
"sampling/sampling_logp_difference/max": 0.27764952182769775,
"sampling/sampling_logp_difference/mean": 0.01565438136458397,
"step": 9,
"step_time": 16.68387536299997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 35.0,
"completions/max_terminated_length": 35.0,
"completions/mean_length": 30.5,
"completions/mean_terminated_length": 30.5,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 0.33000221848487854,
"epoch": 0.010526315789473684,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.013671875,
"kl": 0.0005048786988481879,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.0,
"num_tokens": 79470.0,
"reward": -0.029999999329447746,
"reward_std": 0.0,
"rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746,
"rewards/alfworld_rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 0.9141315221786499,
"sampling/importance_sampling_ratio/mean": 0.7841682434082031,
"sampling/importance_sampling_ratio/min": 0.6542050242424011,
"sampling/sampling_logp_difference/max": 0.24437618255615234,
"sampling/sampling_logp_difference/mean": 0.01810554973781109,
"step": 10,
"step_time": 17.949971448000042
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 45.0,
"completions/max_terminated_length": 45.0,
"completions/mean_length": 41.5,
"completions/mean_terminated_length": 41.5,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.5000134706497192,
"epoch": 0.011578947368421053,
"frac_reward_zero_std": 0.0,
"grad_norm": 14.125,
"kl": 0.001739501953125,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0346,
"num_tokens": 87509.0,
"reward": -0.044999998062849045,
"reward_std": 0.04949747398495674,
"rewards/alfworld_rollout_reward_func/mean": -0.044999998062849045,
"rewards/alfworld_rollout_reward_func/std": 0.04949747398495674,
"sampling/importance_sampling_ratio/max": 0.6740682721138,
"sampling/importance_sampling_ratio/mean": 0.6668994426727295,
"sampling/importance_sampling_ratio/min": 0.6597306132316589,
"sampling/sampling_logp_difference/max": 0.35384368896484375,
"sampling/sampling_logp_difference/mean": 0.021013759076595306,
"step": 11,
"step_time": 15.37352415700002
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 45.0,
"completions/max_terminated_length": 45.0,
"completions/mean_length": 29.0,
"completions/mean_terminated_length": 29.0,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"entropy": 0.37302640080451965,
"epoch": 0.01263157894736842,
"frac_reward_zero_std": 0.0,
"grad_norm": 39.0,
"kl": 0.0027003493160009384,
"learning_rate": 2.2e-06,
"loss": -0.5275,
"num_tokens": 95262.0,
"reward": -0.06000000238418579,
"reward_std": 0.04242640733718872,
"rewards/alfworld_rollout_reward_func/mean": -0.06000000238418579,
"rewards/alfworld_rollout_reward_func/std": 0.04242641106247902,
"sampling/importance_sampling_ratio/max": 1.2358118295669556,
"sampling/importance_sampling_ratio/mean": 1.088789463043213,
"sampling/importance_sampling_ratio/min": 0.9417669773101807,
"sampling/sampling_logp_difference/max": 0.3406403064727783,
"sampling/sampling_logp_difference/mean": 0.028803091496229172,
"step": 12,
"step_time": 20.290826388999903
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 23.0,
"completions/max_terminated_length": 23.0,
"completions/mean_length": 15.0,
"completions/mean_terminated_length": 15.0,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"entropy": 0.07970059663057327,
"epoch": 0.01368421052631579,
"frac_reward_zero_std": 0.0,
"grad_norm": 25.125,
"kl": 0.0003672480524983257,
"learning_rate": 2.4000000000000003e-06,
"loss": -0.3785,
"num_tokens": 102993.0,
"reward": -0.004999999888241291,
"reward_std": 0.007071067579090595,
"rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291,
"rewards/alfworld_rollout_reward_func/std": 0.007071067579090595,
"sampling/importance_sampling_ratio/max": 1.012474536895752,
"sampling/importance_sampling_ratio/mean": 1.006240963935852,
"sampling/importance_sampling_ratio/min": 1.0000073909759521,
"sampling/sampling_logp_difference/max": 0.011289931833744049,
"sampling/sampling_logp_difference/mean": 0.00044049008283764124,
"step": 13,
"step_time": 18.20201583400012
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 36.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 24.5,
"completions/mean_terminated_length": 24.5,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"entropy": 0.21557021141052246,
"epoch": 0.014736842105263158,
"frac_reward_zero_std": 0.0,
"grad_norm": 28.375,
"kl": 0.005430301651358604,
"learning_rate": 2.6e-06,
"loss": -0.3004,
"num_tokens": 111031.0,
"reward": 0.42500001192092896,
"reward_std": 0.7141778469085693,
"rewards/alfworld_rollout_reward_func/mean": 0.42500001192092896,
"rewards/alfworld_rollout_reward_func/std": 0.7141778469085693,
"sampling/importance_sampling_ratio/max": 0.8329165577888489,
"sampling/importance_sampling_ratio/mean": 0.7687587738037109,
"sampling/importance_sampling_ratio/min": 0.7046010494232178,
"sampling/sampling_logp_difference/max": 0.3576321601867676,
"sampling/sampling_logp_difference/mean": 0.016674496233463287,
"step": 14,
"step_time": 11.272263890000204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 23.0,
"completions/max_terminated_length": 23.0,
"completions/mean_length": 15.0,
"completions/mean_terminated_length": 15.0,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"entropy": 0.18071487545967102,
"epoch": 0.015789473684210527,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.625,
"kl": 0.00020178158592898399,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.4877,
"num_tokens": 118867.0,
"reward": -0.004999999888241291,
"reward_std": 0.007071067579090595,
"rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291,
"rewards/alfworld_rollout_reward_func/std": 0.007071067579090595,
"sampling/importance_sampling_ratio/max": 1.216752290725708,
"sampling/importance_sampling_ratio/mean": 1.1083970069885254,
"sampling/importance_sampling_ratio/min": 1.0000418424606323,
"sampling/sampling_logp_difference/max": 0.11035466194152832,
"sampling/sampling_logp_difference/mean": 0.007579161319881678,
"step": 15,
"step_time": 20.160193882000158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 11.5,
"completions/mean_terminated_length": 11.5,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"entropy": 0.07483004778623581,
"epoch": 0.016842105263157894,
"frac_reward_zero_std": 0.0,
"grad_norm": 29.625,
"kl": 0.00041633585351519287,
"learning_rate": 3e-06,
"loss": -0.0959,
"num_tokens": 126279.0,
"reward": 0.429999977350235,
"reward_std": 0.7495331764221191,
"rewards/alfworld_rollout_reward_func/mean": 0.429999977350235,
"rewards/alfworld_rollout_reward_func/std": 0.7495331764221191,
"sampling/importance_sampling_ratio/max": 1.1467688083648682,
"sampling/importance_sampling_ratio/mean": 1.057037591934204,
"sampling/importance_sampling_ratio/min": 0.9673064351081848,
"sampling/sampling_logp_difference/max": 0.11966276168823242,
"sampling/sampling_logp_difference/mean": 0.007402568124234676,
"step": 16,
"step_time": 10.745571063999932
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 36.0,
"completions/max_terminated_length": 36.0,
"completions/mean_length": 33.0,
"completions/mean_terminated_length": 33.0,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 0.5801213383674622,
"epoch": 0.017894736842105262,
"frac_reward_zero_std": 0.0,
"grad_norm": 30.125,
"kl": 0.001962649170309305,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.1349,
"num_tokens": 134227.0,
"reward": 0.4749999940395355,
"reward_std": 0.6858935952186584,
"rewards/alfworld_rollout_reward_func/mean": 0.4749999940395355,
"rewards/alfworld_rollout_reward_func/std": 0.6858935952186584,
"sampling/importance_sampling_ratio/max": 1.1694220304489136,
"sampling/importance_sampling_ratio/mean": 1.0765215158462524,
"sampling/importance_sampling_ratio/min": 0.9836210608482361,
"sampling/sampling_logp_difference/max": 0.17440319061279297,
"sampling/sampling_logp_difference/mean": 0.022032134234905243,
"step": 17,
"step_time": 12.921286662000057
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 41.0,
"completions/max_terminated_length": 41.0,
"completions/mean_length": 39.5,
"completions/mean_terminated_length": 39.5,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.48242872953414917,
"epoch": 0.018947368421052633,
"frac_reward_zero_std": 0.0,
"grad_norm": 19.5,
"kl": 0.0010035536251962185,
"learning_rate": 3.4000000000000005e-06,
"loss": -0.0587,
"num_tokens": 142157.0,
"reward": -0.04999999701976776,
"reward_std": 0.04242640733718872,
"rewards/alfworld_rollout_reward_func/mean": -0.04999999701976776,
"rewards/alfworld_rollout_reward_func/std": 0.04242640733718872,
"sampling/importance_sampling_ratio/max": 1.0037174224853516,
"sampling/importance_sampling_ratio/mean": 0.9568833112716675,
"sampling/importance_sampling_ratio/min": 0.9100492596626282,
"sampling/sampling_logp_difference/max": 0.17302274703979492,
"sampling/sampling_logp_difference/mean": 0.02212933637201786,
"step": 18,
"step_time": 23.997690939999984
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 46.0,
"completions/max_terminated_length": 46.0,
"completions/mean_length": 44.0,
"completions/mean_terminated_length": 44.0,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.44313955307006836,
"epoch": 0.02,
"frac_reward_zero_std": 0.0,
"grad_norm": 19.5,
"kl": 0.0014339183690026402,
"learning_rate": 3.6000000000000003e-06,
"loss": -0.1594,
"num_tokens": 150213.0,
"reward": -0.05999999865889549,
"reward_std": 0.0707106813788414,
"rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549,
"rewards/alfworld_rollout_reward_func/std": 0.0707106739282608,
"sampling/importance_sampling_ratio/max": 1.1801482439041138,
"sampling/importance_sampling_ratio/mean": 0.9998883008956909,
"sampling/importance_sampling_ratio/min": 0.8196282982826233,
"sampling/sampling_logp_difference/max": 0.21845340728759766,
"sampling/sampling_logp_difference/mean": 0.022122323513031006,
"step": 19,
"step_time": 15.372385936
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 39.0,
"completions/max_terminated_length": 39.0,
"completions/mean_length": 36.5,
"completions/mean_terminated_length": 36.5,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.26607948541641235,
"epoch": 0.021052631578947368,
"frac_reward_zero_std": 0.0,
"grad_norm": 21.125,
"kl": 0.0008539336849935353,
"learning_rate": 3.8000000000000005e-06,
"loss": -0.0343,
"num_tokens": 157583.0,
"reward": 0.45500001311302185,
"reward_std": 0.742462158203125,
"rewards/alfworld_rollout_reward_func/mean": 0.45500001311302185,
"rewards/alfworld_rollout_reward_func/std": 0.7424620985984802,
"sampling/importance_sampling_ratio/max": 0.9992303848266602,
"sampling/importance_sampling_ratio/mean": 0.8897002339363098,
"sampling/importance_sampling_ratio/min": 0.7801700830459595,
"sampling/sampling_logp_difference/max": 0.2817434072494507,
"sampling/sampling_logp_difference/mean": 0.01816781423985958,
"step": 20,
"step_time": 21.228662558999986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 26.0,
"completions/max_terminated_length": 26.0,
"completions/mean_length": 19.5,
"completions/mean_terminated_length": 19.5,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"entropy": 0.28872889280319214,
"epoch": 0.022105263157894735,
"frac_reward_zero_std": 0.0,
"grad_norm": 18.75,
"kl": 0.00026875274488702416,
"learning_rate": 4.000000000000001e-06,
"loss": 0.191,
"num_tokens": 162725.0,
"reward": 0.9650000333786011,
"reward_std": 0.04949747025966644,
"rewards/alfworld_rollout_reward_func/mean": 0.9650000333786011,
"rewards/alfworld_rollout_reward_func/std": 0.04949747025966644,
"sampling/importance_sampling_ratio/max": 0.9999924898147583,
"sampling/importance_sampling_ratio/mean": 0.9529882669448853,
"sampling/importance_sampling_ratio/min": 0.905984103679657,
"sampling/sampling_logp_difference/max": 0.13952183723449707,
"sampling/sampling_logp_difference/mean": 0.012355787679553032,
"step": 21,
"step_time": 8.206261441999914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 41.0,
"completions/max_terminated_length": 41.0,
"completions/mean_length": 27.0,
"completions/mean_terminated_length": 27.0,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"entropy": 0.4186505675315857,
"epoch": 0.023157894736842106,
"frac_reward_zero_std": 0.0,
"grad_norm": 70.0,
"kl": 0.0038073172327131033,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.6479,
"num_tokens": 170762.0,
"reward": 0.4650000035762787,
"reward_std": 0.7566042542457581,
"rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787,
"rewards/alfworld_rollout_reward_func/std": 0.7566042542457581,
"sampling/importance_sampling_ratio/max": 1.66000235080719,
"sampling/importance_sampling_ratio/mean": 1.544608235359192,
"sampling/importance_sampling_ratio/min": 1.4292141199111938,
"sampling/sampling_logp_difference/max": 0.23494195938110352,
"sampling/sampling_logp_difference/mean": 0.03184577822685242,
"step": 22,
"step_time": 17.66746830000011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 44.0,
"completions/max_terminated_length": 44.0,
"completions/mean_length": 34.5,
"completions/mean_terminated_length": 34.5,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 0.3005879819393158,
"epoch": 0.024210526315789474,
"frac_reward_zero_std": 0.0,
"grad_norm": 20.5,
"kl": 0.02588764950633049,
"learning_rate": 4.4e-06,
"loss": 0.1134,
"num_tokens": 178722.0,
"reward": 0.48000001907348633,
"reward_std": 0.6929646730422974,
"rewards/alfworld_rollout_reward_func/mean": 0.48000001907348633,
"rewards/alfworld_rollout_reward_func/std": 0.6929646730422974,
"sampling/importance_sampling_ratio/max": 0.9999445676803589,
"sampling/importance_sampling_ratio/mean": 0.6585712432861328,
"sampling/importance_sampling_ratio/min": 0.3171979784965515,
"sampling/sampling_logp_difference/max": 1.331534504890442,
"sampling/sampling_logp_difference/mean": 0.03551221266388893,
"step": 23,
"step_time": 14.36746498499997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 35.0,
"completions/max_terminated_length": 35.0,
"completions/mean_length": 23.5,
"completions/mean_terminated_length": 23.5,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"entropy": 0.3918939232826233,
"epoch": 0.02526315789473684,
"frac_reward_zero_std": 0.0,
"grad_norm": 43.25,
"kl": 0.0027948389761149883,
"learning_rate": 4.600000000000001e-06,
"loss": 0.5853,
"num_tokens": 185227.0,
"reward": 0.4650000035762787,
"reward_std": 0.7424620389938354,
"rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787,
"rewards/alfworld_rollout_reward_func/std": 0.7424620389938354,
"sampling/importance_sampling_ratio/max": 1.4053035974502563,
"sampling/importance_sampling_ratio/mean": 1.130873203277588,
"sampling/importance_sampling_ratio/min": 0.8564428091049194,
"sampling/sampling_logp_difference/max": 0.16124820709228516,
"sampling/sampling_logp_difference/mean": 0.016462432220578194,
"step": 24,
"step_time": 11.062920657999939
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 31.0,
"completions/max_terminated_length": 31.0,
"completions/mean_length": 29.0,
"completions/mean_terminated_length": 29.0,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 0.2529997229576111,
"epoch": 0.02631578947368421,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.018310546875,
"kl": 0.0004380304308142513,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0,
"num_tokens": 193142.0,
"reward": -0.019999999552965164,
"reward_std": 0.0,
"rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164,
"rewards/alfworld_rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 1.0052870512008667,
"sampling/importance_sampling_ratio/mean": 0.8423590660095215,
"sampling/importance_sampling_ratio/min": 0.6794310212135315,
"sampling/sampling_logp_difference/max": 0.18489933013916016,
"sampling/sampling_logp_difference/mean": 0.014295091852545738,
"step": 25,
"step_time": 17.847312069000054
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 49.0,
"completions/max_terminated_length": 49.0,
"completions/mean_length": 31.0,
"completions/mean_terminated_length": 31.0,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"entropy": 0.3845043182373047,
"epoch": 0.02736842105263158,
"frac_reward_zero_std": 0.0,
"grad_norm": 50.0,
"kl": 0.002633685013279319,
"learning_rate": 5e-06,
"loss": -0.273,
"num_tokens": 200881.0,
"reward": -0.10999999940395355,
"reward_std": 0.014142133295536041,
"rewards/alfworld_rollout_reward_func/mean": -0.10999999940395355,
"rewards/alfworld_rollout_reward_func/std": 0.014142133295536041,
"sampling/importance_sampling_ratio/max": 1.0188355445861816,
"sampling/importance_sampling_ratio/mean": 0.8905454874038696,
"sampling/importance_sampling_ratio/min": 0.7622554302215576,
"sampling/sampling_logp_difference/max": 0.29382169246673584,
"sampling/sampling_logp_difference/mean": 0.03841578587889671,
"step": 26,
"step_time": 22.61463799199987
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 26.0,
"completions/max_terminated_length": 26.0,
"completions/mean_length": 17.0,
"completions/mean_terminated_length": 17.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"entropy": 0.32032543420791626,
"epoch": 0.028421052631578948,
"frac_reward_zero_std": 0.0,
"grad_norm": 33.75,
"kl": 0.0024931079242378473,
"learning_rate": 5.2e-06,
"loss": 0.3737,
"num_tokens": 208999.0,
"reward": 0.4950000047683716,
"reward_std": 0.7141778469085693,
"rewards/alfworld_rollout_reward_func/mean": 0.4950000047683716,
"rewards/alfworld_rollout_reward_func/std": 0.7141778469085693,
"sampling/importance_sampling_ratio/max": 1.1281967163085938,
"sampling/importance_sampling_ratio/mean": 1.083227276802063,
"sampling/importance_sampling_ratio/min": 1.0382578372955322,
"sampling/sampling_logp_difference/max": 0.17356586456298828,
"sampling/sampling_logp_difference/mean": 0.017684968188405037,
"step": 27,
"step_time": 11.613979460999872
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 14.0,
"completions/max_terminated_length": 14.0,
"completions/mean_length": 11.0,
"completions/mean_terminated_length": 11.0,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"entropy": 0.05829498916864395,
"epoch": 0.029473684210526315,
"frac_reward_zero_std": 0.0,
"grad_norm": 32.25,
"kl": 0.0006212808657437563,
"learning_rate": 5.400000000000001e-06,
"loss": -0.2479,
"num_tokens": 214157.0,
"reward": 0.4650000035762787,
"reward_std": 0.7424620389938354,
"rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787,
"rewards/alfworld_rollout_reward_func/std": 0.7424620389938354,
"sampling/importance_sampling_ratio/max": 0.9999884366989136,
"sampling/importance_sampling_ratio/mean": 0.8929275274276733,
"sampling/importance_sampling_ratio/min": 0.7858666181564331,
"sampling/sampling_logp_difference/max": 0.21855998039245605,
"sampling/sampling_logp_difference/mean": 0.01126509066671133,
"step": 28,
"step_time": 9.151633475999915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 37.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 36.5,
"completions/mean_terminated_length": 36.5,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.46773216128349304,
"epoch": 0.030526315789473683,
"frac_reward_zero_std": 0.0,
"grad_norm": 86.0,
"kl": 0.015533313155174255,
"learning_rate": 5.600000000000001e-06,
"loss": -0.5057,
"num_tokens": 222111.0,
"reward": -0.03500000014901161,
"reward_std": 0.02121320366859436,
"rewards/alfworld_rollout_reward_func/mean": -0.03500000014901161,
"rewards/alfworld_rollout_reward_func/std": 0.02121320366859436,
"sampling/importance_sampling_ratio/max": 2.2050247192382812,
"sampling/importance_sampling_ratio/mean": 1.4661935567855835,
"sampling/importance_sampling_ratio/min": 0.727362334728241,
"sampling/sampling_logp_difference/max": 1.248981237411499,
"sampling/sampling_logp_difference/mean": 0.042725156992673874,
"step": 29,
"step_time": 20.633877447999794
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 44.0,
"completions/max_terminated_length": 44.0,
"completions/mean_length": 28.0,
"completions/mean_terminated_length": 28.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"entropy": 0.18183040618896484,
"epoch": 0.031578947368421054,
"frac_reward_zero_std": 0.0,
"grad_norm": 29.625,
"kl": 0.0012305846903473139,
"learning_rate": 5.8e-06,
"loss": 0.3004,
"num_tokens": 229439.0,
"reward": 0.44999998807907104,
"reward_std": 0.7778174877166748,
"rewards/alfworld_rollout_reward_func/mean": 0.44999998807907104,
"rewards/alfworld_rollout_reward_func/std": 0.7778174877166748,
"sampling/importance_sampling_ratio/max": 0.9991921782493591,
"sampling/importance_sampling_ratio/mean": 0.9062168598175049,
"sampling/importance_sampling_ratio/min": 0.8132414817810059,
"sampling/sampling_logp_difference/max": 0.1453406810760498,
"sampling/sampling_logp_difference/mean": 0.007052628789097071,
"step": 30,
"step_time": 15.40591485699997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 56.0,
"completions/max_terminated_length": 56.0,
"completions/mean_length": 49.5,
"completions/mean_terminated_length": 49.5,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.3357900381088257,
"epoch": 0.03263157894736842,
"frac_reward_zero_std": 0.0,
"grad_norm": 17.875,
"kl": 0.004221746232360601,
"learning_rate": 6e-06,
"loss": -0.0807,
"num_tokens": 237387.0,
"reward": -0.05999999865889549,
"reward_std": 0.02828427031636238,
"rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549,
"rewards/alfworld_rollout_reward_func/std": 0.02828427031636238,
"sampling/importance_sampling_ratio/max": 0.8237836956977844,
"sampling/importance_sampling_ratio/mean": 0.8164201974868774,
"sampling/importance_sampling_ratio/min": 0.8090566992759705,
"sampling/sampling_logp_difference/max": 0.2831292152404785,
"sampling/sampling_logp_difference/mean": 0.024411508813500404,
"step": 31,
"step_time": 19.268974757000024
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 33.0,
"completions/max_terminated_length": 33.0,
"completions/mean_length": 32.0,
"completions/mean_terminated_length": 32.0,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.2340346872806549,
"epoch": 0.03368421052631579,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.049560546875,
"kl": 0.0027104970067739487,
"learning_rate": 6.200000000000001e-06,
"loss": 0.0,
"num_tokens": 245449.0,
"reward": -0.05000000074505806,
"reward_std": 0.0,
"rewards/alfworld_rollout_reward_func/mean": -0.05000000074505806,
"rewards/alfworld_rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.6312272548675537,
"sampling/importance_sampling_ratio/mean": 1.798392653465271,
"sampling/importance_sampling_ratio/min": 0.9655579924583435,
"sampling/sampling_logp_difference/max": 0.5748621225357056,
"sampling/sampling_logp_difference/mean": 0.029400669038295746,
"step": 32,
"step_time": 24.658211100000017
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 31.0,
"completions/max_terminated_length": 31.0,
"completions/mean_length": 20.5,
"completions/mean_terminated_length": 20.5,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"entropy": 0.050181157886981964,
"epoch": 0.034736842105263156,
"frac_reward_zero_std": 0.0,
"grad_norm": 19.125,
"kl": 0.003014294197782874,
"learning_rate": 6.4000000000000006e-06,
"loss": -0.3172,
"num_tokens": 253476.0,
"reward": -0.014999999664723873,
"reward_std": 0.007071067579090595,
"rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873,
"rewards/alfworld_rollout_reward_func/std": 0.007071067579090595,
"sampling/importance_sampling_ratio/max": 1.2256042957305908,
"sampling/importance_sampling_ratio/mean": 1.1113841533660889,
"sampling/importance_sampling_ratio/min": 0.9971638917922974,
"sampling/sampling_logp_difference/max": 0.23240363597869873,
"sampling/sampling_logp_difference/mean": 0.006807921454310417,
"step": 33,
"step_time": 12.894395297000074
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 13.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"entropy": 8.056841761572286e-05,
"epoch": 0.035789473684210524,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031494140625,
"kl": 9.934107758624577e-09,
"learning_rate": 6.600000000000001e-06,
"loss": -0.0589,
"num_tokens": 259774.0,
"reward": 0.49000000953674316,
"reward_std": 0.7212488651275635,
"rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316,
"rewards/alfworld_rollout_reward_func/std": 0.7212488651275635,
"sampling/importance_sampling_ratio/max": 1.0000007152557373,
"sampling/importance_sampling_ratio/mean": 0.9999703764915466,
"sampling/importance_sampling_ratio/min": 0.999940037727356,
"sampling/sampling_logp_difference/max": 5.8182922657579184e-05,
"sampling/sampling_logp_difference/mean": 2.677608563317335e-06,
"step": 34,
"step_time": 13.956647035999822
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 62.0,
"completions/max_terminated_length": 62.0,
"completions/mean_length": 49.0,
"completions/mean_terminated_length": 49.0,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.489044189453125,
"epoch": 0.03684210526315789,
"frac_reward_zero_std": 0.0,
"grad_norm": 46.0,
"kl": 0.004022765904664993,
"learning_rate": 6.800000000000001e-06,
"loss": -0.8836,
"num_tokens": 267539.0,
"reward": 0.9549999833106995,
"reward_std": 0.007071061059832573,
"rewards/alfworld_rollout_reward_func/mean": 0.9549999833106995,
"rewards/alfworld_rollout_reward_func/std": 0.007071061059832573,
"sampling/importance_sampling_ratio/max": 2.5752766132354736,
"sampling/importance_sampling_ratio/mean": 1.780219316482544,
"sampling/importance_sampling_ratio/min": 0.985162079334259,
"sampling/sampling_logp_difference/max": 0.38198375701904297,
"sampling/sampling_logp_difference/mean": 0.03108775056898594,
"step": 35,
"step_time": 13.362450941999896
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 29.0,
"completions/max_terminated_length": 29.0,
"completions/mean_length": 28.5,
"completions/mean_terminated_length": 28.5,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.03428112342953682,
"epoch": 0.037894736842105266,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005950927734375,
"kl": 9.688996215118095e-05,
"learning_rate": 7e-06,
"loss": 0.0,
"num_tokens": 275461.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/alfworld_rollout_reward_func/mean": 0.0,
"rewards/alfworld_rollout_reward_func/std": 0.0,
"sampling/importance_sampling_ratio/max": 1.0216329097747803,
"sampling/importance_sampling_ratio/mean": 1.010430932044983,
"sampling/importance_sampling_ratio/min": 0.9992288947105408,
"sampling/sampling_logp_difference/max": 0.04146456718444824,
"sampling/sampling_logp_difference/mean": 0.0011078877141699195,
"step": 36,
"step_time": 16.18710341299993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 51.0,
"completions/max_terminated_length": 51.0,
"completions/mean_length": 49.0,
"completions/mean_terminated_length": 49.0,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 0.375491201877594,
"epoch": 0.03894736842105263,
"frac_reward_zero_std": 0.0,
"grad_norm": 27.375,
"kl": 0.008600625209510326,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.3645,
"num_tokens": 283600.0,
"reward": -0.08500000089406967,
"reward_std": 0.02121320366859436,
"rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967,
"rewards/alfworld_rollout_reward_func/std": 0.02121320366859436,
"sampling/importance_sampling_ratio/max": 1.302950382232666,
"sampling/importance_sampling_ratio/mean": 0.8185228705406189,
"sampling/importance_sampling_ratio/min": 0.3340953290462494,
"sampling/sampling_logp_difference/max": 0.3411126136779785,
"sampling/sampling_logp_difference/mean": 0.03013680875301361,
"step": 37,
"step_time": 20.539627713000073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 8.0,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"entropy": 0.09109717607498169,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 170.0,
"kl": 0.019233860075473785,
"learning_rate": 7.4e-06,
"loss": 0.2797,
"num_tokens": 291611.0,
"reward": 0.49000000953674316,
"reward_std": 0.7071067690849304,
"rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316,
"rewards/alfworld_rollout_reward_func/std": 0.7071067690849304,
"sampling/importance_sampling_ratio/max": 1.4181230068206787,
"sampling/importance_sampling_ratio/mean": 1.2089695930480957,
"sampling/importance_sampling_ratio/min": 0.9998162984848022,
"sampling/sampling_logp_difference/max": 0.3511829376220703,
"sampling/sampling_logp_difference/mean": 0.022082466632127762,
"step": 38,
"step_time": 13.460569201999988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 47.0,
"completions/max_terminated_length": 47.0,
"completions/mean_length": 31.0,
"completions/mean_terminated_length": 31.0,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"entropy": 0.35754403471946716,
"epoch": 0.04105263157894737,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.4375,
"kl": 0.01082681491971016,
"learning_rate": 7.600000000000001e-06,
"loss": -0.0723,
"num_tokens": 299145.0,
"reward": 0.4350000023841858,
"reward_std": 0.7990306615829468,
"rewards/alfworld_rollout_reward_func/mean": 0.4350000023841858,
"rewards/alfworld_rollout_reward_func/std": 0.7990306615829468,
"sampling/importance_sampling_ratio/max": 1.0000016689300537,
"sampling/importance_sampling_ratio/mean": 0.5920178294181824,
"sampling/importance_sampling_ratio/min": 0.18403403460979462,
"sampling/sampling_logp_difference/max": 0.6585979461669922,
"sampling/sampling_logp_difference/mean": 0.041399676352739334,
"step": 39,
"step_time": 15.72952194599975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 37.0,
"completions/max_terminated_length": 37.0,
"completions/mean_length": 22.5,
"completions/mean_terminated_length": 22.5,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"entropy": 0.35843947529792786,
"epoch": 0.042105263157894736,
"frac_reward_zero_std": 0.0,
"grad_norm": 28.625,
"kl": 0.0046605560928583145,
"learning_rate": 7.800000000000002e-06,
"loss": -0.3742,
"num_tokens": 307197.0,
"reward": -0.009999999776482582,
"reward_std": 0.01414213515818119,
"rewards/alfworld_rollout_reward_func/mean": -0.009999999776482582,
"rewards/alfworld_rollout_reward_func/std": 0.01414213515818119,
"sampling/importance_sampling_ratio/max": 1.0000009536743164,
"sampling/importance_sampling_ratio/mean": 0.9322052001953125,
"sampling/importance_sampling_ratio/min": 0.8644094467163086,
"sampling/sampling_logp_difference/max": 0.34821510314941406,
"sampling/sampling_logp_difference/mean": 0.02805318869650364,
"step": 40,
"step_time": 13.67377691799993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 12.0,
"completions/max_terminated_length": 12.0,
"completions/mean_length": 12.0,
"completions/mean_terminated_length": 12.0,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"entropy": 0.028120441362261772,
"epoch": 0.0431578947368421,
"frac_reward_zero_std": 0.0,
"grad_norm": 92.5,
"kl": 0.0015685707330703735,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0001,
"num_tokens": 313777.0,
"reward": 0.9399999976158142,
"reward_std": 0.05656857416033745,
"rewards/alfworld_rollout_reward_func/mean": 0.9399999976158142,
"rewards/alfworld_rollout_reward_func/std": 0.05656857416033745,
"sampling/importance_sampling_ratio/max": 1.0000027418136597,
"sampling/importance_sampling_ratio/mean": 0.9999538064002991,
"sampling/importance_sampling_ratio/min": 0.9999048709869385,
"sampling/sampling_logp_difference/max": 9.777725790627301e-05,
"sampling/sampling_logp_difference/mean": 4.304089543438749e-06,
"step": 41,
"step_time": 11.189458066000043
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 40.0,
"completions/max_terminated_length": 40.0,
"completions/mean_length": 36.0,
"completions/mean_terminated_length": 36.0,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.24259884655475616,
"epoch": 0.04421052631578947,
"frac_reward_zero_std": 0.0,
"grad_norm": 16.75,
"kl": 0.0016654051141813397,
"learning_rate": 8.2e-06,
"loss": 0.1025,
"num_tokens": 321642.0,
"reward": -0.014999999664723873,
"reward_std": 0.02121320366859436,
"rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873,
"rewards/alfworld_rollout_reward_func/std": 0.02121320366859436,
"sampling/importance_sampling_ratio/max": 1.060155987739563,
"sampling/importance_sampling_ratio/mean": 1.0288386344909668,
"sampling/importance_sampling_ratio/min": 0.9975212216377258,
"sampling/sampling_logp_difference/max": 0.16695499420166016,
"sampling/sampling_logp_difference/mean": 0.009143915958702564,
"step": 42,
"step_time": 22.64027631700037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 38.0,
"completions/max_terminated_length": 38.0,
"completions/mean_length": 23.5,
"completions/mean_terminated_length": 23.5,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"entropy": 0.20157602429389954,
"epoch": 0.045263157894736845,
"frac_reward_zero_std": 0.0,
"grad_norm": 63.25,
"kl": 0.022462664172053337,
"learning_rate": 8.400000000000001e-06,
"loss": 0.2766,
"num_tokens": 325516.0,
"reward": 0.9850000143051147,
"reward_std": 0.007071061059832573,
"rewards/alfworld_rollout_reward_func/mean": 0.9850000143051147,
"rewards/alfworld_rollout_reward_func/std": 0.007071061059832573,
"sampling/importance_sampling_ratio/max": 0.9990339875221252,
"sampling/importance_sampling_ratio/mean": 0.8629282712936401,
"sampling/importance_sampling_ratio/min": 0.7268226146697998,
"sampling/sampling_logp_difference/max": 0.17243313789367676,
"sampling/sampling_logp_difference/mean": 0.015407886356115341,
"step": 43,
"step_time": 5.622832681000091
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 35.0,
"completions/max_terminated_length": 35.0,
"completions/mean_length": 33.5,
"completions/mean_terminated_length": 33.5,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.2535250782966614,
"epoch": 0.04631578947368421,
"frac_reward_zero_std": 0.0,
"grad_norm": 19.25,
"kl": 0.004778198432177305,
"learning_rate": 8.6e-06,
"loss": 0.0797,
"num_tokens": 333519.0,
"reward": 0.4699999988079071,
"reward_std": 0.7495331764221191,
"rewards/alfworld_rollout_reward_func/mean": 0.4699999988079071,
"rewards/alfworld_rollout_reward_func/std": 0.7495331764221191,
"sampling/importance_sampling_ratio/max": 0.7066987752914429,
"sampling/importance_sampling_ratio/mean": 0.6219298839569092,
"sampling/importance_sampling_ratio/min": 0.5371610522270203,
"sampling/sampling_logp_difference/max": 0.34627819061279297,
"sampling/sampling_logp_difference/mean": 0.017883947119116783,
"step": 44,
"step_time": 15.114469847999771
}
],
"logging_steps": 1,
"max_steps": 4750,
"num_input_tokens_seen": 333519,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}