{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.04631578947368421, "eval_steps": 500, "global_step": 44, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 38.0, "completions/mean_terminated_length": 38.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.4614375829696655, "epoch": 0.0010526315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 23.0, "kl": 0.002436438575387001, "learning_rate": 0.0, "loss": 0.1571, "num_tokens": 8138.0, "reward": -0.10500000417232513, "reward_std": 0.021213199943304062, "rewards/alfworld_rollout_reward_func/mean": -0.10500000417232513, "rewards/alfworld_rollout_reward_func/std": 0.021213199943304062, "sampling/importance_sampling_ratio/max": 0.9498974680900574, "sampling/importance_sampling_ratio/mean": 0.7463880777359009, "sampling/importance_sampling_ratio/min": 0.5428786873817444, "sampling/sampling_logp_difference/max": 0.2791634798049927, "sampling/sampling_logp_difference/mean": 0.02213391289114952, "step": 1, "step_time": 21.471763861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.6891850829124451, "epoch": 0.002105263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 13.4375, "kl": 0.0023963379207998514, "learning_rate": 2.0000000000000002e-07, "loss": -0.0442, "num_tokens": 16245.0, "reward": -0.08500000089406967, "reward_std": 0.007071071770042181, "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, "rewards/alfworld_rollout_reward_func/std": 0.007071071770042181, "sampling/importance_sampling_ratio/max": 0.6407822966575623, "sampling/importance_sampling_ratio/mean": 0.5291908383369446, "sampling/importance_sampling_ratio/min": 0.4175994098186493, "sampling/sampling_logp_difference/max": 0.26871776580810547, "sampling/sampling_logp_difference/mean": 0.0313858687877655, "step": 2, "step_time": 19.41586231600013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 32.5, "completions/mean_terminated_length": 32.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.2943471372127533, "epoch": 0.003157894736842105, "frac_reward_zero_std": 0.0, "grad_norm": 9.0625, "kl": 0.00032700574956834316, "learning_rate": 4.0000000000000003e-07, "loss": -0.0682, "num_tokens": 24330.0, "reward": -0.07000000029802322, "reward_std": 0.05656854063272476, "rewards/alfworld_rollout_reward_func/mean": -0.07000000029802322, "rewards/alfworld_rollout_reward_func/std": 0.05656854063272476, "sampling/importance_sampling_ratio/max": 1.0007613897323608, "sampling/importance_sampling_ratio/mean": 0.8396192193031311, "sampling/importance_sampling_ratio/min": 0.6784770488739014, "sampling/sampling_logp_difference/max": 0.22780990600585938, "sampling/sampling_logp_difference/mean": 0.013574070297181606, "step": 3, "step_time": 17.228165518000196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 37.5, "completions/mean_terminated_length": 37.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.4852295517921448, "epoch": 0.004210526315789474, "frac_reward_zero_std": 0.0, "grad_norm": 22.75, "kl": 0.0009841235587373376, "learning_rate": 6.000000000000001e-07, "loss": 0.0262, "num_tokens": 32171.0, "reward": -0.04500000178813934, "reward_std": 0.0353553406894207, "rewards/alfworld_rollout_reward_func/mean": -0.04500000178813934, "rewards/alfworld_rollout_reward_func/std": 0.0353553369641304, "sampling/importance_sampling_ratio/max": 0.9375013709068298, "sampling/importance_sampling_ratio/mean": 0.7503011226654053, "sampling/importance_sampling_ratio/min": 0.5631008148193359, "sampling/sampling_logp_difference/max": 0.28363165259361267, "sampling/sampling_logp_difference/mean": 0.02271696925163269, "step": 4, "step_time": 17.49860281700012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 27.5, "completions/mean_terminated_length": 27.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5301113724708557, "epoch": 0.005263157894736842, "frac_reward_zero_std": 0.0, "grad_norm": 28.125, "kl": 0.0012831644853577018, "learning_rate": 8.000000000000001e-07, "loss": 0.4587, "num_tokens": 40277.0, "reward": -0.004999999888241291, "reward_std": 0.007071067579090595, "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, "sampling/importance_sampling_ratio/max": 1.0000011920928955, "sampling/importance_sampling_ratio/mean": 0.893904447555542, "sampling/importance_sampling_ratio/min": 0.7878076434135437, "sampling/sampling_logp_difference/max": 0.21004503965377808, "sampling/sampling_logp_difference/mean": 0.02911142073571682, "step": 5, "step_time": 12.491938435000066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 16.5, "completions/mean_terminated_length": 16.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.1027420163154602, "epoch": 0.00631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 18.875, "kl": 4.723216625279747e-05, "learning_rate": 1.0000000000000002e-06, "loss": -0.397, "num_tokens": 48155.0, "reward": -0.029999999329447746, "reward_std": 0.01414213515818119, "rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746, "rewards/alfworld_rollout_reward_func/std": 0.01414213515818119, "sampling/importance_sampling_ratio/max": 1.0009899139404297, "sampling/importance_sampling_ratio/mean": 0.9940400719642639, "sampling/importance_sampling_ratio/min": 0.9870902299880981, "sampling/sampling_logp_difference/max": 0.15823769569396973, "sampling/sampling_logp_difference/mean": 0.009266156703233719, "step": 6, "step_time": 23.18857599299986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 24.0, "completions/mean_terminated_length": 24.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.45579978823661804, "epoch": 0.007368421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 46.75, "kl": 0.0014351233839988708, "learning_rate": 1.2000000000000002e-06, "loss": 0.607, "num_tokens": 55983.0, "reward": -0.019999999552965164, "reward_std": 0.02828427031636238, "rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164, "rewards/alfworld_rollout_reward_func/std": 0.02828427031636238, "sampling/importance_sampling_ratio/max": 1.4146320819854736, "sampling/importance_sampling_ratio/mean": 1.2070283889770508, "sampling/importance_sampling_ratio/min": 0.9994246959686279, "sampling/sampling_logp_difference/max": 0.1907503604888916, "sampling/sampling_logp_difference/mean": 0.028974320739507675, "step": 7, "step_time": 18.246992883999837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.5107156038284302, "epoch": 0.008421052631578947, "frac_reward_zero_std": 0.0, "grad_norm": 58.75, "kl": 0.0012022192822769284, "learning_rate": 1.4000000000000001e-06, "loss": -0.6365, "num_tokens": 64025.0, "reward": -0.08500000089406967, "reward_std": 0.007071071770042181, "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, "rewards/alfworld_rollout_reward_func/std": 0.007071071770042181, "sampling/importance_sampling_ratio/max": 1.279162883758545, "sampling/importance_sampling_ratio/mean": 1.1397144794464111, "sampling/importance_sampling_ratio/min": 1.0002660751342773, "sampling/sampling_logp_difference/max": 0.5383334159851074, "sampling/sampling_logp_difference/mean": 0.034098681062459946, "step": 8, "step_time": 14.293913408000208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.4052967131137848, "epoch": 0.009473684210526316, "frac_reward_zero_std": 0.0, "grad_norm": 16.625, "kl": 0.0016700377454981208, "learning_rate": 1.6000000000000001e-06, "loss": -0.2022, "num_tokens": 71652.0, "reward": 0.4650000035762787, "reward_std": 0.6576092839241028, "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, "rewards/alfworld_rollout_reward_func/std": 0.6576092839241028, "sampling/importance_sampling_ratio/max": 0.7853229641914368, "sampling/importance_sampling_ratio/mean": 0.6733799576759338, "sampling/importance_sampling_ratio/min": 0.5614369511604309, "sampling/sampling_logp_difference/max": 0.27764952182769775, "sampling/sampling_logp_difference/mean": 0.01565438136458397, "step": 9, "step_time": 16.68387536299997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 30.5, "completions/mean_terminated_length": 30.5, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.33000221848487854, "epoch": 0.010526315789473684, "frac_reward_zero_std": 1.0, "grad_norm": 0.013671875, "kl": 0.0005048786988481879, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "num_tokens": 79470.0, "reward": -0.029999999329447746, "reward_std": 0.0, "rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746, "rewards/alfworld_rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 0.9141315221786499, "sampling/importance_sampling_ratio/mean": 0.7841682434082031, "sampling/importance_sampling_ratio/min": 0.6542050242424011, "sampling/sampling_logp_difference/max": 0.24437618255615234, "sampling/sampling_logp_difference/mean": 0.01810554973781109, "step": 10, "step_time": 17.949971448000042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 41.5, "completions/mean_terminated_length": 41.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.5000134706497192, "epoch": 0.011578947368421053, "frac_reward_zero_std": 0.0, "grad_norm": 14.125, "kl": 0.001739501953125, "learning_rate": 2.0000000000000003e-06, "loss": 0.0346, "num_tokens": 87509.0, "reward": -0.044999998062849045, "reward_std": 0.04949747398495674, "rewards/alfworld_rollout_reward_func/mean": -0.044999998062849045, "rewards/alfworld_rollout_reward_func/std": 0.04949747398495674, "sampling/importance_sampling_ratio/max": 0.6740682721138, "sampling/importance_sampling_ratio/mean": 0.6668994426727295, "sampling/importance_sampling_ratio/min": 0.6597306132316589, "sampling/sampling_logp_difference/max": 0.35384368896484375, "sampling/sampling_logp_difference/mean": 0.021013759076595306, "step": 11, "step_time": 15.37352415700002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.0, "completions/max_terminated_length": 45.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.37302640080451965, "epoch": 0.01263157894736842, "frac_reward_zero_std": 0.0, "grad_norm": 39.0, "kl": 0.0027003493160009384, "learning_rate": 2.2e-06, "loss": -0.5275, "num_tokens": 95262.0, "reward": -0.06000000238418579, "reward_std": 0.04242640733718872, "rewards/alfworld_rollout_reward_func/mean": -0.06000000238418579, "rewards/alfworld_rollout_reward_func/std": 0.04242641106247902, "sampling/importance_sampling_ratio/max": 1.2358118295669556, "sampling/importance_sampling_ratio/mean": 1.088789463043213, "sampling/importance_sampling_ratio/min": 0.9417669773101807, "sampling/sampling_logp_difference/max": 0.3406403064727783, "sampling/sampling_logp_difference/mean": 0.028803091496229172, "step": 12, "step_time": 20.290826388999903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.07970059663057327, "epoch": 0.01368421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 25.125, "kl": 0.0003672480524983257, "learning_rate": 2.4000000000000003e-06, "loss": -0.3785, "num_tokens": 102993.0, "reward": -0.004999999888241291, "reward_std": 0.007071067579090595, "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, "sampling/importance_sampling_ratio/max": 1.012474536895752, "sampling/importance_sampling_ratio/mean": 1.006240963935852, "sampling/importance_sampling_ratio/min": 1.0000073909759521, "sampling/sampling_logp_difference/max": 0.011289931833744049, "sampling/sampling_logp_difference/mean": 0.00044049008283764124, "step": 13, "step_time": 18.20201583400012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 24.5, "completions/mean_terminated_length": 24.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.21557021141052246, "epoch": 0.014736842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 28.375, "kl": 0.005430301651358604, "learning_rate": 2.6e-06, "loss": -0.3004, "num_tokens": 111031.0, "reward": 0.42500001192092896, "reward_std": 0.7141778469085693, "rewards/alfworld_rollout_reward_func/mean": 0.42500001192092896, "rewards/alfworld_rollout_reward_func/std": 0.7141778469085693, "sampling/importance_sampling_ratio/max": 0.8329165577888489, "sampling/importance_sampling_ratio/mean": 0.7687587738037109, "sampling/importance_sampling_ratio/min": 0.7046010494232178, "sampling/sampling_logp_difference/max": 0.3576321601867676, "sampling/sampling_logp_difference/mean": 0.016674496233463287, "step": 14, "step_time": 11.272263890000204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 23.0, "completions/max_terminated_length": 23.0, "completions/mean_length": 15.0, "completions/mean_terminated_length": 15.0, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.18071487545967102, "epoch": 0.015789473684210527, "frac_reward_zero_std": 0.0, "grad_norm": 16.625, "kl": 0.00020178158592898399, "learning_rate": 2.8000000000000003e-06, "loss": 0.4877, "num_tokens": 118867.0, "reward": -0.004999999888241291, "reward_std": 0.007071067579090595, "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, "sampling/importance_sampling_ratio/max": 1.216752290725708, "sampling/importance_sampling_ratio/mean": 1.1083970069885254, "sampling/importance_sampling_ratio/min": 1.0000418424606323, "sampling/sampling_logp_difference/max": 0.11035466194152832, "sampling/sampling_logp_difference/mean": 0.007579161319881678, "step": 15, "step_time": 20.160193882000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.5, "completions/mean_terminated_length": 11.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.07483004778623581, "epoch": 0.016842105263157894, "frac_reward_zero_std": 0.0, "grad_norm": 29.625, "kl": 0.00041633585351519287, "learning_rate": 3e-06, "loss": -0.0959, "num_tokens": 126279.0, "reward": 0.429999977350235, "reward_std": 0.7495331764221191, "rewards/alfworld_rollout_reward_func/mean": 0.429999977350235, "rewards/alfworld_rollout_reward_func/std": 0.7495331764221191, "sampling/importance_sampling_ratio/max": 1.1467688083648682, "sampling/importance_sampling_ratio/mean": 1.057037591934204, "sampling/importance_sampling_ratio/min": 0.9673064351081848, "sampling/sampling_logp_difference/max": 0.11966276168823242, "sampling/sampling_logp_difference/mean": 0.007402568124234676, "step": 16, "step_time": 10.745571063999932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 36.0, "completions/max_terminated_length": 36.0, "completions/mean_length": 33.0, "completions/mean_terminated_length": 33.0, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.5801213383674622, "epoch": 0.017894736842105262, "frac_reward_zero_std": 0.0, "grad_norm": 30.125, "kl": 0.001962649170309305, "learning_rate": 3.2000000000000003e-06, "loss": 0.1349, "num_tokens": 134227.0, "reward": 0.4749999940395355, "reward_std": 0.6858935952186584, "rewards/alfworld_rollout_reward_func/mean": 0.4749999940395355, "rewards/alfworld_rollout_reward_func/std": 0.6858935952186584, "sampling/importance_sampling_ratio/max": 1.1694220304489136, "sampling/importance_sampling_ratio/mean": 1.0765215158462524, "sampling/importance_sampling_ratio/min": 0.9836210608482361, "sampling/sampling_logp_difference/max": 0.17440319061279297, "sampling/sampling_logp_difference/mean": 0.022032134234905243, "step": 17, "step_time": 12.921286662000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 39.5, "completions/mean_terminated_length": 39.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.48242872953414917, "epoch": 0.018947368421052633, "frac_reward_zero_std": 0.0, "grad_norm": 19.5, "kl": 0.0010035536251962185, "learning_rate": 3.4000000000000005e-06, "loss": -0.0587, "num_tokens": 142157.0, "reward": -0.04999999701976776, "reward_std": 0.04242640733718872, "rewards/alfworld_rollout_reward_func/mean": -0.04999999701976776, "rewards/alfworld_rollout_reward_func/std": 0.04242640733718872, "sampling/importance_sampling_ratio/max": 1.0037174224853516, "sampling/importance_sampling_ratio/mean": 0.9568833112716675, "sampling/importance_sampling_ratio/min": 0.9100492596626282, "sampling/sampling_logp_difference/max": 0.17302274703979492, "sampling/sampling_logp_difference/mean": 0.02212933637201786, "step": 18, "step_time": 23.997690939999984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.0, "completions/max_terminated_length": 46.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.44313955307006836, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 19.5, "kl": 0.0014339183690026402, "learning_rate": 3.6000000000000003e-06, "loss": -0.1594, "num_tokens": 150213.0, "reward": -0.05999999865889549, "reward_std": 0.0707106813788414, "rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549, "rewards/alfworld_rollout_reward_func/std": 0.0707106739282608, "sampling/importance_sampling_ratio/max": 1.1801482439041138, "sampling/importance_sampling_ratio/mean": 0.9998883008956909, "sampling/importance_sampling_ratio/min": 0.8196282982826233, "sampling/sampling_logp_difference/max": 0.21845340728759766, "sampling/sampling_logp_difference/mean": 0.022122323513031006, "step": 19, "step_time": 15.372385936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.0, "completions/max_terminated_length": 39.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.26607948541641235, "epoch": 0.021052631578947368, "frac_reward_zero_std": 0.0, "grad_norm": 21.125, "kl": 0.0008539336849935353, "learning_rate": 3.8000000000000005e-06, "loss": -0.0343, "num_tokens": 157583.0, "reward": 0.45500001311302185, "reward_std": 0.742462158203125, "rewards/alfworld_rollout_reward_func/mean": 0.45500001311302185, "rewards/alfworld_rollout_reward_func/std": 0.7424620985984802, "sampling/importance_sampling_ratio/max": 0.9992303848266602, "sampling/importance_sampling_ratio/mean": 0.8897002339363098, "sampling/importance_sampling_ratio/min": 0.7801700830459595, "sampling/sampling_logp_difference/max": 0.2817434072494507, "sampling/sampling_logp_difference/mean": 0.01816781423985958, "step": 20, "step_time": 21.228662558999986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 19.5, "completions/mean_terminated_length": 19.5, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.28872889280319214, "epoch": 0.022105263157894735, "frac_reward_zero_std": 0.0, "grad_norm": 18.75, "kl": 0.00026875274488702416, "learning_rate": 4.000000000000001e-06, "loss": 0.191, "num_tokens": 162725.0, "reward": 0.9650000333786011, "reward_std": 0.04949747025966644, "rewards/alfworld_rollout_reward_func/mean": 0.9650000333786011, "rewards/alfworld_rollout_reward_func/std": 0.04949747025966644, "sampling/importance_sampling_ratio/max": 0.9999924898147583, "sampling/importance_sampling_ratio/mean": 0.9529882669448853, "sampling/importance_sampling_ratio/min": 0.905984103679657, "sampling/sampling_logp_difference/max": 0.13952183723449707, "sampling/sampling_logp_difference/mean": 0.012355787679553032, "step": 21, "step_time": 8.206261441999914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.0, "completions/max_terminated_length": 41.0, "completions/mean_length": 27.0, "completions/mean_terminated_length": 27.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.4186505675315857, "epoch": 0.023157894736842106, "frac_reward_zero_std": 0.0, "grad_norm": 70.0, "kl": 0.0038073172327131033, "learning_rate": 4.2000000000000004e-06, "loss": 0.6479, "num_tokens": 170762.0, "reward": 0.4650000035762787, "reward_std": 0.7566042542457581, "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, "rewards/alfworld_rollout_reward_func/std": 0.7566042542457581, "sampling/importance_sampling_ratio/max": 1.66000235080719, "sampling/importance_sampling_ratio/mean": 1.544608235359192, "sampling/importance_sampling_ratio/min": 1.4292141199111938, "sampling/sampling_logp_difference/max": 0.23494195938110352, "sampling/sampling_logp_difference/mean": 0.03184577822685242, "step": 22, "step_time": 17.66746830000011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 34.5, "completions/mean_terminated_length": 34.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.3005879819393158, "epoch": 0.024210526315789474, "frac_reward_zero_std": 0.0, "grad_norm": 20.5, "kl": 0.02588764950633049, "learning_rate": 4.4e-06, "loss": 0.1134, "num_tokens": 178722.0, "reward": 0.48000001907348633, "reward_std": 0.6929646730422974, "rewards/alfworld_rollout_reward_func/mean": 0.48000001907348633, "rewards/alfworld_rollout_reward_func/std": 0.6929646730422974, "sampling/importance_sampling_ratio/max": 0.9999445676803589, "sampling/importance_sampling_ratio/mean": 0.6585712432861328, "sampling/importance_sampling_ratio/min": 0.3171979784965515, "sampling/sampling_logp_difference/max": 1.331534504890442, "sampling/sampling_logp_difference/mean": 0.03551221266388893, "step": 23, "step_time": 14.36746498499997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.3918939232826233, "epoch": 0.02526315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 43.25, "kl": 0.0027948389761149883, "learning_rate": 4.600000000000001e-06, "loss": 0.5853, "num_tokens": 185227.0, "reward": 0.4650000035762787, "reward_std": 0.7424620389938354, "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, "rewards/alfworld_rollout_reward_func/std": 0.7424620389938354, "sampling/importance_sampling_ratio/max": 1.4053035974502563, "sampling/importance_sampling_ratio/mean": 1.130873203277588, "sampling/importance_sampling_ratio/min": 0.8564428091049194, "sampling/sampling_logp_difference/max": 0.16124820709228516, "sampling/sampling_logp_difference/mean": 0.016462432220578194, "step": 24, "step_time": 11.062920657999939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 29.0, "completions/mean_terminated_length": 29.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 0.2529997229576111, "epoch": 0.02631578947368421, "frac_reward_zero_std": 1.0, "grad_norm": 0.018310546875, "kl": 0.0004380304308142513, "learning_rate": 4.800000000000001e-06, "loss": 0.0, "num_tokens": 193142.0, "reward": -0.019999999552965164, "reward_std": 0.0, "rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164, "rewards/alfworld_rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0052870512008667, "sampling/importance_sampling_ratio/mean": 0.8423590660095215, "sampling/importance_sampling_ratio/min": 0.6794310212135315, "sampling/sampling_logp_difference/max": 0.18489933013916016, "sampling/sampling_logp_difference/mean": 0.014295091852545738, "step": 25, "step_time": 17.847312069000054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.3845043182373047, "epoch": 0.02736842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 50.0, "kl": 0.002633685013279319, "learning_rate": 5e-06, "loss": -0.273, "num_tokens": 200881.0, "reward": -0.10999999940395355, "reward_std": 0.014142133295536041, "rewards/alfworld_rollout_reward_func/mean": -0.10999999940395355, "rewards/alfworld_rollout_reward_func/std": 0.014142133295536041, "sampling/importance_sampling_ratio/max": 1.0188355445861816, "sampling/importance_sampling_ratio/mean": 0.8905454874038696, "sampling/importance_sampling_ratio/min": 0.7622554302215576, "sampling/sampling_logp_difference/max": 0.29382169246673584, "sampling/sampling_logp_difference/mean": 0.03841578587889671, "step": 26, "step_time": 22.61463799199987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 26.0, "completions/max_terminated_length": 26.0, "completions/mean_length": 17.0, "completions/mean_terminated_length": 17.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.32032543420791626, "epoch": 0.028421052631578948, "frac_reward_zero_std": 0.0, "grad_norm": 33.75, "kl": 0.0024931079242378473, "learning_rate": 5.2e-06, "loss": 0.3737, "num_tokens": 208999.0, "reward": 0.4950000047683716, "reward_std": 0.7141778469085693, "rewards/alfworld_rollout_reward_func/mean": 0.4950000047683716, "rewards/alfworld_rollout_reward_func/std": 0.7141778469085693, "sampling/importance_sampling_ratio/max": 1.1281967163085938, "sampling/importance_sampling_ratio/mean": 1.083227276802063, "sampling/importance_sampling_ratio/min": 1.0382578372955322, "sampling/sampling_logp_difference/max": 0.17356586456298828, "sampling/sampling_logp_difference/mean": 0.017684968188405037, "step": 27, "step_time": 11.613979460999872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 11.0, "completions/mean_terminated_length": 11.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.05829498916864395, "epoch": 0.029473684210526315, "frac_reward_zero_std": 0.0, "grad_norm": 32.25, "kl": 0.0006212808657437563, "learning_rate": 5.400000000000001e-06, "loss": -0.2479, "num_tokens": 214157.0, "reward": 0.4650000035762787, "reward_std": 0.7424620389938354, "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, "rewards/alfworld_rollout_reward_func/std": 0.7424620389938354, "sampling/importance_sampling_ratio/max": 0.9999884366989136, "sampling/importance_sampling_ratio/mean": 0.8929275274276733, "sampling/importance_sampling_ratio/min": 0.7858666181564331, "sampling/sampling_logp_difference/max": 0.21855998039245605, "sampling/sampling_logp_difference/mean": 0.01126509066671133, "step": 28, "step_time": 9.151633475999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 36.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.46773216128349304, "epoch": 0.030526315789473683, "frac_reward_zero_std": 0.0, "grad_norm": 86.0, "kl": 0.015533313155174255, "learning_rate": 5.600000000000001e-06, "loss": -0.5057, "num_tokens": 222111.0, "reward": -0.03500000014901161, "reward_std": 0.02121320366859436, "rewards/alfworld_rollout_reward_func/mean": -0.03500000014901161, "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, "sampling/importance_sampling_ratio/max": 2.2050247192382812, "sampling/importance_sampling_ratio/mean": 1.4661935567855835, "sampling/importance_sampling_ratio/min": 0.727362334728241, "sampling/sampling_logp_difference/max": 1.248981237411499, "sampling/sampling_logp_difference/mean": 0.042725156992673874, "step": 29, "step_time": 20.633877447999794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 28.0, "completions/mean_terminated_length": 28.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.18183040618896484, "epoch": 0.031578947368421054, "frac_reward_zero_std": 0.0, "grad_norm": 29.625, "kl": 0.0012305846903473139, "learning_rate": 5.8e-06, "loss": 0.3004, "num_tokens": 229439.0, "reward": 0.44999998807907104, "reward_std": 0.7778174877166748, "rewards/alfworld_rollout_reward_func/mean": 0.44999998807907104, "rewards/alfworld_rollout_reward_func/std": 0.7778174877166748, "sampling/importance_sampling_ratio/max": 0.9991921782493591, "sampling/importance_sampling_ratio/mean": 0.9062168598175049, "sampling/importance_sampling_ratio/min": 0.8132414817810059, "sampling/sampling_logp_difference/max": 0.1453406810760498, "sampling/sampling_logp_difference/mean": 0.007052628789097071, "step": 30, "step_time": 15.40591485699997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.3357900381088257, "epoch": 0.03263157894736842, "frac_reward_zero_std": 0.0, "grad_norm": 17.875, "kl": 0.004221746232360601, "learning_rate": 6e-06, "loss": -0.0807, "num_tokens": 237387.0, "reward": -0.05999999865889549, "reward_std": 0.02828427031636238, "rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549, "rewards/alfworld_rollout_reward_func/std": 0.02828427031636238, "sampling/importance_sampling_ratio/max": 0.8237836956977844, "sampling/importance_sampling_ratio/mean": 0.8164201974868774, "sampling/importance_sampling_ratio/min": 0.8090566992759705, "sampling/sampling_logp_difference/max": 0.2831292152404785, "sampling/sampling_logp_difference/mean": 0.024411508813500404, "step": 31, "step_time": 19.268974757000024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 33.0, "completions/max_terminated_length": 33.0, "completions/mean_length": 32.0, "completions/mean_terminated_length": 32.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2340346872806549, "epoch": 0.03368421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.049560546875, "kl": 0.0027104970067739487, "learning_rate": 6.200000000000001e-06, "loss": 0.0, "num_tokens": 245449.0, "reward": -0.05000000074505806, "reward_std": 0.0, "rewards/alfworld_rollout_reward_func/mean": -0.05000000074505806, "rewards/alfworld_rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6312272548675537, "sampling/importance_sampling_ratio/mean": 1.798392653465271, "sampling/importance_sampling_ratio/min": 0.9655579924583435, "sampling/sampling_logp_difference/max": 0.5748621225357056, "sampling/sampling_logp_difference/mean": 0.029400669038295746, "step": 32, "step_time": 24.658211100000017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 20.5, "completions/mean_terminated_length": 20.5, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.050181157886981964, "epoch": 0.034736842105263156, "frac_reward_zero_std": 0.0, "grad_norm": 19.125, "kl": 0.003014294197782874, "learning_rate": 6.4000000000000006e-06, "loss": -0.3172, "num_tokens": 253476.0, "reward": -0.014999999664723873, "reward_std": 0.007071067579090595, "rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873, "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, "sampling/importance_sampling_ratio/max": 1.2256042957305908, "sampling/importance_sampling_ratio/mean": 1.1113841533660889, "sampling/importance_sampling_ratio/min": 0.9971638917922974, "sampling/sampling_logp_difference/max": 0.23240363597869873, "sampling/sampling_logp_difference/mean": 0.006807921454310417, "step": 33, "step_time": 12.894395297000074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 8.056841761572286e-05, "epoch": 0.035789473684210524, "frac_reward_zero_std": 0.0, "grad_norm": 0.031494140625, "kl": 9.934107758624577e-09, "learning_rate": 6.600000000000001e-06, "loss": -0.0589, "num_tokens": 259774.0, "reward": 0.49000000953674316, "reward_std": 0.7212488651275635, "rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316, "rewards/alfworld_rollout_reward_func/std": 0.7212488651275635, "sampling/importance_sampling_ratio/max": 1.0000007152557373, "sampling/importance_sampling_ratio/mean": 0.9999703764915466, "sampling/importance_sampling_ratio/min": 0.999940037727356, "sampling/sampling_logp_difference/max": 5.8182922657579184e-05, "sampling/sampling_logp_difference/mean": 2.677608563317335e-06, "step": 34, "step_time": 13.956647035999822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.489044189453125, "epoch": 0.03684210526315789, "frac_reward_zero_std": 0.0, "grad_norm": 46.0, "kl": 0.004022765904664993, "learning_rate": 6.800000000000001e-06, "loss": -0.8836, "num_tokens": 267539.0, "reward": 0.9549999833106995, "reward_std": 0.007071061059832573, "rewards/alfworld_rollout_reward_func/mean": 0.9549999833106995, "rewards/alfworld_rollout_reward_func/std": 0.007071061059832573, "sampling/importance_sampling_ratio/max": 2.5752766132354736, "sampling/importance_sampling_ratio/mean": 1.780219316482544, "sampling/importance_sampling_ratio/min": 0.985162079334259, "sampling/sampling_logp_difference/max": 0.38198375701904297, "sampling/sampling_logp_difference/mean": 0.03108775056898594, "step": 35, "step_time": 13.362450941999896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 29.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 28.5, "completions/mean_terminated_length": 28.5, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.03428112342953682, "epoch": 0.037894736842105266, "frac_reward_zero_std": 1.0, "grad_norm": 0.005950927734375, "kl": 9.688996215118095e-05, "learning_rate": 7e-06, "loss": 0.0, "num_tokens": 275461.0, "reward": 0.0, "reward_std": 0.0, "rewards/alfworld_rollout_reward_func/mean": 0.0, "rewards/alfworld_rollout_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.0216329097747803, "sampling/importance_sampling_ratio/mean": 1.010430932044983, "sampling/importance_sampling_ratio/min": 0.9992288947105408, "sampling/sampling_logp_difference/max": 0.04146456718444824, "sampling/sampling_logp_difference/mean": 0.0011078877141699195, "step": 36, "step_time": 16.18710341299993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.375491201877594, "epoch": 0.03894736842105263, "frac_reward_zero_std": 0.0, "grad_norm": 27.375, "kl": 0.008600625209510326, "learning_rate": 7.2000000000000005e-06, "loss": 0.3645, "num_tokens": 283600.0, "reward": -0.08500000089406967, "reward_std": 0.02121320366859436, "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, "sampling/importance_sampling_ratio/max": 1.302950382232666, "sampling/importance_sampling_ratio/mean": 0.8185228705406189, "sampling/importance_sampling_ratio/min": 0.3340953290462494, "sampling/sampling_logp_difference/max": 0.3411126136779785, "sampling/sampling_logp_difference/mean": 0.03013680875301361, "step": 37, "step_time": 20.539627713000073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.0, "completions/mean_terminated_length": 8.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.09109717607498169, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 170.0, "kl": 0.019233860075473785, "learning_rate": 7.4e-06, "loss": 0.2797, "num_tokens": 291611.0, "reward": 0.49000000953674316, "reward_std": 0.7071067690849304, "rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316, "rewards/alfworld_rollout_reward_func/std": 0.7071067690849304, "sampling/importance_sampling_ratio/max": 1.4181230068206787, "sampling/importance_sampling_ratio/mean": 1.2089695930480957, "sampling/importance_sampling_ratio/min": 0.9998162984848022, "sampling/sampling_logp_difference/max": 0.3511829376220703, "sampling/sampling_logp_difference/mean": 0.022082466632127762, "step": 38, "step_time": 13.460569201999988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.0, "completions/max_terminated_length": 47.0, "completions/mean_length": 31.0, "completions/mean_terminated_length": 31.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.35754403471946716, "epoch": 0.04105263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 8.4375, "kl": 0.01082681491971016, "learning_rate": 7.600000000000001e-06, "loss": -0.0723, "num_tokens": 299145.0, "reward": 0.4350000023841858, "reward_std": 0.7990306615829468, "rewards/alfworld_rollout_reward_func/mean": 0.4350000023841858, "rewards/alfworld_rollout_reward_func/std": 0.7990306615829468, "sampling/importance_sampling_ratio/max": 1.0000016689300537, "sampling/importance_sampling_ratio/mean": 0.5920178294181824, "sampling/importance_sampling_ratio/min": 0.18403403460979462, "sampling/sampling_logp_difference/max": 0.6585979461669922, "sampling/sampling_logp_difference/mean": 0.041399676352739334, "step": 39, "step_time": 15.72952194599975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 37.0, "completions/max_terminated_length": 37.0, "completions/mean_length": 22.5, "completions/mean_terminated_length": 22.5, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.35843947529792786, "epoch": 0.042105263157894736, "frac_reward_zero_std": 0.0, "grad_norm": 28.625, "kl": 0.0046605560928583145, "learning_rate": 7.800000000000002e-06, "loss": -0.3742, "num_tokens": 307197.0, "reward": -0.009999999776482582, "reward_std": 0.01414213515818119, "rewards/alfworld_rollout_reward_func/mean": -0.009999999776482582, "rewards/alfworld_rollout_reward_func/std": 0.01414213515818119, "sampling/importance_sampling_ratio/max": 1.0000009536743164, "sampling/importance_sampling_ratio/mean": 0.9322052001953125, "sampling/importance_sampling_ratio/min": 0.8644094467163086, "sampling/sampling_logp_difference/max": 0.34821510314941406, "sampling/sampling_logp_difference/mean": 0.02805318869650364, "step": 40, "step_time": 13.67377691799993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 12.0, "completions/mean_terminated_length": 12.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.028120441362261772, "epoch": 0.0431578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 92.5, "kl": 0.0015685707330703735, "learning_rate": 8.000000000000001e-06, "loss": 0.0001, "num_tokens": 313777.0, "reward": 0.9399999976158142, "reward_std": 0.05656857416033745, "rewards/alfworld_rollout_reward_func/mean": 0.9399999976158142, "rewards/alfworld_rollout_reward_func/std": 0.05656857416033745, "sampling/importance_sampling_ratio/max": 1.0000027418136597, "sampling/importance_sampling_ratio/mean": 0.9999538064002991, "sampling/importance_sampling_ratio/min": 0.9999048709869385, "sampling/sampling_logp_difference/max": 9.777725790627301e-05, "sampling/sampling_logp_difference/mean": 4.304089543438749e-06, "step": 41, "step_time": 11.189458066000043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 36.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.24259884655475616, "epoch": 0.04421052631578947, "frac_reward_zero_std": 0.0, "grad_norm": 16.75, "kl": 0.0016654051141813397, "learning_rate": 8.2e-06, "loss": 0.1025, "num_tokens": 321642.0, "reward": -0.014999999664723873, "reward_std": 0.02121320366859436, "rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873, "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, "sampling/importance_sampling_ratio/max": 1.060155987739563, "sampling/importance_sampling_ratio/mean": 1.0288386344909668, "sampling/importance_sampling_ratio/min": 0.9975212216377258, "sampling/sampling_logp_difference/max": 0.16695499420166016, "sampling/sampling_logp_difference/mean": 0.009143915958702564, "step": 42, "step_time": 22.64027631700037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 23.5, "completions/mean_terminated_length": 23.5, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 0.20157602429389954, "epoch": 0.045263157894736845, "frac_reward_zero_std": 0.0, "grad_norm": 63.25, "kl": 0.022462664172053337, "learning_rate": 8.400000000000001e-06, "loss": 0.2766, "num_tokens": 325516.0, "reward": 0.9850000143051147, "reward_std": 0.007071061059832573, "rewards/alfworld_rollout_reward_func/mean": 0.9850000143051147, "rewards/alfworld_rollout_reward_func/std": 0.007071061059832573, "sampling/importance_sampling_ratio/max": 0.9990339875221252, "sampling/importance_sampling_ratio/mean": 0.8629282712936401, "sampling/importance_sampling_ratio/min": 0.7268226146697998, "sampling/sampling_logp_difference/max": 0.17243313789367676, "sampling/sampling_logp_difference/mean": 0.015407886356115341, "step": 43, "step_time": 5.622832681000091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 35.0, "completions/max_terminated_length": 35.0, "completions/mean_length": 33.5, "completions/mean_terminated_length": 33.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.2535250782966614, "epoch": 0.04631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 19.25, "kl": 0.004778198432177305, "learning_rate": 8.6e-06, "loss": 0.0797, "num_tokens": 333519.0, "reward": 0.4699999988079071, "reward_std": 0.7495331764221191, "rewards/alfworld_rollout_reward_func/mean": 0.4699999988079071, "rewards/alfworld_rollout_reward_func/std": 0.7495331764221191, "sampling/importance_sampling_ratio/max": 0.7066987752914429, "sampling/importance_sampling_ratio/mean": 0.6219298839569092, "sampling/importance_sampling_ratio/min": 0.5371610522270203, "sampling/sampling_logp_difference/max": 0.34627819061279297, "sampling/sampling_logp_difference/mean": 0.017883947119116783, "step": 44, "step_time": 15.114469847999771 } ], "logging_steps": 1, "max_steps": 4750, "num_input_tokens_seen": 333519, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }