| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.04631578947368421, |
| "eval_steps": 500, |
| "global_step": 44, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 39.0, |
| "completions/max_terminated_length": 39.0, |
| "completions/mean_length": 38.0, |
| "completions/mean_terminated_length": 38.0, |
| "completions/min_length": 37.0, |
| "completions/min_terminated_length": 37.0, |
| "entropy": 0.4614375829696655, |
| "epoch": 0.0010526315789473684, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 23.0, |
| "kl": 0.002436438575387001, |
| "learning_rate": 0.0, |
| "loss": 0.1571, |
| "num_tokens": 8138.0, |
| "reward": -0.10500000417232513, |
| "reward_std": 0.021213199943304062, |
| "rewards/alfworld_rollout_reward_func/mean": -0.10500000417232513, |
| "rewards/alfworld_rollout_reward_func/std": 0.021213199943304062, |
| "sampling/importance_sampling_ratio/max": 0.9498974680900574, |
| "sampling/importance_sampling_ratio/mean": 0.7463880777359009, |
| "sampling/importance_sampling_ratio/min": 0.5428786873817444, |
| "sampling/sampling_logp_difference/max": 0.2791634798049927, |
| "sampling/sampling_logp_difference/mean": 0.02213391289114952, |
| "step": 1, |
| "step_time": 21.471763861 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 54.0, |
| "completions/max_terminated_length": 54.0, |
| "completions/mean_length": 49.5, |
| "completions/mean_terminated_length": 49.5, |
| "completions/min_length": 45.0, |
| "completions/min_terminated_length": 45.0, |
| "entropy": 0.6891850829124451, |
| "epoch": 0.002105263157894737, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 13.4375, |
| "kl": 0.0023963379207998514, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": -0.0442, |
| "num_tokens": 16245.0, |
| "reward": -0.08500000089406967, |
| "reward_std": 0.007071071770042181, |
| "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071071770042181, |
| "sampling/importance_sampling_ratio/max": 0.6407822966575623, |
| "sampling/importance_sampling_ratio/mean": 0.5291908383369446, |
| "sampling/importance_sampling_ratio/min": 0.4175994098186493, |
| "sampling/sampling_logp_difference/max": 0.26871776580810547, |
| "sampling/sampling_logp_difference/mean": 0.0313858687877655, |
| "step": 2, |
| "step_time": 19.41586231600013 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 35.0, |
| "completions/max_terminated_length": 35.0, |
| "completions/mean_length": 32.5, |
| "completions/mean_terminated_length": 32.5, |
| "completions/min_length": 30.0, |
| "completions/min_terminated_length": 30.0, |
| "entropy": 0.2943471372127533, |
| "epoch": 0.003157894736842105, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 9.0625, |
| "kl": 0.00032700574956834316, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": -0.0682, |
| "num_tokens": 24330.0, |
| "reward": -0.07000000029802322, |
| "reward_std": 0.05656854063272476, |
| "rewards/alfworld_rollout_reward_func/mean": -0.07000000029802322, |
| "rewards/alfworld_rollout_reward_func/std": 0.05656854063272476, |
| "sampling/importance_sampling_ratio/max": 1.0007613897323608, |
| "sampling/importance_sampling_ratio/mean": 0.8396192193031311, |
| "sampling/importance_sampling_ratio/min": 0.6784770488739014, |
| "sampling/sampling_logp_difference/max": 0.22780990600585938, |
| "sampling/sampling_logp_difference/mean": 0.013574070297181606, |
| "step": 3, |
| "step_time": 17.228165518000196 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 45.0, |
| "completions/max_terminated_length": 45.0, |
| "completions/mean_length": 37.5, |
| "completions/mean_terminated_length": 37.5, |
| "completions/min_length": 30.0, |
| "completions/min_terminated_length": 30.0, |
| "entropy": 0.4852295517921448, |
| "epoch": 0.004210526315789474, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 22.75, |
| "kl": 0.0009841235587373376, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.0262, |
| "num_tokens": 32171.0, |
| "reward": -0.04500000178813934, |
| "reward_std": 0.0353553406894207, |
| "rewards/alfworld_rollout_reward_func/mean": -0.04500000178813934, |
| "rewards/alfworld_rollout_reward_func/std": 0.0353553369641304, |
| "sampling/importance_sampling_ratio/max": 0.9375013709068298, |
| "sampling/importance_sampling_ratio/mean": 0.7503011226654053, |
| "sampling/importance_sampling_ratio/min": 0.5631008148193359, |
| "sampling/sampling_logp_difference/max": 0.28363165259361267, |
| "sampling/sampling_logp_difference/mean": 0.02271696925163269, |
| "step": 4, |
| "step_time": 17.49860281700012 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 51.0, |
| "completions/max_terminated_length": 51.0, |
| "completions/mean_length": 27.5, |
| "completions/mean_terminated_length": 27.5, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 0.5301113724708557, |
| "epoch": 0.005263157894736842, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 28.125, |
| "kl": 0.0012831644853577018, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.4587, |
| "num_tokens": 40277.0, |
| "reward": -0.004999999888241291, |
| "reward_std": 0.007071067579090595, |
| "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, |
| "sampling/importance_sampling_ratio/max": 1.0000011920928955, |
| "sampling/importance_sampling_ratio/mean": 0.893904447555542, |
| "sampling/importance_sampling_ratio/min": 0.7878076434135437, |
| "sampling/sampling_logp_difference/max": 0.21004503965377808, |
| "sampling/sampling_logp_difference/mean": 0.02911142073571682, |
| "step": 5, |
| "step_time": 12.491938435000066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 26.0, |
| "completions/max_terminated_length": 26.0, |
| "completions/mean_length": 16.5, |
| "completions/mean_terminated_length": 16.5, |
| "completions/min_length": 7.0, |
| "completions/min_terminated_length": 7.0, |
| "entropy": 0.1027420163154602, |
| "epoch": 0.00631578947368421, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 18.875, |
| "kl": 4.723216625279747e-05, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": -0.397, |
| "num_tokens": 48155.0, |
| "reward": -0.029999999329447746, |
| "reward_std": 0.01414213515818119, |
| "rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746, |
| "rewards/alfworld_rollout_reward_func/std": 0.01414213515818119, |
| "sampling/importance_sampling_ratio/max": 1.0009899139404297, |
| "sampling/importance_sampling_ratio/mean": 0.9940400719642639, |
| "sampling/importance_sampling_ratio/min": 0.9870902299880981, |
| "sampling/sampling_logp_difference/max": 0.15823769569396973, |
| "sampling/sampling_logp_difference/mean": 0.009266156703233719, |
| "step": 6, |
| "step_time": 23.18857599299986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 37.0, |
| "completions/max_terminated_length": 37.0, |
| "completions/mean_length": 24.0, |
| "completions/mean_terminated_length": 24.0, |
| "completions/min_length": 11.0, |
| "completions/min_terminated_length": 11.0, |
| "entropy": 0.45579978823661804, |
| "epoch": 0.007368421052631579, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 46.75, |
| "kl": 0.0014351233839988708, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.607, |
| "num_tokens": 55983.0, |
| "reward": -0.019999999552965164, |
| "reward_std": 0.02828427031636238, |
| "rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164, |
| "rewards/alfworld_rollout_reward_func/std": 0.02828427031636238, |
| "sampling/importance_sampling_ratio/max": 1.4146320819854736, |
| "sampling/importance_sampling_ratio/mean": 1.2070283889770508, |
| "sampling/importance_sampling_ratio/min": 0.9994246959686279, |
| "sampling/sampling_logp_difference/max": 0.1907503604888916, |
| "sampling/sampling_logp_difference/mean": 0.028974320739507675, |
| "step": 7, |
| "step_time": 18.246992883999837 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 47.0, |
| "completions/max_terminated_length": 47.0, |
| "completions/mean_length": 28.0, |
| "completions/mean_terminated_length": 28.0, |
| "completions/min_length": 9.0, |
| "completions/min_terminated_length": 9.0, |
| "entropy": 0.5107156038284302, |
| "epoch": 0.008421052631578947, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 58.75, |
| "kl": 0.0012022192822769284, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": -0.6365, |
| "num_tokens": 64025.0, |
| "reward": -0.08500000089406967, |
| "reward_std": 0.007071071770042181, |
| "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071071770042181, |
| "sampling/importance_sampling_ratio/max": 1.279162883758545, |
| "sampling/importance_sampling_ratio/mean": 1.1397144794464111, |
| "sampling/importance_sampling_ratio/min": 1.0002660751342773, |
| "sampling/sampling_logp_difference/max": 0.5383334159851074, |
| "sampling/sampling_logp_difference/mean": 0.034098681062459946, |
| "step": 8, |
| "step_time": 14.293913408000208 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 56.0, |
| "completions/max_terminated_length": 56.0, |
| "completions/mean_length": 44.5, |
| "completions/mean_terminated_length": 44.5, |
| "completions/min_length": 33.0, |
| "completions/min_terminated_length": 33.0, |
| "entropy": 0.4052967131137848, |
| "epoch": 0.009473684210526316, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 16.625, |
| "kl": 0.0016700377454981208, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": -0.2022, |
| "num_tokens": 71652.0, |
| "reward": 0.4650000035762787, |
| "reward_std": 0.6576092839241028, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, |
| "rewards/alfworld_rollout_reward_func/std": 0.6576092839241028, |
| "sampling/importance_sampling_ratio/max": 0.7853229641914368, |
| "sampling/importance_sampling_ratio/mean": 0.6733799576759338, |
| "sampling/importance_sampling_ratio/min": 0.5614369511604309, |
| "sampling/sampling_logp_difference/max": 0.27764952182769775, |
| "sampling/sampling_logp_difference/mean": 0.01565438136458397, |
| "step": 9, |
| "step_time": 16.68387536299997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 35.0, |
| "completions/max_terminated_length": 35.0, |
| "completions/mean_length": 30.5, |
| "completions/mean_terminated_length": 30.5, |
| "completions/min_length": 26.0, |
| "completions/min_terminated_length": 26.0, |
| "entropy": 0.33000221848487854, |
| "epoch": 0.010526315789473684, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.013671875, |
| "kl": 0.0005048786988481879, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 79470.0, |
| "reward": -0.029999999329447746, |
| "reward_std": 0.0, |
| "rewards/alfworld_rollout_reward_func/mean": -0.029999999329447746, |
| "rewards/alfworld_rollout_reward_func/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 0.9141315221786499, |
| "sampling/importance_sampling_ratio/mean": 0.7841682434082031, |
| "sampling/importance_sampling_ratio/min": 0.6542050242424011, |
| "sampling/sampling_logp_difference/max": 0.24437618255615234, |
| "sampling/sampling_logp_difference/mean": 0.01810554973781109, |
| "step": 10, |
| "step_time": 17.949971448000042 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 45.0, |
| "completions/max_terminated_length": 45.0, |
| "completions/mean_length": 41.5, |
| "completions/mean_terminated_length": 41.5, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 0.5000134706497192, |
| "epoch": 0.011578947368421053, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 14.125, |
| "kl": 0.001739501953125, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0346, |
| "num_tokens": 87509.0, |
| "reward": -0.044999998062849045, |
| "reward_std": 0.04949747398495674, |
| "rewards/alfworld_rollout_reward_func/mean": -0.044999998062849045, |
| "rewards/alfworld_rollout_reward_func/std": 0.04949747398495674, |
| "sampling/importance_sampling_ratio/max": 0.6740682721138, |
| "sampling/importance_sampling_ratio/mean": 0.6668994426727295, |
| "sampling/importance_sampling_ratio/min": 0.6597306132316589, |
| "sampling/sampling_logp_difference/max": 0.35384368896484375, |
| "sampling/sampling_logp_difference/mean": 0.021013759076595306, |
| "step": 11, |
| "step_time": 15.37352415700002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 45.0, |
| "completions/max_terminated_length": 45.0, |
| "completions/mean_length": 29.0, |
| "completions/mean_terminated_length": 29.0, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 0.37302640080451965, |
| "epoch": 0.01263157894736842, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 39.0, |
| "kl": 0.0027003493160009384, |
| "learning_rate": 2.2e-06, |
| "loss": -0.5275, |
| "num_tokens": 95262.0, |
| "reward": -0.06000000238418579, |
| "reward_std": 0.04242640733718872, |
| "rewards/alfworld_rollout_reward_func/mean": -0.06000000238418579, |
| "rewards/alfworld_rollout_reward_func/std": 0.04242641106247902, |
| "sampling/importance_sampling_ratio/max": 1.2358118295669556, |
| "sampling/importance_sampling_ratio/mean": 1.088789463043213, |
| "sampling/importance_sampling_ratio/min": 0.9417669773101807, |
| "sampling/sampling_logp_difference/max": 0.3406403064727783, |
| "sampling/sampling_logp_difference/mean": 0.028803091496229172, |
| "step": 12, |
| "step_time": 20.290826388999903 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 23.0, |
| "completions/max_terminated_length": 23.0, |
| "completions/mean_length": 15.0, |
| "completions/mean_terminated_length": 15.0, |
| "completions/min_length": 7.0, |
| "completions/min_terminated_length": 7.0, |
| "entropy": 0.07970059663057327, |
| "epoch": 0.01368421052631579, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 25.125, |
| "kl": 0.0003672480524983257, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": -0.3785, |
| "num_tokens": 102993.0, |
| "reward": -0.004999999888241291, |
| "reward_std": 0.007071067579090595, |
| "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, |
| "sampling/importance_sampling_ratio/max": 1.012474536895752, |
| "sampling/importance_sampling_ratio/mean": 1.006240963935852, |
| "sampling/importance_sampling_ratio/min": 1.0000073909759521, |
| "sampling/sampling_logp_difference/max": 0.011289931833744049, |
| "sampling/sampling_logp_difference/mean": 0.00044049008283764124, |
| "step": 13, |
| "step_time": 18.20201583400012 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 36.0, |
| "completions/max_terminated_length": 36.0, |
| "completions/mean_length": 24.5, |
| "completions/mean_terminated_length": 24.5, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 0.21557021141052246, |
| "epoch": 0.014736842105263158, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 28.375, |
| "kl": 0.005430301651358604, |
| "learning_rate": 2.6e-06, |
| "loss": -0.3004, |
| "num_tokens": 111031.0, |
| "reward": 0.42500001192092896, |
| "reward_std": 0.7141778469085693, |
| "rewards/alfworld_rollout_reward_func/mean": 0.42500001192092896, |
| "rewards/alfworld_rollout_reward_func/std": 0.7141778469085693, |
| "sampling/importance_sampling_ratio/max": 0.8329165577888489, |
| "sampling/importance_sampling_ratio/mean": 0.7687587738037109, |
| "sampling/importance_sampling_ratio/min": 0.7046010494232178, |
| "sampling/sampling_logp_difference/max": 0.3576321601867676, |
| "sampling/sampling_logp_difference/mean": 0.016674496233463287, |
| "step": 14, |
| "step_time": 11.272263890000204 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 23.0, |
| "completions/max_terminated_length": 23.0, |
| "completions/mean_length": 15.0, |
| "completions/mean_terminated_length": 15.0, |
| "completions/min_length": 7.0, |
| "completions/min_terminated_length": 7.0, |
| "entropy": 0.18071487545967102, |
| "epoch": 0.015789473684210527, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 16.625, |
| "kl": 0.00020178158592898399, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.4877, |
| "num_tokens": 118867.0, |
| "reward": -0.004999999888241291, |
| "reward_std": 0.007071067579090595, |
| "rewards/alfworld_rollout_reward_func/mean": -0.004999999888241291, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, |
| "sampling/importance_sampling_ratio/max": 1.216752290725708, |
| "sampling/importance_sampling_ratio/mean": 1.1083970069885254, |
| "sampling/importance_sampling_ratio/min": 1.0000418424606323, |
| "sampling/sampling_logp_difference/max": 0.11035466194152832, |
| "sampling/sampling_logp_difference/mean": 0.007579161319881678, |
| "step": 15, |
| "step_time": 20.160193882000158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12.0, |
| "completions/max_terminated_length": 12.0, |
| "completions/mean_length": 11.5, |
| "completions/mean_terminated_length": 11.5, |
| "completions/min_length": 11.0, |
| "completions/min_terminated_length": 11.0, |
| "entropy": 0.07483004778623581, |
| "epoch": 0.016842105263157894, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 29.625, |
| "kl": 0.00041633585351519287, |
| "learning_rate": 3e-06, |
| "loss": -0.0959, |
| "num_tokens": 126279.0, |
| "reward": 0.429999977350235, |
| "reward_std": 0.7495331764221191, |
| "rewards/alfworld_rollout_reward_func/mean": 0.429999977350235, |
| "rewards/alfworld_rollout_reward_func/std": 0.7495331764221191, |
| "sampling/importance_sampling_ratio/max": 1.1467688083648682, |
| "sampling/importance_sampling_ratio/mean": 1.057037591934204, |
| "sampling/importance_sampling_ratio/min": 0.9673064351081848, |
| "sampling/sampling_logp_difference/max": 0.11966276168823242, |
| "sampling/sampling_logp_difference/mean": 0.007402568124234676, |
| "step": 16, |
| "step_time": 10.745571063999932 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 36.0, |
| "completions/max_terminated_length": 36.0, |
| "completions/mean_length": 33.0, |
| "completions/mean_terminated_length": 33.0, |
| "completions/min_length": 30.0, |
| "completions/min_terminated_length": 30.0, |
| "entropy": 0.5801213383674622, |
| "epoch": 0.017894736842105262, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 30.125, |
| "kl": 0.001962649170309305, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.1349, |
| "num_tokens": 134227.0, |
| "reward": 0.4749999940395355, |
| "reward_std": 0.6858935952186584, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4749999940395355, |
| "rewards/alfworld_rollout_reward_func/std": 0.6858935952186584, |
| "sampling/importance_sampling_ratio/max": 1.1694220304489136, |
| "sampling/importance_sampling_ratio/mean": 1.0765215158462524, |
| "sampling/importance_sampling_ratio/min": 0.9836210608482361, |
| "sampling/sampling_logp_difference/max": 0.17440319061279297, |
| "sampling/sampling_logp_difference/mean": 0.022032134234905243, |
| "step": 17, |
| "step_time": 12.921286662000057 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 41.0, |
| "completions/max_terminated_length": 41.0, |
| "completions/mean_length": 39.5, |
| "completions/mean_terminated_length": 39.5, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 0.48242872953414917, |
| "epoch": 0.018947368421052633, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 19.5, |
| "kl": 0.0010035536251962185, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": -0.0587, |
| "num_tokens": 142157.0, |
| "reward": -0.04999999701976776, |
| "reward_std": 0.04242640733718872, |
| "rewards/alfworld_rollout_reward_func/mean": -0.04999999701976776, |
| "rewards/alfworld_rollout_reward_func/std": 0.04242640733718872, |
| "sampling/importance_sampling_ratio/max": 1.0037174224853516, |
| "sampling/importance_sampling_ratio/mean": 0.9568833112716675, |
| "sampling/importance_sampling_ratio/min": 0.9100492596626282, |
| "sampling/sampling_logp_difference/max": 0.17302274703979492, |
| "sampling/sampling_logp_difference/mean": 0.02212933637201786, |
| "step": 18, |
| "step_time": 23.997690939999984 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 46.0, |
| "completions/max_terminated_length": 46.0, |
| "completions/mean_length": 44.0, |
| "completions/mean_terminated_length": 44.0, |
| "completions/min_length": 42.0, |
| "completions/min_terminated_length": 42.0, |
| "entropy": 0.44313955307006836, |
| "epoch": 0.02, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 19.5, |
| "kl": 0.0014339183690026402, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": -0.1594, |
| "num_tokens": 150213.0, |
| "reward": -0.05999999865889549, |
| "reward_std": 0.0707106813788414, |
| "rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549, |
| "rewards/alfworld_rollout_reward_func/std": 0.0707106739282608, |
| "sampling/importance_sampling_ratio/max": 1.1801482439041138, |
| "sampling/importance_sampling_ratio/mean": 0.9998883008956909, |
| "sampling/importance_sampling_ratio/min": 0.8196282982826233, |
| "sampling/sampling_logp_difference/max": 0.21845340728759766, |
| "sampling/sampling_logp_difference/mean": 0.022122323513031006, |
| "step": 19, |
| "step_time": 15.372385936 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 39.0, |
| "completions/max_terminated_length": 39.0, |
| "completions/mean_length": 36.5, |
| "completions/mean_terminated_length": 36.5, |
| "completions/min_length": 34.0, |
| "completions/min_terminated_length": 34.0, |
| "entropy": 0.26607948541641235, |
| "epoch": 0.021052631578947368, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 21.125, |
| "kl": 0.0008539336849935353, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": -0.0343, |
| "num_tokens": 157583.0, |
| "reward": 0.45500001311302185, |
| "reward_std": 0.742462158203125, |
| "rewards/alfworld_rollout_reward_func/mean": 0.45500001311302185, |
| "rewards/alfworld_rollout_reward_func/std": 0.7424620985984802, |
| "sampling/importance_sampling_ratio/max": 0.9992303848266602, |
| "sampling/importance_sampling_ratio/mean": 0.8897002339363098, |
| "sampling/importance_sampling_ratio/min": 0.7801700830459595, |
| "sampling/sampling_logp_difference/max": 0.2817434072494507, |
| "sampling/sampling_logp_difference/mean": 0.01816781423985958, |
| "step": 20, |
| "step_time": 21.228662558999986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 26.0, |
| "completions/max_terminated_length": 26.0, |
| "completions/mean_length": 19.5, |
| "completions/mean_terminated_length": 19.5, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 0.28872889280319214, |
| "epoch": 0.022105263157894735, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 18.75, |
| "kl": 0.00026875274488702416, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.191, |
| "num_tokens": 162725.0, |
| "reward": 0.9650000333786011, |
| "reward_std": 0.04949747025966644, |
| "rewards/alfworld_rollout_reward_func/mean": 0.9650000333786011, |
| "rewards/alfworld_rollout_reward_func/std": 0.04949747025966644, |
| "sampling/importance_sampling_ratio/max": 0.9999924898147583, |
| "sampling/importance_sampling_ratio/mean": 0.9529882669448853, |
| "sampling/importance_sampling_ratio/min": 0.905984103679657, |
| "sampling/sampling_logp_difference/max": 0.13952183723449707, |
| "sampling/sampling_logp_difference/mean": 0.012355787679553032, |
| "step": 21, |
| "step_time": 8.206261441999914 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 41.0, |
| "completions/max_terminated_length": 41.0, |
| "completions/mean_length": 27.0, |
| "completions/mean_terminated_length": 27.0, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 0.4186505675315857, |
| "epoch": 0.023157894736842106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 70.0, |
| "kl": 0.0038073172327131033, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.6479, |
| "num_tokens": 170762.0, |
| "reward": 0.4650000035762787, |
| "reward_std": 0.7566042542457581, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, |
| "rewards/alfworld_rollout_reward_func/std": 0.7566042542457581, |
| "sampling/importance_sampling_ratio/max": 1.66000235080719, |
| "sampling/importance_sampling_ratio/mean": 1.544608235359192, |
| "sampling/importance_sampling_ratio/min": 1.4292141199111938, |
| "sampling/sampling_logp_difference/max": 0.23494195938110352, |
| "sampling/sampling_logp_difference/mean": 0.03184577822685242, |
| "step": 22, |
| "step_time": 17.66746830000011 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 44.0, |
| "completions/max_terminated_length": 44.0, |
| "completions/mean_length": 34.5, |
| "completions/mean_terminated_length": 34.5, |
| "completions/min_length": 25.0, |
| "completions/min_terminated_length": 25.0, |
| "entropy": 0.3005879819393158, |
| "epoch": 0.024210526315789474, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 20.5, |
| "kl": 0.02588764950633049, |
| "learning_rate": 4.4e-06, |
| "loss": 0.1134, |
| "num_tokens": 178722.0, |
| "reward": 0.48000001907348633, |
| "reward_std": 0.6929646730422974, |
| "rewards/alfworld_rollout_reward_func/mean": 0.48000001907348633, |
| "rewards/alfworld_rollout_reward_func/std": 0.6929646730422974, |
| "sampling/importance_sampling_ratio/max": 0.9999445676803589, |
| "sampling/importance_sampling_ratio/mean": 0.6585712432861328, |
| "sampling/importance_sampling_ratio/min": 0.3171979784965515, |
| "sampling/sampling_logp_difference/max": 1.331534504890442, |
| "sampling/sampling_logp_difference/mean": 0.03551221266388893, |
| "step": 23, |
| "step_time": 14.36746498499997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 35.0, |
| "completions/max_terminated_length": 35.0, |
| "completions/mean_length": 23.5, |
| "completions/mean_terminated_length": 23.5, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 0.3918939232826233, |
| "epoch": 0.02526315789473684, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 43.25, |
| "kl": 0.0027948389761149883, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.5853, |
| "num_tokens": 185227.0, |
| "reward": 0.4650000035762787, |
| "reward_std": 0.7424620389938354, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, |
| "rewards/alfworld_rollout_reward_func/std": 0.7424620389938354, |
| "sampling/importance_sampling_ratio/max": 1.4053035974502563, |
| "sampling/importance_sampling_ratio/mean": 1.130873203277588, |
| "sampling/importance_sampling_ratio/min": 0.8564428091049194, |
| "sampling/sampling_logp_difference/max": 0.16124820709228516, |
| "sampling/sampling_logp_difference/mean": 0.016462432220578194, |
| "step": 24, |
| "step_time": 11.062920657999939 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 31.0, |
| "completions/max_terminated_length": 31.0, |
| "completions/mean_length": 29.0, |
| "completions/mean_terminated_length": 29.0, |
| "completions/min_length": 27.0, |
| "completions/min_terminated_length": 27.0, |
| "entropy": 0.2529997229576111, |
| "epoch": 0.02631578947368421, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.018310546875, |
| "kl": 0.0004380304308142513, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 193142.0, |
| "reward": -0.019999999552965164, |
| "reward_std": 0.0, |
| "rewards/alfworld_rollout_reward_func/mean": -0.019999999552965164, |
| "rewards/alfworld_rollout_reward_func/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.0052870512008667, |
| "sampling/importance_sampling_ratio/mean": 0.8423590660095215, |
| "sampling/importance_sampling_ratio/min": 0.6794310212135315, |
| "sampling/sampling_logp_difference/max": 0.18489933013916016, |
| "sampling/sampling_logp_difference/mean": 0.014295091852545738, |
| "step": 25, |
| "step_time": 17.847312069000054 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 49.0, |
| "completions/max_terminated_length": 49.0, |
| "completions/mean_length": 31.0, |
| "completions/mean_terminated_length": 31.0, |
| "completions/min_length": 13.0, |
| "completions/min_terminated_length": 13.0, |
| "entropy": 0.3845043182373047, |
| "epoch": 0.02736842105263158, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 50.0, |
| "kl": 0.002633685013279319, |
| "learning_rate": 5e-06, |
| "loss": -0.273, |
| "num_tokens": 200881.0, |
| "reward": -0.10999999940395355, |
| "reward_std": 0.014142133295536041, |
| "rewards/alfworld_rollout_reward_func/mean": -0.10999999940395355, |
| "rewards/alfworld_rollout_reward_func/std": 0.014142133295536041, |
| "sampling/importance_sampling_ratio/max": 1.0188355445861816, |
| "sampling/importance_sampling_ratio/mean": 0.8905454874038696, |
| "sampling/importance_sampling_ratio/min": 0.7622554302215576, |
| "sampling/sampling_logp_difference/max": 0.29382169246673584, |
| "sampling/sampling_logp_difference/mean": 0.03841578587889671, |
| "step": 26, |
| "step_time": 22.61463799199987 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 26.0, |
| "completions/max_terminated_length": 26.0, |
| "completions/mean_length": 17.0, |
| "completions/mean_terminated_length": 17.0, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 0.32032543420791626, |
| "epoch": 0.028421052631578948, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 33.75, |
| "kl": 0.0024931079242378473, |
| "learning_rate": 5.2e-06, |
| "loss": 0.3737, |
| "num_tokens": 208999.0, |
| "reward": 0.4950000047683716, |
| "reward_std": 0.7141778469085693, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4950000047683716, |
| "rewards/alfworld_rollout_reward_func/std": 0.7141778469085693, |
| "sampling/importance_sampling_ratio/max": 1.1281967163085938, |
| "sampling/importance_sampling_ratio/mean": 1.083227276802063, |
| "sampling/importance_sampling_ratio/min": 1.0382578372955322, |
| "sampling/sampling_logp_difference/max": 0.17356586456298828, |
| "sampling/sampling_logp_difference/mean": 0.017684968188405037, |
| "step": 27, |
| "step_time": 11.613979460999872 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14.0, |
| "completions/max_terminated_length": 14.0, |
| "completions/mean_length": 11.0, |
| "completions/mean_terminated_length": 11.0, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 0.05829498916864395, |
| "epoch": 0.029473684210526315, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 32.25, |
| "kl": 0.0006212808657437563, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": -0.2479, |
| "num_tokens": 214157.0, |
| "reward": 0.4650000035762787, |
| "reward_std": 0.7424620389938354, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4650000035762787, |
| "rewards/alfworld_rollout_reward_func/std": 0.7424620389938354, |
| "sampling/importance_sampling_ratio/max": 0.9999884366989136, |
| "sampling/importance_sampling_ratio/mean": 0.8929275274276733, |
| "sampling/importance_sampling_ratio/min": 0.7858666181564331, |
| "sampling/sampling_logp_difference/max": 0.21855998039245605, |
| "sampling/sampling_logp_difference/mean": 0.01126509066671133, |
| "step": 28, |
| "step_time": 9.151633475999915 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 37.0, |
| "completions/max_terminated_length": 37.0, |
| "completions/mean_length": 36.5, |
| "completions/mean_terminated_length": 36.5, |
| "completions/min_length": 36.0, |
| "completions/min_terminated_length": 36.0, |
| "entropy": 0.46773216128349304, |
| "epoch": 0.030526315789473683, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 86.0, |
| "kl": 0.015533313155174255, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": -0.5057, |
| "num_tokens": 222111.0, |
| "reward": -0.03500000014901161, |
| "reward_std": 0.02121320366859436, |
| "rewards/alfworld_rollout_reward_func/mean": -0.03500000014901161, |
| "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, |
| "sampling/importance_sampling_ratio/max": 2.2050247192382812, |
| "sampling/importance_sampling_ratio/mean": 1.4661935567855835, |
| "sampling/importance_sampling_ratio/min": 0.727362334728241, |
| "sampling/sampling_logp_difference/max": 1.248981237411499, |
| "sampling/sampling_logp_difference/mean": 0.042725156992673874, |
| "step": 29, |
| "step_time": 20.633877447999794 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 44.0, |
| "completions/max_terminated_length": 44.0, |
| "completions/mean_length": 28.0, |
| "completions/mean_terminated_length": 28.0, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 0.18183040618896484, |
| "epoch": 0.031578947368421054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 29.625, |
| "kl": 0.0012305846903473139, |
| "learning_rate": 5.8e-06, |
| "loss": 0.3004, |
| "num_tokens": 229439.0, |
| "reward": 0.44999998807907104, |
| "reward_std": 0.7778174877166748, |
| "rewards/alfworld_rollout_reward_func/mean": 0.44999998807907104, |
| "rewards/alfworld_rollout_reward_func/std": 0.7778174877166748, |
| "sampling/importance_sampling_ratio/max": 0.9991921782493591, |
| "sampling/importance_sampling_ratio/mean": 0.9062168598175049, |
| "sampling/importance_sampling_ratio/min": 0.8132414817810059, |
| "sampling/sampling_logp_difference/max": 0.1453406810760498, |
| "sampling/sampling_logp_difference/mean": 0.007052628789097071, |
| "step": 30, |
| "step_time": 15.40591485699997 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 56.0, |
| "completions/max_terminated_length": 56.0, |
| "completions/mean_length": 49.5, |
| "completions/mean_terminated_length": 49.5, |
| "completions/min_length": 43.0, |
| "completions/min_terminated_length": 43.0, |
| "entropy": 0.3357900381088257, |
| "epoch": 0.03263157894736842, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 17.875, |
| "kl": 0.004221746232360601, |
| "learning_rate": 6e-06, |
| "loss": -0.0807, |
| "num_tokens": 237387.0, |
| "reward": -0.05999999865889549, |
| "reward_std": 0.02828427031636238, |
| "rewards/alfworld_rollout_reward_func/mean": -0.05999999865889549, |
| "rewards/alfworld_rollout_reward_func/std": 0.02828427031636238, |
| "sampling/importance_sampling_ratio/max": 0.8237836956977844, |
| "sampling/importance_sampling_ratio/mean": 0.8164201974868774, |
| "sampling/importance_sampling_ratio/min": 0.8090566992759705, |
| "sampling/sampling_logp_difference/max": 0.2831292152404785, |
| "sampling/sampling_logp_difference/mean": 0.024411508813500404, |
| "step": 31, |
| "step_time": 19.268974757000024 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 33.0, |
| "completions/max_terminated_length": 33.0, |
| "completions/mean_length": 32.0, |
| "completions/mean_terminated_length": 32.0, |
| "completions/min_length": 31.0, |
| "completions/min_terminated_length": 31.0, |
| "entropy": 0.2340346872806549, |
| "epoch": 0.03368421052631579, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.049560546875, |
| "kl": 0.0027104970067739487, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 0.0, |
| "num_tokens": 245449.0, |
| "reward": -0.05000000074505806, |
| "reward_std": 0.0, |
| "rewards/alfworld_rollout_reward_func/mean": -0.05000000074505806, |
| "rewards/alfworld_rollout_reward_func/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.6312272548675537, |
| "sampling/importance_sampling_ratio/mean": 1.798392653465271, |
| "sampling/importance_sampling_ratio/min": 0.9655579924583435, |
| "sampling/sampling_logp_difference/max": 0.5748621225357056, |
| "sampling/sampling_logp_difference/mean": 0.029400669038295746, |
| "step": 32, |
| "step_time": 24.658211100000017 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 31.0, |
| "completions/max_terminated_length": 31.0, |
| "completions/mean_length": 20.5, |
| "completions/mean_terminated_length": 20.5, |
| "completions/min_length": 10.0, |
| "completions/min_terminated_length": 10.0, |
| "entropy": 0.050181157886981964, |
| "epoch": 0.034736842105263156, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 19.125, |
| "kl": 0.003014294197782874, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": -0.3172, |
| "num_tokens": 253476.0, |
| "reward": -0.014999999664723873, |
| "reward_std": 0.007071067579090595, |
| "rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071067579090595, |
| "sampling/importance_sampling_ratio/max": 1.2256042957305908, |
| "sampling/importance_sampling_ratio/mean": 1.1113841533660889, |
| "sampling/importance_sampling_ratio/min": 0.9971638917922974, |
| "sampling/sampling_logp_difference/max": 0.23240363597869873, |
| "sampling/sampling_logp_difference/mean": 0.006807921454310417, |
| "step": 33, |
| "step_time": 12.894395297000074 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13.0, |
| "completions/max_terminated_length": 13.0, |
| "completions/mean_length": 12.0, |
| "completions/mean_terminated_length": 12.0, |
| "completions/min_length": 11.0, |
| "completions/min_terminated_length": 11.0, |
| "entropy": 8.056841761572286e-05, |
| "epoch": 0.035789473684210524, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.031494140625, |
| "kl": 9.934107758624577e-09, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": -0.0589, |
| "num_tokens": 259774.0, |
| "reward": 0.49000000953674316, |
| "reward_std": 0.7212488651275635, |
| "rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316, |
| "rewards/alfworld_rollout_reward_func/std": 0.7212488651275635, |
| "sampling/importance_sampling_ratio/max": 1.0000007152557373, |
| "sampling/importance_sampling_ratio/mean": 0.9999703764915466, |
| "sampling/importance_sampling_ratio/min": 0.999940037727356, |
| "sampling/sampling_logp_difference/max": 5.8182922657579184e-05, |
| "sampling/sampling_logp_difference/mean": 2.677608563317335e-06, |
| "step": 34, |
| "step_time": 13.956647035999822 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 62.0, |
| "completions/max_terminated_length": 62.0, |
| "completions/mean_length": 49.0, |
| "completions/mean_terminated_length": 49.0, |
| "completions/min_length": 36.0, |
| "completions/min_terminated_length": 36.0, |
| "entropy": 0.489044189453125, |
| "epoch": 0.03684210526315789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 46.0, |
| "kl": 0.004022765904664993, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": -0.8836, |
| "num_tokens": 267539.0, |
| "reward": 0.9549999833106995, |
| "reward_std": 0.007071061059832573, |
| "rewards/alfworld_rollout_reward_func/mean": 0.9549999833106995, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071061059832573, |
| "sampling/importance_sampling_ratio/max": 2.5752766132354736, |
| "sampling/importance_sampling_ratio/mean": 1.780219316482544, |
| "sampling/importance_sampling_ratio/min": 0.985162079334259, |
| "sampling/sampling_logp_difference/max": 0.38198375701904297, |
| "sampling/sampling_logp_difference/mean": 0.03108775056898594, |
| "step": 35, |
| "step_time": 13.362450941999896 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 29.0, |
| "completions/max_terminated_length": 29.0, |
| "completions/mean_length": 28.5, |
| "completions/mean_terminated_length": 28.5, |
| "completions/min_length": 28.0, |
| "completions/min_terminated_length": 28.0, |
| "entropy": 0.03428112342953682, |
| "epoch": 0.037894736842105266, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.005950927734375, |
| "kl": 9.688996215118095e-05, |
| "learning_rate": 7e-06, |
| "loss": 0.0, |
| "num_tokens": 275461.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/alfworld_rollout_reward_func/mean": 0.0, |
| "rewards/alfworld_rollout_reward_func/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.0216329097747803, |
| "sampling/importance_sampling_ratio/mean": 1.010430932044983, |
| "sampling/importance_sampling_ratio/min": 0.9992288947105408, |
| "sampling/sampling_logp_difference/max": 0.04146456718444824, |
| "sampling/sampling_logp_difference/mean": 0.0011078877141699195, |
| "step": 36, |
| "step_time": 16.18710341299993 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 51.0, |
| "completions/max_terminated_length": 51.0, |
| "completions/mean_length": 49.0, |
| "completions/mean_terminated_length": 49.0, |
| "completions/min_length": 47.0, |
| "completions/min_terminated_length": 47.0, |
| "entropy": 0.375491201877594, |
| "epoch": 0.03894736842105263, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 27.375, |
| "kl": 0.008600625209510326, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.3645, |
| "num_tokens": 283600.0, |
| "reward": -0.08500000089406967, |
| "reward_std": 0.02121320366859436, |
| "rewards/alfworld_rollout_reward_func/mean": -0.08500000089406967, |
| "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, |
| "sampling/importance_sampling_ratio/max": 1.302950382232666, |
| "sampling/importance_sampling_ratio/mean": 0.8185228705406189, |
| "sampling/importance_sampling_ratio/min": 0.3340953290462494, |
| "sampling/sampling_logp_difference/max": 0.3411126136779785, |
| "sampling/sampling_logp_difference/mean": 0.03013680875301361, |
| "step": 37, |
| "step_time": 20.539627713000073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12.0, |
| "completions/max_terminated_length": 12.0, |
| "completions/mean_length": 8.0, |
| "completions/mean_terminated_length": 8.0, |
| "completions/min_length": 4.0, |
| "completions/min_terminated_length": 4.0, |
| "entropy": 0.09109717607498169, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 170.0, |
| "kl": 0.019233860075473785, |
| "learning_rate": 7.4e-06, |
| "loss": 0.2797, |
| "num_tokens": 291611.0, |
| "reward": 0.49000000953674316, |
| "reward_std": 0.7071067690849304, |
| "rewards/alfworld_rollout_reward_func/mean": 0.49000000953674316, |
| "rewards/alfworld_rollout_reward_func/std": 0.7071067690849304, |
| "sampling/importance_sampling_ratio/max": 1.4181230068206787, |
| "sampling/importance_sampling_ratio/mean": 1.2089695930480957, |
| "sampling/importance_sampling_ratio/min": 0.9998162984848022, |
| "sampling/sampling_logp_difference/max": 0.3511829376220703, |
| "sampling/sampling_logp_difference/mean": 0.022082466632127762, |
| "step": 38, |
| "step_time": 13.460569201999988 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 47.0, |
| "completions/max_terminated_length": 47.0, |
| "completions/mean_length": 31.0, |
| "completions/mean_terminated_length": 31.0, |
| "completions/min_length": 15.0, |
| "completions/min_terminated_length": 15.0, |
| "entropy": 0.35754403471946716, |
| "epoch": 0.04105263157894737, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 8.4375, |
| "kl": 0.01082681491971016, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": -0.0723, |
| "num_tokens": 299145.0, |
| "reward": 0.4350000023841858, |
| "reward_std": 0.7990306615829468, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4350000023841858, |
| "rewards/alfworld_rollout_reward_func/std": 0.7990306615829468, |
| "sampling/importance_sampling_ratio/max": 1.0000016689300537, |
| "sampling/importance_sampling_ratio/mean": 0.5920178294181824, |
| "sampling/importance_sampling_ratio/min": 0.18403403460979462, |
| "sampling/sampling_logp_difference/max": 0.6585979461669922, |
| "sampling/sampling_logp_difference/mean": 0.041399676352739334, |
| "step": 39, |
| "step_time": 15.72952194599975 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 37.0, |
| "completions/max_terminated_length": 37.0, |
| "completions/mean_length": 22.5, |
| "completions/mean_terminated_length": 22.5, |
| "completions/min_length": 8.0, |
| "completions/min_terminated_length": 8.0, |
| "entropy": 0.35843947529792786, |
| "epoch": 0.042105263157894736, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 28.625, |
| "kl": 0.0046605560928583145, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": -0.3742, |
| "num_tokens": 307197.0, |
| "reward": -0.009999999776482582, |
| "reward_std": 0.01414213515818119, |
| "rewards/alfworld_rollout_reward_func/mean": -0.009999999776482582, |
| "rewards/alfworld_rollout_reward_func/std": 0.01414213515818119, |
| "sampling/importance_sampling_ratio/max": 1.0000009536743164, |
| "sampling/importance_sampling_ratio/mean": 0.9322052001953125, |
| "sampling/importance_sampling_ratio/min": 0.8644094467163086, |
| "sampling/sampling_logp_difference/max": 0.34821510314941406, |
| "sampling/sampling_logp_difference/mean": 0.02805318869650364, |
| "step": 40, |
| "step_time": 13.67377691799993 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12.0, |
| "completions/max_terminated_length": 12.0, |
| "completions/mean_length": 12.0, |
| "completions/mean_terminated_length": 12.0, |
| "completions/min_length": 12.0, |
| "completions/min_terminated_length": 12.0, |
| "entropy": 0.028120441362261772, |
| "epoch": 0.0431578947368421, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 92.5, |
| "kl": 0.0015685707330703735, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0001, |
| "num_tokens": 313777.0, |
| "reward": 0.9399999976158142, |
| "reward_std": 0.05656857416033745, |
| "rewards/alfworld_rollout_reward_func/mean": 0.9399999976158142, |
| "rewards/alfworld_rollout_reward_func/std": 0.05656857416033745, |
| "sampling/importance_sampling_ratio/max": 1.0000027418136597, |
| "sampling/importance_sampling_ratio/mean": 0.9999538064002991, |
| "sampling/importance_sampling_ratio/min": 0.9999048709869385, |
| "sampling/sampling_logp_difference/max": 9.777725790627301e-05, |
| "sampling/sampling_logp_difference/mean": 4.304089543438749e-06, |
| "step": 41, |
| "step_time": 11.189458066000043 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 40.0, |
| "completions/max_terminated_length": 40.0, |
| "completions/mean_length": 36.0, |
| "completions/mean_terminated_length": 36.0, |
| "completions/min_length": 32.0, |
| "completions/min_terminated_length": 32.0, |
| "entropy": 0.24259884655475616, |
| "epoch": 0.04421052631578947, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 16.75, |
| "kl": 0.0016654051141813397, |
| "learning_rate": 8.2e-06, |
| "loss": 0.1025, |
| "num_tokens": 321642.0, |
| "reward": -0.014999999664723873, |
| "reward_std": 0.02121320366859436, |
| "rewards/alfworld_rollout_reward_func/mean": -0.014999999664723873, |
| "rewards/alfworld_rollout_reward_func/std": 0.02121320366859436, |
| "sampling/importance_sampling_ratio/max": 1.060155987739563, |
| "sampling/importance_sampling_ratio/mean": 1.0288386344909668, |
| "sampling/importance_sampling_ratio/min": 0.9975212216377258, |
| "sampling/sampling_logp_difference/max": 0.16695499420166016, |
| "sampling/sampling_logp_difference/mean": 0.009143915958702564, |
| "step": 42, |
| "step_time": 22.64027631700037 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 38.0, |
| "completions/max_terminated_length": 38.0, |
| "completions/mean_length": 23.5, |
| "completions/mean_terminated_length": 23.5, |
| "completions/min_length": 9.0, |
| "completions/min_terminated_length": 9.0, |
| "entropy": 0.20157602429389954, |
| "epoch": 0.045263157894736845, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 63.25, |
| "kl": 0.022462664172053337, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 0.2766, |
| "num_tokens": 325516.0, |
| "reward": 0.9850000143051147, |
| "reward_std": 0.007071061059832573, |
| "rewards/alfworld_rollout_reward_func/mean": 0.9850000143051147, |
| "rewards/alfworld_rollout_reward_func/std": 0.007071061059832573, |
| "sampling/importance_sampling_ratio/max": 0.9990339875221252, |
| "sampling/importance_sampling_ratio/mean": 0.8629282712936401, |
| "sampling/importance_sampling_ratio/min": 0.7268226146697998, |
| "sampling/sampling_logp_difference/max": 0.17243313789367676, |
| "sampling/sampling_logp_difference/mean": 0.015407886356115341, |
| "step": 43, |
| "step_time": 5.622832681000091 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 35.0, |
| "completions/max_terminated_length": 35.0, |
| "completions/mean_length": 33.5, |
| "completions/mean_terminated_length": 33.5, |
| "completions/min_length": 32.0, |
| "completions/min_terminated_length": 32.0, |
| "entropy": 0.2535250782966614, |
| "epoch": 0.04631578947368421, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 19.25, |
| "kl": 0.004778198432177305, |
| "learning_rate": 8.6e-06, |
| "loss": 0.0797, |
| "num_tokens": 333519.0, |
| "reward": 0.4699999988079071, |
| "reward_std": 0.7495331764221191, |
| "rewards/alfworld_rollout_reward_func/mean": 0.4699999988079071, |
| "rewards/alfworld_rollout_reward_func/std": 0.7495331764221191, |
| "sampling/importance_sampling_ratio/max": 0.7066987752914429, |
| "sampling/importance_sampling_ratio/mean": 0.6219298839569092, |
| "sampling/importance_sampling_ratio/min": 0.5371610522270203, |
| "sampling/sampling_logp_difference/max": 0.34627819061279297, |
| "sampling/sampling_logp_difference/mean": 0.017883947119116783, |
| "step": 44, |
| "step_time": 15.114469847999771 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 4750, |
| "num_input_tokens_seen": 333519, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|