{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.16221363842487335, "kl": -3.4924596548080444e-10, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 2580.0, "reward": -0.22999998927116394, "reward_std": 1.088944435119629, "rewards/reward_environment_execution/mean": -0.23000000417232513, "rewards/reward_environment_execution/std": 0.3818376660346985, "rewards/reward_format_compliance/mean": -0.15000000596046448, "rewards/reward_format_compliance/std": 0.4949747622013092, "rewards/reward_investigation_quality/mean": 0.15000000596046448, "rewards/reward_investigation_quality/std": 0.2121320366859436, "rewards/reward_os_mechanics/mean": 0.0, "rewards/reward_os_mechanics/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.5, "frac_reward_zero_std": 1.0, "grad_norm": 3.505946644111191e-09, "kl": -4.656612873077393e-10, "learning_rate": 5.000000000000001e-07, "loss": -1.8626451075975936e-11, "num_tokens": 5158.0, "reward": 0.5400000214576721, "reward_std": 0.0, "rewards/reward_environment_execution/mean": 0.03999999910593033, "rewards/reward_environment_execution/std": 0.0, "rewards/reward_format_compliance/mean": 0.20000000298023224, "rewards/reward_format_compliance/std": 0.0, "rewards/reward_investigation_quality/mean": 0.30000001192092896, "rewards/reward_investigation_quality/std": 0.0, "rewards/reward_os_mechanics/mean": 0.0, "rewards/reward_os_mechanics/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.20935378968715668, "kl": 5.7238503359258175e-06, "learning_rate": 1.0000000000000002e-06, "loss": 2.384185791015625e-07, "num_tokens": 7736.0, "reward": -0.22999998927116394, "reward_std": 1.088944435119629, "rewards/reward_environment_execution/mean": -0.23000000417232513, "rewards/reward_environment_execution/std": 0.3818376660346985, "rewards/reward_format_compliance/mean": -0.15000000596046448, "rewards/reward_format_compliance/std": 0.4949747622013092, "rewards/reward_investigation_quality/mean": 0.15000000596046448, "rewards/reward_investigation_quality/std": 0.2121320366859436, "rewards/reward_os_mechanics/mean": 0.0, "rewards/reward_os_mechanics/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.16330939531326294, "kl": 5.532288923859596e-06, "learning_rate": 1.5e-06, "loss": 2.384185791015625e-07, "num_tokens": 10392.0, "reward": -0.22999998927116394, "reward_std": 1.088944435119629, "rewards/reward_environment_execution/mean": -0.23000000417232513, "rewards/reward_environment_execution/std": 0.3818376660346985, "rewards/reward_format_compliance/mean": -0.15000000596046448, "rewards/reward_format_compliance/std": 0.4949747622013092, "rewards/reward_investigation_quality/mean": 0.15000000596046448, "rewards/reward_investigation_quality/std": 0.2121320366859436, "rewards/reward_os_mechanics/mean": 0.0, "rewards/reward_os_mechanics/std": 0.0, "step": 4 } ], "logging_steps": 1, "max_steps": 4, "num_input_tokens_seen": 10392, "num_train_epochs": 1, "save_steps": 999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }