{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.30175015087507545,
  "eval_steps": 250,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.12125,
      "completions/max_length": 255.34,
      "completions/max_terminated_length": 252.14,
      "completions/mean_length": 221.534375,
      "completions/mean_terminated_length": 216.93697082519532,
      "completions/min_length": 173.54,
      "completions/min_terminated_length": 173.54,
      "entropy": 0.10048629969358444,
      "epoch": 0.030175015087507542,
      "frac_reward_zero_std": 0.3225,
      "grad_norm": 0.46380576491355896,
      "learning_rate": 5e-05,
      "loss": 0.004,
      "num_tokens": 8142396.0,
      "reward": 7.30375,
      "reward_std": 1.5006456315517425,
      "rewards/event_reward_fn/mean": 7.30375,
      "rewards/event_reward_fn/std": 6.278198585510254,
      "step": 50,
      "step_time": 40.824848868116966
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.068125,
      "completions/max_length": 251.74,
      "completions/max_terminated_length": 248.06,
      "completions/mean_length": 215.08625,
      "completions/mean_terminated_length": 212.25316284179686,
      "completions/min_length": 171.76,
      "completions/min_terminated_length": 171.76,
      "entropy": 0.10318506792187691,
      "epoch": 0.060350030175015085,
      "frac_reward_zero_std": 0.325,
      "grad_norm": 0.21978232264518738,
      "learning_rate": 5e-05,
      "loss": -0.0025,
      "num_tokens": 16421719.0,
      "reward": 7.36875,
      "reward_std": 1.3263894939422607,
      "rewards/event_reward_fn/mean": 7.36875,
      "rewards/event_reward_fn/std": 6.119045643806458,
      "step": 100,
      "step_time": 38.99798643006128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4825,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 251.32,
      "completions/mean_length": 238.104375,
      "completions/mean_terminated_length": 221.8957485961914,
      "completions/min_length": 191.34,
      "completions/min_terminated_length": 191.34,
      "entropy": 0.10444845259189606,
      "epoch": 0.09052504526252263,
      "frac_reward_zero_std": 0.2925,
      "grad_norm": 0.5579063892364502,
      "learning_rate": 5e-05,
      "loss": -0.0006,
      "num_tokens": 24885844.0,
      "reward": 7.74625,
      "reward_std": 1.5345598912239076,
      "rewards/event_reward_fn/mean": 7.74625,
      "rewards/event_reward_fn/std": 6.464660973548889,
      "step": 150,
      "step_time": 41.26081488572061
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7925,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 202.92,
      "completions/mean_length": 245.916875,
      "completions/mean_terminated_length": 184.6587713623047,
      "completions/min_length": 199.94,
      "completions/min_terminated_length": 169.22,
      "entropy": 0.10581055819988251,
      "epoch": 0.12070006035003017,
      "frac_reward_zero_std": 0.33,
      "grad_norm": 0.31808722019195557,
      "learning_rate": 5e-05,
      "loss": 0.0003,
      "num_tokens": 33226966.0,
      "reward": 7.19125,
      "reward_std": 1.4298825466632843,
      "rewards/event_reward_fn/mean": 7.19125,
      "rewards/event_reward_fn/std": 5.8599746036529545,
      "step": 200,
      "step_time": 41.91275953448203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.825,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 181.12,
      "completions/mean_length": 245.851875,
      "completions/mean_terminated_length": 163.72261688232422,
      "completions/min_length": 198.46,
      "completions/min_terminated_length": 152.38,
      "entropy": 0.10499135926365852,
      "epoch": 0.15087507543753773,
      "frac_reward_zero_std": 0.2875,
      "grad_norm": 0.2646925449371338,
      "learning_rate": 5e-05,
      "loss": 0.0005,
      "num_tokens": 41523308.0,
      "reward": 7.9475,
      "reward_std": 1.5300491595268249,
      "rewards/event_reward_fn/mean": 7.9475,
      "rewards/event_reward_fn/std": 6.3965685844421385,
      "step": 250,
      "step_time": 41.663273623897695
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.898125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 149.62,
      "completions/mean_length": 250.625625,
      "completions/mean_terminated_length": 144.78653198242188,
      "completions/min_length": 215.68,
      "completions/min_terminated_length": 138.88,
      "entropy": 0.10884671121835708,
      "epoch": 0.18105009052504525,
      "frac_reward_zero_std": 0.3325,
      "grad_norm": 0.5418329834938049,
      "learning_rate": 5e-05,
      "loss": -0.0002,
      "num_tokens": 49889481.0,
      "reward": 7.489375,
      "reward_std": 1.5504147619009019,
      "rewards/event_reward_fn/mean": 7.489375,
      "rewards/event_reward_fn/std": 6.099679977893829,
      "step": 300,
      "step_time": 40.817094522019616
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9275,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 131.48,
      "completions/mean_length": 253.1625,
      "completions/mean_terminated_length": 125.53590209960937,
      "completions/min_length": 228.04,
      "completions/min_terminated_length": 120.52,
      "entropy": 0.10796756476163864,
      "epoch": 0.2112251056125528,
      "frac_reward_zero_std": 0.3175,
      "grad_norm": 0.4433981776237488,
      "learning_rate": 5e-05,
      "loss": 0.0019,
      "num_tokens": 58206892.0,
      "reward": 7.89625,
      "reward_std": 1.573977051973343,
      "rewards/event_reward_fn/mean": 7.89625,
      "rewards/event_reward_fn/std": 6.586006484031677,
      "step": 350,
      "step_time": 42.12015992245928
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.945625,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 151.42,
      "completions/mean_length": 254.76625,
      "completions/mean_terminated_length": 146.34000091552736,
      "completions/min_length": 238.42,
      "completions/min_terminated_length": 141.14,
      "entropy": 0.11530103281140328,
      "epoch": 0.24140012070006034,
      "frac_reward_zero_std": 0.29,
      "grad_norm": 0.3932775855064392,
      "learning_rate": 5e-05,
      "loss": 0.0001,
      "num_tokens": 66513664.0,
      "reward": 7.304375,
      "reward_std": 1.552179645895958,
      "rewards/event_reward_fn/mean": 7.304375,
      "rewards/event_reward_fn/std": 5.687906408309937,
      "step": 400,
      "step_time": 40.78123372233997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.92375,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 185.1,
      "completions/mean_length": 254.35875,
      "completions/mean_terminated_length": 178.61883544921875,
      "completions/min_length": 232.56,
      "completions/min_terminated_length": 171.12,
      "entropy": 0.13443249970674515,
      "epoch": 0.27157513578756787,
      "frac_reward_zero_std": 0.315,
      "grad_norm": 0.2284364551305771,
      "learning_rate": 5e-05,
      "loss": -0.0013,
      "num_tokens": 74493599.0,
      "reward": 7.766875,
      "reward_std": 1.5890911322832109,
      "rewards/event_reward_fn/mean": 7.766875,
      "rewards/event_reward_fn/std": 6.074563751220703,
      "step": 450,
      "step_time": 40.8964025861409
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.983125,
      "completions/max_length": 256.0,
      "completions/max_terminated_length": 73.22,
      "completions/mean_length": 255.728125,
      "completions/mean_terminated_length": 72.48666687011719,
      "completions/min_length": 250.14,
      "completions/min_terminated_length": 70.94,
      "entropy": 0.1348781806230545,
      "epoch": 0.30175015087507545,
      "frac_reward_zero_std": 0.32,
      "grad_norm": 0.44683775305747986,
      "learning_rate": 5e-05,
      "loss": 0.0006,
      "num_tokens": 82766712.0,
      "reward": 7.835625,
      "reward_std": 1.6530324041843414,
      "rewards/event_reward_fn/mean": 7.835625,
      "rewards/event_reward_fn/std": 6.139980282783508,
      "step": 500,
      "step_time": 41.13054014526191
    }
  ],
  "logging_steps": 50,
  "max_steps": 16570,
  "num_input_tokens_seen": 82766712,
  "num_train_epochs": 10,
  "save_steps": 250,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}