{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 746,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05,
      "completions/max_length": 246.3,
      "completions/max_terminated_length": 240.26,
      "completions/mean_length": 195.0275,
      "completions/mean_terminated_length": 191.87969146728517,
      "completions/min_length": 147.76,
      "completions/min_terminated_length": 147.76,
      "entropy": 0.06807037293910981,
      "epoch": 0.06702412868632708,
      "frac_reward_zero_std": 0.4475,
      "grad_norm": 0.1978774070739746,
      "learning_rate": 1e-05,
      "loss": -0.0022,
      "num_tokens": 6268258.0,
      "reward": 12.489985446929932,
      "reward_std": 1.05244723290205,
      "rewards/event_reward_fn/mean": 11.62375,
      "rewards/event_reward_fn/std": 7.598931360244751,
      "rewards/format_reward_fn/mean": 0.8662354218959808,
      "rewards/format_reward_fn/std": 0.24084076710045338,
      "step": 50,
      "step_time": 24.881226640827954
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.043125,
      "completions/max_length": 249.38,
      "completions/max_terminated_length": 244.26,
      "completions/mean_length": 198.4925,
      "completions/mean_terminated_length": 195.8674203491211,
      "completions/min_length": 155.1,
      "completions/min_terminated_length": 155.1,
      "entropy": 0.07096008479595184,
      "epoch": 0.13404825737265416,
      "frac_reward_zero_std": 0.42,
      "grad_norm": 0.31616032123565674,
      "learning_rate": 1e-05,
      "loss": -0.0052,
      "num_tokens": 12603730.0,
      "reward": 11.722552404403686,
      "reward_std": 1.104598103761673,
      "rewards/event_reward_fn/mean": 10.865,
      "rewards/event_reward_fn/std": 7.203483366966248,
      "rewards/format_reward_fn/mean": 0.8575523483753205,
      "rewards/format_reward_fn/std": 0.25920433282852173,
      "step": 100,
      "step_time": 23.881343694739044
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.069375,
      "completions/max_length": 251.04,
      "completions/max_terminated_length": 245.08,
      "completions/mean_length": 201.58125,
      "completions/mean_terminated_length": 197.60445678710937,
      "completions/min_length": 157.96,
      "completions/min_terminated_length": 157.96,
      "entropy": 0.07228697955608368,
      "epoch": 0.20107238605898123,
      "frac_reward_zero_std": 0.41,
      "grad_norm": 0.1767224669456482,
      "learning_rate": 1e-05,
      "loss": 0.002,
      "num_tokens": 19236102.0,
      "reward": 11.989666719436645,
      "reward_std": 1.2850025883316993,
      "rewards/event_reward_fn/mean": 11.1225,
      "rewards/event_reward_fn/std": 7.3152674865722656,
      "rewards/format_reward_fn/mean": 0.8671666479110718,
      "rewards/format_reward_fn/std": 0.24983404949307442,
      "step": 150,
      "step_time": 27.783113366477192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.068125,
      "completions/max_length": 250.62,
      "completions/max_terminated_length": 244.54,
      "completions/mean_length": 201.1025,
      "completions/mean_terminated_length": 197.25198516845703,
      "completions/min_length": 156.4,
      "completions/min_terminated_length": 156.4,
      "entropy": 0.06773373357951641,
      "epoch": 0.2680965147453083,
      "frac_reward_zero_std": 0.415,
      "grad_norm": 0.13261352479457855,
      "learning_rate": 1e-05,
      "loss": -0.0029,
      "num_tokens": 25426958.0,
      "reward": 12.467143926620484,
      "reward_std": 1.1554639112949372,
      "rewards/event_reward_fn/mean": 11.59875,
      "rewards/event_reward_fn/std": 7.149877543449402,
      "rewards/format_reward_fn/mean": 0.8683938610553742,
      "rewards/format_reward_fn/std": 0.24253679752349855,
      "step": 200,
      "step_time": 24.421198091395198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.058125,
      "completions/max_length": 250.22,
      "completions/max_terminated_length": 243.8,
      "completions/mean_length": 200.303125,
      "completions/mean_terminated_length": 196.90248321533204,
      "completions/min_length": 162.42,
      "completions/min_terminated_length": 162.42,
      "entropy": 0.06486415289342404,
      "epoch": 0.3351206434316354,
      "frac_reward_zero_std": 0.385,
      "grad_norm": 0.49442073702812195,
      "learning_rate": 1e-05,
      "loss": -0.0036,
      "num_tokens": 31582342.0,
      "reward": 12.355808296203612,
      "reward_std": 1.1142808997631073,
      "rewards/event_reward_fn/mean": 11.48875,
      "rewards/event_reward_fn/std": 7.448825697898865,
      "rewards/format_reward_fn/mean": 0.8670582604408265,
      "rewards/format_reward_fn/std": 0.24978963822126388,
      "step": 250,
      "step_time": 25.453000083304943
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.04875,
      "completions/max_length": 248.68,
      "completions/max_terminated_length": 244.22,
      "completions/mean_length": 198.759375,
      "completions/mean_terminated_length": 196.16592681884765,
      "completions/min_length": 156.2,
      "completions/min_terminated_length": 156.2,
      "entropy": 0.0681518343836069,
      "epoch": 0.40214477211796246,
      "frac_reward_zero_std": 0.39,
      "grad_norm": 0.48775437474250793,
      "learning_rate": 1e-05,
      "loss": -0.0057,
      "num_tokens": 37800719.0,
      "reward": 12.434584522247315,
      "reward_std": 1.183589797616005,
      "rewards/event_reward_fn/mean": 11.56375,
      "rewards/event_reward_fn/std": 7.52141658782959,
      "rewards/format_reward_fn/mean": 0.8708344352245331,
      "rewards/format_reward_fn/std": 0.23306368254125118,
      "step": 300,
      "step_time": 25.360634116120636
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.034375,
      "completions/max_length": 248.34,
      "completions/max_terminated_length": 245.28,
      "completions/mean_length": 203.264375,
      "completions/mean_terminated_length": 201.32774475097656,
      "completions/min_length": 157.54,
      "completions/min_terminated_length": 157.54,
      "entropy": 0.06739457175135613,
      "epoch": 0.4691689008042895,
      "frac_reward_zero_std": 0.3525,
      "grad_norm": 0.33356958627700806,
      "learning_rate": 1e-05,
      "loss": -0.004,
      "num_tokens": 44150011.0,
      "reward": 13.173797435760498,
      "reward_std": 1.2946509444713592,
      "rewards/event_reward_fn/mean": 12.28875,
      "rewards/event_reward_fn/std": 7.145490102767944,
      "rewards/format_reward_fn/mean": 0.885047378540039,
      "rewards/format_reward_fn/std": 0.22108205765485764,
      "step": 350,
      "step_time": 26.940150288008155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.064375,
      "completions/max_length": 252.9,
      "completions/max_terminated_length": 247.8,
      "completions/mean_length": 203.064375,
      "completions/mean_terminated_length": 199.5350747680664,
      "completions/min_length": 158.26,
      "completions/min_terminated_length": 158.26,
      "entropy": 0.0657703248411417,
      "epoch": 0.5361930294906166,
      "frac_reward_zero_std": 0.435,
      "grad_norm": 0.26359474658966064,
      "learning_rate": 1e-05,
      "loss": -0.0021,
      "num_tokens": 50384400.0,
      "reward": 12.238037357330322,
      "reward_std": 1.057584773004055,
      "rewards/event_reward_fn/mean": 11.37,
      "rewards/event_reward_fn/std": 7.154304637908935,
      "rewards/format_reward_fn/mean": 0.8680373668670655,
      "rewards/format_reward_fn/std": 0.26109003871679304,
      "step": 400,
      "step_time": 25.59800311360508
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.049375,
      "completions/max_length": 249.06,
      "completions/max_terminated_length": 244.9,
      "completions/mean_length": 203.706875,
      "completions/mean_terminated_length": 200.99220581054686,
      "completions/min_length": 161.06,
      "completions/min_terminated_length": 161.06,
      "entropy": 0.06626586891710758,
      "epoch": 0.6032171581769437,
      "frac_reward_zero_std": 0.3775,
      "grad_norm": 0.48660293221473694,
      "learning_rate": 1e-05,
      "loss": -0.004,
      "num_tokens": 56771056.0,
      "reward": 13.009743461608887,
      "reward_std": 1.2429037857055665,
      "rewards/event_reward_fn/mean": 12.130625,
      "rewards/event_reward_fn/std": 7.234463820457458,
      "rewards/format_reward_fn/mean": 0.8791184043884277,
      "rewards/format_reward_fn/std": 0.23800445690751076,
      "step": 450,
      "step_time": 25.550446799769997
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.066875,
      "completions/max_length": 251.92,
      "completions/max_terminated_length": 246.12,
      "completions/mean_length": 204.07625,
      "completions/mean_terminated_length": 200.35590240478516,
      "completions/min_length": 160.74,
      "completions/min_terminated_length": 160.74,
      "entropy": 0.06663089752197265,
      "epoch": 0.6702412868632708,
      "frac_reward_zero_std": 0.4025,
      "grad_norm": 0.6319305300712585,
      "learning_rate": 1e-05,
      "loss": -0.0042,
      "num_tokens": 63078757.0,
      "reward": 12.313038005828858,
      "reward_std": 1.1368902394175529,
      "rewards/event_reward_fn/mean": 11.4575,
      "rewards/event_reward_fn/std": 6.7143393945693965,
      "rewards/format_reward_fn/mean": 0.8555380630493165,
      "rewards/format_reward_fn/std": 0.2657873314619064,
      "step": 500,
      "step_time": 26.24973841637373
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.091875,
      "completions/max_length": 252.82,
      "completions/max_terminated_length": 246.88,
      "completions/mean_length": 203.815,
      "completions/mean_terminated_length": 198.69242126464843,
      "completions/min_length": 161.16,
      "completions/min_terminated_length": 161.16,
      "entropy": 0.06187104433774948,
      "epoch": 0.7372654155495979,
      "frac_reward_zero_std": 0.425,
      "grad_norm": 0.40395304560661316,
      "learning_rate": 1e-05,
      "loss": -0.0025,
      "num_tokens": 69170452.0,
      "reward": 12.482298536300659,
      "reward_std": 1.0457301473617553,
      "rewards/event_reward_fn/mean": 11.64625,
      "rewards/event_reward_fn/std": 7.317771224975586,
      "rewards/format_reward_fn/mean": 0.8360484623908997,
      "rewards/format_reward_fn/std": 0.2895883430540562,
      "step": 550,
      "step_time": 24.193240740820766
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.110625,
      "completions/max_length": 252.62,
      "completions/max_terminated_length": 246.8,
      "completions/mean_length": 208.275625,
      "completions/mean_terminated_length": 202.49910614013672,
      "completions/min_length": 165.54,
      "completions/min_terminated_length": 165.54,
      "entropy": 0.0649487990140915,
      "epoch": 0.8042895442359249,
      "frac_reward_zero_std": 0.38,
      "grad_norm": 0.37119486927986145,
      "learning_rate": 1e-05,
      "loss": 0.0006,
      "num_tokens": 75499314.0,
      "reward": 12.80059557914734,
      "reward_std": 1.1889909988641738,
      "rewards/event_reward_fn/mean": 11.97375,
      "rewards/event_reward_fn/std": 7.475857477188111,
      "rewards/format_reward_fn/mean": 0.8268455564975739,
      "rewards/format_reward_fn/std": 0.29714462146162984,
      "step": 600,
      "step_time": 24.3176869976148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.05625,
      "completions/max_length": 249.28,
      "completions/max_terminated_length": 244.8,
      "completions/mean_length": 202.789375,
      "completions/mean_terminated_length": 199.91522064208985,
      "completions/min_length": 161.74,
      "completions/min_terminated_length": 161.74,
      "entropy": 0.06481640346348286,
      "epoch": 0.871313672922252,
      "frac_reward_zero_std": 0.3975,
      "grad_norm": 0.08866075426340103,
      "learning_rate": 1e-05,
      "loss": -0.0023,
      "num_tokens": 81673001.0,
      "reward": 12.689926280975342,
      "reward_std": 1.2458794575929641,
      "rewards/event_reward_fn/mean": 11.815625,
      "rewards/event_reward_fn/std": 7.275726590156555,
      "rewards/format_reward_fn/mean": 0.8743013119697571,
      "rewards/format_reward_fn/std": 0.23756251022219657,
      "step": 650,
      "step_time": 25.04028965227306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.100625,
      "completions/max_length": 253.72,
      "completions/max_terminated_length": 248.28,
      "completions/mean_length": 205.536875,
      "completions/mean_terminated_length": 200.1349432373047,
      "completions/min_length": 162.16,
      "completions/min_terminated_length": 162.16,
      "entropy": 0.0658975774794817,
      "epoch": 0.938337801608579,
      "frac_reward_zero_std": 0.3975,
      "grad_norm": 0.2268964648246765,
      "learning_rate": 1e-05,
      "loss": -0.0008,
      "num_tokens": 87934795.0,
      "reward": 12.72035478591919,
      "reward_std": 1.1722034803032875,
      "rewards/event_reward_fn/mean": 11.888125,
      "rewards/event_reward_fn/std": 7.583159003257752,
      "rewards/format_reward_fn/mean": 0.8322297859191895,
      "rewards/format_reward_fn/std": 0.29026631206274034,
      "step": 700,
      "step_time": 24.744350045956672
    }
  ],
  "logging_steps": 50,
  "max_steps": 7460,
  "num_input_tokens_seen": 93493541,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}