RLVR-hotpot / trainer_state.json
Byanka's picture
Model save
ffb19b8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 50,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028125,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1430.2,
"completions/mean_length": 166.39462890625,
"completions/mean_terminated_length": 126.74219970703125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.016,
"grad_norm": 0.0278764758259058,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0308,
"num_tokens": 13404233.0,
"reward": 0.435546875,
"reward_std": 0.3221697866916656,
"rewards/accuracy_reward": 0.2009765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.6701171875,
"rewards/mean_confidence_reward": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02431640625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1472.4,
"completions/mean_length": 162.2275390625,
"completions/mean_terminated_length": 128.01040344238282,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.032,
"grad_norm": 0.022988498210906982,
"learning_rate": 6.249999999999999e-07,
"loss": 0.038,
"num_tokens": 27022115.0,
"reward": 0.462451171875,
"reward_std": 0.3004496514797211,
"rewards/accuracy_reward": 0.19091796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.733984375,
"rewards/mean_confidence_reward": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00947265625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1380.6,
"completions/mean_length": 117.775,
"completions/mean_terminated_length": 104.22791290283203,
"completions/min_length": 3.6,
"completions/min_terminated_length": 3.6,
"epoch": 0.048,
"grad_norm": 0.025120964273810387,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0318,
"num_tokens": 40133187.0,
"reward": 0.583056640625,
"reward_std": 0.2110624998807907,
"rewards/accuracy_reward": 0.24814453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.91796875,
"rewards/mean_confidence_reward": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0048828125,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 1017.2,
"completions/mean_length": 87.47548828125,
"completions/mean_terminated_length": 80.37409057617188,
"completions/min_length": 8.8,
"completions/min_terminated_length": 8.8,
"epoch": 0.064,
"grad_norm": 0.00622530234977603,
"learning_rate": 1e-06,
"loss": 0.0215,
"num_tokens": 52803656.0,
"reward": 0.6568359375,
"reward_std": 0.15337491929531097,
"rewards/accuracy_reward": 0.3328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.980859375,
"rewards/mean_confidence_reward": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0005859375,
"completions/max_length": 1171.8,
"completions/max_terminated_length": 593.8,
"completions/mean_length": 73.31455078125,
"completions/mean_terminated_length": 72.45734252929688,
"completions/min_length": 13.6,
"completions/min_terminated_length": 13.6,
"epoch": 0.08,
"grad_norm": 0.01711602509021759,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 65343869.0,
"reward": 0.697216796875,
"reward_std": 0.1176684021949768,
"rewards/accuracy_reward": 0.39697265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9974609375,
"rewards/mean_confidence_reward": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00029296875,
"completions/max_length": 1257.4,
"completions/max_terminated_length": 598.0,
"completions/mean_length": 72.3849609375,
"completions/mean_terminated_length": 71.9559326171875,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.096,
"grad_norm": 0.0022409269586205482,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 77986019.0,
"reward": 0.699267578125,
"reward_std": 0.10926563590765,
"rewards/accuracy_reward": 0.4005859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99794921875,
"rewards/mean_confidence_reward": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1141.6,
"completions/max_terminated_length": 687.8,
"completions/mean_length": 75.8130859375,
"completions/mean_terminated_length": 75.24237976074218,
"completions/min_length": 19.8,
"completions/min_terminated_length": 19.8,
"epoch": 0.112,
"grad_norm": 0.002861637622117996,
"learning_rate": 1e-06,
"loss": 0.0027,
"num_tokens": 90728137.0,
"reward": 0.71552734375,
"reward_std": 0.1081365168094635,
"rewards/accuracy_reward": 0.43212890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99892578125,
"rewards/mean_confidence_reward": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1178.2,
"completions/max_terminated_length": 563.4,
"completions/mean_length": 79.030859375,
"completions/mean_terminated_length": 78.31913757324219,
"completions/min_length": 20.8,
"completions/min_terminated_length": 20.8,
"epoch": 0.128,
"grad_norm": 0.0015531065873801708,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 103310405.0,
"reward": 0.713525390625,
"reward_std": 0.09464964717626571,
"rewards/accuracy_reward": 0.4279296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99912109375,
"rewards/mean_confidence_reward": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1106.2,
"completions/max_terminated_length": 417.8,
"completions/mean_length": 77.76083984375,
"completions/mean_terminated_length": 77.0487274169922,
"completions/min_length": 17.6,
"completions/min_terminated_length": 17.6,
"epoch": 0.144,
"grad_norm": 0.0022233380004763603,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 115913428.0,
"reward": 0.76220703125,
"reward_std": 0.09807199090719224,
"rewards/accuracy_reward": 0.52548828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99892578125,
"rewards/mean_confidence_reward": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 853.4,
"completions/max_terminated_length": 390.8,
"completions/mean_length": 79.07109375,
"completions/mean_terminated_length": 78.78670043945313,
"completions/min_length": 23.6,
"completions/min_terminated_length": 23.6,
"epoch": 0.16,
"grad_norm": 0.0017749707913026214,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 128600364.0,
"reward": 0.7380859375,
"reward_std": 0.09451625794172287,
"rewards/accuracy_reward": 0.4765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 50
},
{
"epoch": 0.16,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 232.5,
"eval_completions/max_terminated_length": 232.5,
"eval_completions/mean_length": 82.95326042175293,
"eval_completions/mean_terminated_length": 82.95326042175293,
"eval_completions/min_length": 28.25,
"eval_completions/min_terminated_length": 28.25,
"eval_loss": 0.0,
"eval_num_tokens": 128600364.0,
"eval_reward": 0.69140625,
"eval_reward_std": 0.24272222816944122,
"eval_rewards/accuracy_reward": 0.3828125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 13.311,
"eval_samples_per_second": 37.563,
"eval_steps_per_second": 0.301,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00029296875,
"completions/max_length": 834.6,
"completions/max_terminated_length": 366.6,
"completions/mean_length": 83.1912109375,
"completions/mean_terminated_length": 82.76549530029297,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.176,
"grad_norm": 0.0019512384897097945,
"learning_rate": 1e-06,
"loss": 0.0016,
"num_tokens": 141545682.0,
"reward": 0.733056640625,
"reward_std": 0.095227712392807,
"rewards/accuracy_reward": 0.46650390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1311.4,
"completions/max_terminated_length": 485.4,
"completions/mean_length": 86.99189453125,
"completions/mean_terminated_length": 86.28425750732421,
"completions/min_length": 22.6,
"completions/min_terminated_length": 22.6,
"epoch": 0.192,
"grad_norm": 0.0016324262833222747,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 154107615.0,
"reward": 0.74697265625,
"reward_std": 0.08921304196119309,
"rewards/accuracy_reward": 0.4947265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1086.2,
"completions/max_terminated_length": 592.6,
"completions/mean_length": 90.79443359375,
"completions/mean_terminated_length": 90.22920989990234,
"completions/min_length": 28.2,
"completions/min_terminated_length": 28.2,
"epoch": 0.208,
"grad_norm": 0.0016499038320034742,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 166925894.0,
"reward": 0.77353515625,
"reward_std": 0.08650225400924683,
"rewards/accuracy_reward": 0.54765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1338.6,
"completions/max_terminated_length": 731.0,
"completions/mean_length": 96.20087890625,
"completions/mean_terminated_length": 94.7965301513672,
"completions/min_length": 28.6,
"completions/min_terminated_length": 28.6,
"epoch": 0.224,
"grad_norm": 0.001517058233730495,
"learning_rate": 1e-06,
"loss": 0.002,
"num_tokens": 179920495.0,
"reward": 0.74716796875,
"reward_std": 0.08538677096366883,
"rewards/accuracy_reward": 0.49560546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99873046875,
"rewards/mean_confidence_reward": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00107421875,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 574.4,
"completions/mean_length": 97.76240234375,
"completions/mean_terminated_length": 96.21627960205078,
"completions/min_length": 25.4,
"completions/min_terminated_length": 25.4,
"epoch": 0.24,
"grad_norm": 0.0017738911556079984,
"learning_rate": 1e-06,
"loss": 0.0028,
"num_tokens": 193029582.0,
"reward": 0.77578125,
"reward_std": 0.09508010596036912,
"rewards/accuracy_reward": 0.55283203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99873046875,
"rewards/mean_confidence_reward": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1319.4,
"completions/max_terminated_length": 417.2,
"completions/mean_length": 96.47294921875,
"completions/mean_terminated_length": 95.06647644042968,
"completions/min_length": 30.6,
"completions/min_terminated_length": 30.6,
"epoch": 0.256,
"grad_norm": 0.0015991576947271824,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 205928601.0,
"reward": 0.75869140625,
"reward_std": 0.08824991285800934,
"rewards/accuracy_reward": 0.518359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00087890625,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 99.259765625,
"completions/mean_terminated_length": 97.99633331298828,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.272,
"grad_norm": 0.0014954438665881753,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 218767037.0,
"reward": 0.7568359375,
"reward_std": 0.08405127227306367,
"rewards/accuracy_reward": 0.51484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998828125,
"rewards/mean_confidence_reward": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00068359375,
"completions/max_length": 1149.6,
"completions/max_terminated_length": 682.4,
"completions/mean_length": 94.7064453125,
"completions/mean_terminated_length": 93.72140197753906,
"completions/min_length": 32.2,
"completions/min_terminated_length": 32.2,
"epoch": 0.288,
"grad_norm": 0.0017134748632088304,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 231551327.0,
"reward": 0.7654296875,
"reward_std": 0.08652912825345993,
"rewards/accuracy_reward": 0.53154296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99931640625,
"rewards/mean_confidence_reward": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1091.0,
"completions/max_terminated_length": 413.2,
"completions/mean_length": 94.5794921875,
"completions/mean_terminated_length": 93.45206604003906,
"completions/min_length": 34.4,
"completions/min_terminated_length": 34.4,
"epoch": 0.304,
"grad_norm": 0.002114097587764263,
"learning_rate": 1e-06,
"loss": 0.0031,
"num_tokens": 244306093.0,
"reward": 0.762939453125,
"reward_std": 0.08613481372594833,
"rewards/accuracy_reward": 0.52705078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998828125,
"rewards/mean_confidence_reward": 0.0,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0005859375,
"completions/max_length": 1399.2,
"completions/max_terminated_length": 458.8,
"completions/mean_length": 91.70283203125,
"completions/mean_terminated_length": 90.85653686523438,
"completions/min_length": 33.8,
"completions/min_terminated_length": 33.8,
"epoch": 0.32,
"grad_norm": 0.00154271034989506,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 257190154.0,
"reward": 0.77060546875,
"reward_std": 0.06635084152221679,
"rewards/accuracy_reward": 0.5419921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 100
},
{
"epoch": 0.32,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 232.5,
"eval_completions/max_terminated_length": 232.5,
"eval_completions/mean_length": 93.95743560791016,
"eval_completions/mean_terminated_length": 93.95743560791016,
"eval_completions/min_length": 41.75,
"eval_completions/min_terminated_length": 41.75,
"eval_loss": 0.0,
"eval_num_tokens": 257190154.0,
"eval_reward": 0.712890625,
"eval_reward_std": 0.2481144778430462,
"eval_rewards/accuracy_reward": 0.42578125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 14.235,
"eval_samples_per_second": 35.125,
"eval_steps_per_second": 0.281,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 624.2,
"completions/mean_length": 94.64306640625,
"completions/mean_terminated_length": 92.5290313720703,
"completions/min_length": 30.4,
"completions/min_terminated_length": 30.4,
"epoch": 0.336,
"grad_norm": 0.001522217644378543,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 269738051.0,
"reward": 0.77568359375,
"reward_std": 0.07683707624673844,
"rewards/accuracy_reward": 0.5529296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9984375,
"rewards/mean_confidence_reward": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001171875,
"completions/max_length": 1307.4,
"completions/max_terminated_length": 435.8,
"completions/mean_length": 94.1294921875,
"completions/mean_terminated_length": 92.43776245117188,
"completions/min_length": 21.8,
"completions/min_terminated_length": 21.8,
"epoch": 0.352,
"grad_norm": 0.001526491018012166,
"learning_rate": 1e-06,
"loss": 0.0039,
"num_tokens": 282818673.0,
"reward": 0.745751953125,
"reward_std": 0.08200332224369049,
"rewards/accuracy_reward": 0.49306640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9984375,
"rewards/mean_confidence_reward": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0013671875,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 588.4,
"completions/mean_length": 92.6953125,
"completions/mean_terminated_length": 90.7186294555664,
"completions/min_length": 30.6,
"completions/min_terminated_length": 30.6,
"epoch": 0.368,
"grad_norm": 0.0013903952203691006,
"learning_rate": 1e-06,
"loss": 0.0039,
"num_tokens": 295689665.0,
"reward": 0.754443359375,
"reward_std": 0.07026491463184356,
"rewards/accuracy_reward": 0.5103515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1296.6,
"completions/max_terminated_length": 561.2,
"completions/mean_length": 91.94736328125,
"completions/mean_terminated_length": 90.53576354980468,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.384,
"grad_norm": 0.0018187090754508972,
"learning_rate": 1e-06,
"loss": 0.0033,
"num_tokens": 308344038.0,
"reward": 0.7708984375,
"reward_std": 0.07117158472537995,
"rewards/accuracy_reward": 0.54296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998828125,
"rewards/mean_confidence_reward": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001171875,
"completions/max_length": 1305.0,
"completions/max_terminated_length": 520.2,
"completions/mean_length": 89.5787109375,
"completions/mean_terminated_length": 87.88221435546875,
"completions/min_length": 29.4,
"completions/min_terminated_length": 29.4,
"epoch": 0.4,
"grad_norm": 0.0016340231522917747,
"learning_rate": 1e-06,
"loss": 0.0038,
"num_tokens": 321154092.0,
"reward": 0.76103515625,
"reward_std": 0.07767283618450165,
"rewards/accuracy_reward": 0.52333984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99873046875,
"rewards/mean_confidence_reward": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00068359375,
"completions/max_length": 1244.4,
"completions/max_terminated_length": 590.0,
"completions/mean_length": 89.6955078125,
"completions/mean_terminated_length": 88.70513153076172,
"completions/min_length": 31.8,
"completions/min_terminated_length": 31.8,
"epoch": 0.416,
"grad_norm": 0.0015821981942281127,
"learning_rate": 1e-06,
"loss": 0.0019,
"num_tokens": 333810078.0,
"reward": 0.764208984375,
"reward_std": 0.06756853386759758,
"rewards/accuracy_reward": 0.5291015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99931640625,
"rewards/mean_confidence_reward": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1174.4,
"completions/max_terminated_length": 537.0,
"completions/mean_length": 90.36181640625,
"completions/mean_terminated_length": 89.65568542480469,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.432,
"grad_norm": 0.0014525202568620443,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 346606039.0,
"reward": 0.7806640625,
"reward_std": 0.06507465690374374,
"rewards/accuracy_reward": 0.56181640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00068359375,
"completions/max_length": 1301.0,
"completions/max_terminated_length": 503.6,
"completions/mean_length": 94.90302734375,
"completions/mean_terminated_length": 93.91733856201172,
"completions/min_length": 42.2,
"completions/min_terminated_length": 42.2,
"epoch": 0.448,
"grad_norm": 0.0015080425655469298,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 359386966.0,
"reward": 0.76650390625,
"reward_std": 0.069513601064682,
"rewards/accuracy_reward": 0.53369140625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99931640625,
"rewards/mean_confidence_reward": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00185546875,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 541.6,
"completions/mean_length": 99.85595703125,
"completions/mean_terminated_length": 97.19031372070313,
"completions/min_length": 40.8,
"completions/min_terminated_length": 40.8,
"epoch": 0.464,
"grad_norm": 0.0010223939316347241,
"learning_rate": 1e-06,
"loss": 0.0047,
"num_tokens": 372436627.0,
"reward": 0.7345703125,
"reward_std": 0.05768234580755234,
"rewards/accuracy_reward": 0.47109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00068359375,
"completions/max_length": 1303.2,
"completions/max_terminated_length": 405.6,
"completions/mean_length": 98.14833984375,
"completions/mean_terminated_length": 97.16446533203126,
"completions/min_length": 43.8,
"completions/min_terminated_length": 43.8,
"epoch": 0.48,
"grad_norm": 0.0016905076336115599,
"learning_rate": 1e-06,
"loss": 0.0023,
"num_tokens": 385346018.0,
"reward": 0.769873046875,
"reward_std": 0.07736360728740692,
"rewards/accuracy_reward": 0.5408203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99892578125,
"rewards/mean_confidence_reward": 0.0,
"step": 150
},
{
"epoch": 0.48,
"eval_completions/clipped_ratio": 0.001953125,
"eval_completions/max_length": 582.5,
"eval_completions/max_terminated_length": 264.0,
"eval_completions/mean_length": 101.61126136779785,
"eval_completions/mean_terminated_length": 98.80445098876953,
"eval_completions/min_length": 51.25,
"eval_completions/min_terminated_length": 51.25,
"eval_loss": 0.0,
"eval_num_tokens": 385346018.0,
"eval_reward": 0.7265625,
"eval_reward_std": 0.2514254078269005,
"eval_rewards/accuracy_reward": 0.455078125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.998046875,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 24.3286,
"eval_samples_per_second": 20.552,
"eval_steps_per_second": 0.164,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1536.0,
"completions/max_terminated_length": 386.0,
"completions/mean_length": 97.898046875,
"completions/mean_terminated_length": 96.7735092163086,
"completions/min_length": 40.2,
"completions/min_terminated_length": 40.2,
"epoch": 0.496,
"grad_norm": 0.001528796274214983,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 398512654.0,
"reward": 0.7744140625,
"reward_std": 0.0720748171210289,
"rewards/accuracy_reward": 0.549609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1317.0,
"completions/max_terminated_length": 399.8,
"completions/mean_length": 95.37158203125,
"completions/mean_terminated_length": 94.66787719726562,
"completions/min_length": 40.8,
"completions/min_terminated_length": 40.8,
"epoch": 0.512,
"grad_norm": 0.00132983538787812,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 411491243.0,
"reward": 0.779931640625,
"reward_std": 0.06670133695006371,
"rewards/accuracy_reward": 0.56044921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1341.4,
"completions/max_terminated_length": 662.8,
"completions/mean_length": 96.72470703125,
"completions/mean_terminated_length": 95.31717834472656,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.528,
"grad_norm": 0.001326797646470368,
"learning_rate": 1e-06,
"loss": 0.0037,
"num_tokens": 424367560.0,
"reward": 0.77578125,
"reward_std": 0.06582950651645661,
"rewards/accuracy_reward": 0.5525390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 699.4,
"completions/max_terminated_length": 456.6,
"completions/mean_length": 93.36513671875,
"completions/mean_terminated_length": 93.0830810546875,
"completions/min_length": 34.2,
"completions/min_terminated_length": 34.2,
"epoch": 0.544,
"grad_norm": 0.0017570438794791698,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 437343523.0,
"reward": 0.796533203125,
"reward_std": 0.07356481105089188,
"rewards/accuracy_reward": 0.593359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 444.8,
"completions/max_terminated_length": 444.8,
"completions/mean_length": 91.45390625,
"completions/mean_terminated_length": 91.45390625,
"completions/min_length": 41.8,
"completions/min_terminated_length": 41.8,
"epoch": 0.56,
"grad_norm": 0.0014896654756739736,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 449957739.0,
"reward": 0.76611328125,
"reward_std": 0.060523012280464174,
"rewards/accuracy_reward": 0.53232421875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1289.4,
"completions/max_terminated_length": 382.2,
"completions/mean_length": 90.7419921875,
"completions/mean_terminated_length": 90.17736206054687,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.576,
"grad_norm": 0.0016016842564567924,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 462929881.0,
"reward": 0.764111328125,
"reward_std": 0.05608753189444542,
"rewards/accuracy_reward": 0.52861328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 603.8,
"completions/max_terminated_length": 356.0,
"completions/mean_length": 91.03701171875,
"completions/mean_terminated_length": 90.8958251953125,
"completions/min_length": 42.6,
"completions/min_terminated_length": 42.6,
"epoch": 0.592,
"grad_norm": 0.0013300231657922268,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 475886132.0,
"reward": 0.763671875,
"reward_std": 0.06121245920658112,
"rewards/accuracy_reward": 0.52744140625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00029296875,
"completions/max_length": 831.0,
"completions/max_terminated_length": 363.8,
"completions/mean_length": 94.77607421875,
"completions/mean_terminated_length": 94.35356140136719,
"completions/min_length": 42.6,
"completions/min_terminated_length": 42.6,
"epoch": 0.608,
"grad_norm": 0.001449022558517754,
"learning_rate": 1e-06,
"loss": 0.0015,
"num_tokens": 488712447.0,
"reward": 0.776220703125,
"reward_std": 0.055175574868917464,
"rewards/accuracy_reward": 0.55283203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 719.8,
"completions/max_terminated_length": 511.4,
"completions/mean_length": 95.49326171875,
"completions/mean_terminated_length": 95.3530044555664,
"completions/min_length": 42.2,
"completions/min_terminated_length": 42.2,
"epoch": 0.624,
"grad_norm": 0.0016013348940759897,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 501890522.0,
"reward": 0.770703125,
"reward_std": 0.06375713348388672,
"rewards/accuracy_reward": 0.54150390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001171875,
"completions/max_length": 1310.4,
"completions/max_terminated_length": 450.2,
"completions/mean_length": 97.82939453125,
"completions/mean_terminated_length": 96.14148406982422,
"completions/min_length": 43.8,
"completions/min_terminated_length": 43.8,
"epoch": 0.64,
"grad_norm": 0.00144854630343616,
"learning_rate": 1e-06,
"loss": 0.0031,
"num_tokens": 515091303.0,
"reward": 0.79462890625,
"reward_std": 0.05500866025686264,
"rewards/accuracy_reward": 0.5904296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998828125,
"rewards/mean_confidence_reward": 0.0,
"step": 200
},
{
"epoch": 0.64,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 226.5,
"eval_completions/max_terminated_length": 226.5,
"eval_completions/mean_length": 97.45433807373047,
"eval_completions/mean_terminated_length": 97.45433807373047,
"eval_completions/min_length": 48.5,
"eval_completions/min_terminated_length": 48.5,
"eval_loss": 0.0,
"eval_num_tokens": 515091303.0,
"eval_reward": 0.7216796875,
"eval_reward_std": 0.24690637737512589,
"eval_rewards/accuracy_reward": 0.443359375,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 14.5533,
"eval_samples_per_second": 34.357,
"eval_steps_per_second": 0.275,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 809.0,
"completions/max_terminated_length": 348.2,
"completions/mean_length": 95.0333984375,
"completions/mean_terminated_length": 94.75228271484374,
"completions/min_length": 42.4,
"completions/min_terminated_length": 42.4,
"epoch": 0.656,
"grad_norm": 0.0017163316952064633,
"learning_rate": 1e-06,
"loss": 0.0013,
"num_tokens": 527777309.0,
"reward": 0.75673828125,
"reward_std": 0.060856021195650103,
"rewards/accuracy_reward": 0.513671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 828.2,
"completions/max_terminated_length": 388.2,
"completions/mean_length": 95.4216796875,
"completions/mean_terminated_length": 94.85912628173828,
"completions/min_length": 41.4,
"completions/min_terminated_length": 41.4,
"epoch": 0.672,
"grad_norm": 0.0013491360004991293,
"learning_rate": 1e-06,
"loss": 0.0013,
"num_tokens": 540524187.0,
"reward": 0.768115234375,
"reward_std": 0.05722193792462349,
"rewards/accuracy_reward": 0.53662109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1050.6,
"completions/max_terminated_length": 373.2,
"completions/mean_length": 95.3826171875,
"completions/mean_terminated_length": 94.82014465332031,
"completions/min_length": 43.8,
"completions/min_terminated_length": 43.8,
"epoch": 0.688,
"grad_norm": 0.0017077566590160131,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 553311145.0,
"reward": 0.779443359375,
"reward_std": 0.06311368122696877,
"rewards/accuracy_reward": 0.55927734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 810.0,
"completions/max_terminated_length": 377.8,
"completions/mean_length": 90.67568359375,
"completions/mean_terminated_length": 90.11159210205078,
"completions/min_length": 42.6,
"completions/min_terminated_length": 42.6,
"epoch": 0.704,
"grad_norm": 0.0018953669350594282,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 565962128.0,
"reward": 0.78388671875,
"reward_std": 0.05257489308714867,
"rewards/accuracy_reward": 0.5681640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0005859375,
"completions/max_length": 1065.6,
"completions/max_terminated_length": 403.6,
"completions/mean_length": 91.39638671875,
"completions/mean_terminated_length": 90.54814147949219,
"completions/min_length": 43.6,
"completions/min_terminated_length": 43.6,
"epoch": 0.72,
"grad_norm": 0.0019243984716013074,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 578764203.0,
"reward": 0.790576171875,
"reward_std": 0.05948638021945953,
"rewards/accuracy_reward": 0.58173828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00078125,
"completions/max_length": 1299.8,
"completions/max_terminated_length": 342.0,
"completions/mean_length": 91.9408203125,
"completions/mean_terminated_length": 90.81210021972656,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.736,
"grad_norm": 0.0013582052197307348,
"learning_rate": 1e-06,
"loss": 0.002,
"num_tokens": 591501581.0,
"reward": 0.79150390625,
"reward_std": 0.05623424053192139,
"rewards/accuracy_reward": 0.5837890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99921875,
"rewards/mean_confidence_reward": 0.0,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 565.0,
"completions/max_terminated_length": 327.2,
"completions/mean_length": 91.51416015625,
"completions/mean_terminated_length": 91.23197326660156,
"completions/min_length": 43.8,
"completions/min_terminated_length": 43.8,
"epoch": 0.752,
"grad_norm": 0.0015649450942873955,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 604522206.0,
"reward": 0.7857421875,
"reward_std": 0.05413587838411331,
"rewards/accuracy_reward": 0.5716796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 646.2,
"completions/max_terminated_length": 607.4,
"completions/mean_length": 94.9529296875,
"completions/mean_terminated_length": 94.39071350097656,
"completions/min_length": 42.4,
"completions/min_terminated_length": 42.4,
"epoch": 0.768,
"grad_norm": 0.0017648260109126568,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 617283548.0,
"reward": 0.761279296875,
"reward_std": 0.057571640610694884,
"rewards/accuracy_reward": 0.52294921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.6,
"completions/max_terminated_length": 413.6,
"completions/mean_length": 93.616015625,
"completions/mean_terminated_length": 93.616015625,
"completions/min_length": 43.8,
"completions/min_terminated_length": 43.8,
"epoch": 0.784,
"grad_norm": 0.0011762650683522224,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 630272864.0,
"reward": 0.7892578125,
"reward_std": 0.05626345500349998,
"rewards/accuracy_reward": 0.578515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 326.8,
"completions/max_terminated_length": 326.8,
"completions/mean_length": 92.14990234375,
"completions/mean_terminated_length": 92.14990234375,
"completions/min_length": 44.6,
"completions/min_terminated_length": 44.6,
"epoch": 0.8,
"grad_norm": 0.0013073732843622565,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 643083359.0,
"reward": 0.804052734375,
"reward_std": 0.05071377567946911,
"rewards/accuracy_reward": 0.60810546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 250
},
{
"epoch": 0.8,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 236.75,
"eval_completions/max_terminated_length": 236.75,
"eval_completions/mean_length": 91.57341003417969,
"eval_completions/mean_terminated_length": 91.57341003417969,
"eval_completions/min_length": 49.5,
"eval_completions/min_terminated_length": 49.5,
"eval_loss": 0.0,
"eval_num_tokens": 643083359.0,
"eval_reward": 0.7265625,
"eval_reward_std": 0.24742106348276138,
"eval_rewards/accuracy_reward": 0.453125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 14.2768,
"eval_samples_per_second": 35.022,
"eval_steps_per_second": 0.28,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 588.8,
"completions/max_terminated_length": 456.2,
"completions/mean_length": 89.20595703125,
"completions/mean_terminated_length": 88.92289428710937,
"completions/min_length": 43.2,
"completions/min_terminated_length": 43.2,
"epoch": 0.816,
"grad_norm": 0.0019169868901371956,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 655952316.0,
"reward": 0.805029296875,
"reward_std": 0.054716046899557114,
"rewards/accuracy_reward": 0.61025390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 462.6,
"completions/max_terminated_length": 462.6,
"completions/mean_length": 91.78896484375,
"completions/mean_terminated_length": 91.78896484375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.832,
"grad_norm": 0.0014281836338341236,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 668756907.0,
"reward": 0.790087890625,
"reward_std": 0.0539084292948246,
"rewards/accuracy_reward": 0.58017578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 663.6,
"completions/max_terminated_length": 417.8,
"completions/mean_length": 91.91787109375,
"completions/mean_terminated_length": 91.77665557861329,
"completions/min_length": 45.4,
"completions/min_terminated_length": 45.4,
"epoch": 0.848,
"grad_norm": 0.001417965511791408,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 681568834.0,
"reward": 0.77236328125,
"reward_std": 0.0536438025534153,
"rewards/accuracy_reward": 0.54482421875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.000390625,
"completions/max_length": 1059.4,
"completions/max_terminated_length": 416.2,
"completions/mean_length": 94.8748046875,
"completions/mean_terminated_length": 94.31214294433593,
"completions/min_length": 42.8,
"completions/min_terminated_length": 42.8,
"epoch": 0.864,
"grad_norm": 0.0025267351884394884,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 694383488.0,
"reward": 0.809716796875,
"reward_std": 0.05303701683878899,
"rewards/accuracy_reward": 0.61982421875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999609375,
"rewards/mean_confidence_reward": 0.0,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 592.0,
"completions/max_terminated_length": 383.0,
"completions/mean_length": 94.92939453125,
"completions/mean_terminated_length": 94.78859100341796,
"completions/min_length": 45.4,
"completions/min_terminated_length": 45.4,
"epoch": 0.88,
"grad_norm": 0.001522132777608931,
"learning_rate": 1e-06,
"loss": 0.0011,
"num_tokens": 707358957.0,
"reward": 0.760546875,
"reward_std": 0.055512601137161256,
"rewards/accuracy_reward": 0.52119140625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 572.6,
"completions/max_terminated_length": 337.0,
"completions/mean_length": 97.50546875,
"completions/mean_terminated_length": 97.36535034179687,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.896,
"grad_norm": 0.0011945873266085982,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 720324581.0,
"reward": 0.784912109375,
"reward_std": 0.04334753602743149,
"rewards/accuracy_reward": 0.569921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 960.6,
"completions/max_terminated_length": 469.8,
"completions/mean_length": 97.699609375,
"completions/mean_terminated_length": 97.41859893798828,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.912,
"grad_norm": 0.0012196388561278582,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 733232641.0,
"reward": 0.781201171875,
"reward_std": 0.0503702849149704,
"rewards/accuracy_reward": 0.56259765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 527.2,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 94.194921875,
"completions/mean_terminated_length": 94.05436248779297,
"completions/min_length": 39.6,
"completions/min_terminated_length": 39.6,
"epoch": 0.928,
"grad_norm": 0.0010627944720909,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 746080333.0,
"reward": 0.77783203125,
"reward_std": 0.0477489285171032,
"rewards/accuracy_reward": 0.55576171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0001953125,
"completions/max_length": 830.8,
"completions/max_terminated_length": 384.8,
"completions/mean_length": 96.19130859375,
"completions/mean_terminated_length": 95.91045227050782,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.944,
"grad_norm": 0.0012808856554329395,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 758897076.0,
"reward": 0.774853515625,
"reward_std": 0.06085398942232132,
"rewards/accuracy_reward": 0.54990234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 9.765625e-05,
"completions/max_length": 618.6,
"completions/max_terminated_length": 385.6,
"completions/mean_length": 95.019921875,
"completions/mean_terminated_length": 94.87908020019532,
"completions/min_length": 41.6,
"completions/min_terminated_length": 41.6,
"epoch": 0.96,
"grad_norm": 0.001602579141035676,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 771666720.0,
"reward": 0.77060546875,
"reward_std": 0.047741709649562834,
"rewards/accuracy_reward": 0.54130859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 300
},
{
"epoch": 0.96,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 212.5,
"eval_completions/max_terminated_length": 212.5,
"eval_completions/mean_length": 94.41190719604492,
"eval_completions/mean_terminated_length": 94.41190719604492,
"eval_completions/min_length": 51.0,
"eval_completions/min_terminated_length": 51.0,
"eval_loss": 0.0,
"eval_num_tokens": 771666720.0,
"eval_reward": 0.7158203125,
"eval_reward_std": 0.24653732031583786,
"eval_rewards/accuracy_reward": 0.431640625,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 13.6738,
"eval_samples_per_second": 36.566,
"eval_steps_per_second": 0.293,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0005859375,
"completions/max_length": 1112.6,
"completions/max_terminated_length": 416.8,
"completions/mean_length": 95.6703125,
"completions/mean_terminated_length": 94.82695617675782,
"completions/min_length": 39.8,
"completions/min_terminated_length": 39.8,
"epoch": 0.976,
"grad_norm": 0.0012756388168781996,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 784363824.0,
"reward": 0.789599609375,
"reward_std": 0.05751314386725426,
"rewards/accuracy_reward": 0.57978515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00029296875,
"completions/max_length": 813.6,
"completions/max_terminated_length": 335.2,
"completions/mean_length": 93.523046875,
"completions/mean_terminated_length": 93.10098266601562,
"completions/min_length": 41.8,
"completions/min_terminated_length": 41.8,
"epoch": 0.992,
"grad_norm": 0.0012513543479144573,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 797306300.0,
"reward": 0.778857421875,
"reward_std": 0.04584160037338734,
"rewards/accuracy_reward": 0.5580078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99970703125,
"rewards/mean_confidence_reward": 0.0,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 341.0,
"completions/max_terminated_length": 341.0,
"completions/mean_length": 92.21929550170898,
"completions/mean_terminated_length": 92.21929550170898,
"completions/min_length": 40.5,
"completions/min_terminated_length": 40.5,
"epoch": 0.9984,
"num_tokens": 802441496.0,
"reward": 0.784912109375,
"reward_std": 0.055518221110105515,
"rewards/accuracy_reward": 0.570556640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.999267578125,
"rewards/mean_confidence_reward": 0.0,
"step": 312,
"total_flos": 0.0,
"train_loss": 0.0036987085283889873,
"train_runtime": 71191.4821,
"train_samples_per_second": 0.281,
"train_steps_per_second": 0.004
}
],
"logging_steps": 5,
"max_steps": 312,
"num_input_tokens_seen": 802441496,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}