{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 50, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1430.2, "completions/mean_length": 166.39462890625, "completions/mean_terminated_length": 126.74219970703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.016, "grad_norm": 0.0278764758259058, "learning_rate": 3.1249999999999997e-07, "loss": 0.0308, "num_tokens": 13404233.0, "reward": 0.435546875, "reward_std": 0.3221697866916656, "rewards/accuracy_reward": 0.2009765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.6701171875, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02431640625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1472.4, "completions/mean_length": 162.2275390625, "completions/mean_terminated_length": 128.01040344238282, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.032, "grad_norm": 0.022988498210906982, "learning_rate": 6.249999999999999e-07, "loss": 0.038, "num_tokens": 27022115.0, "reward": 0.462451171875, "reward_std": 0.3004496514797211, "rewards/accuracy_reward": 0.19091796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.733984375, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00947265625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1380.6, "completions/mean_length": 117.775, "completions/mean_terminated_length": 104.22791290283203, "completions/min_length": 3.6, "completions/min_terminated_length": 3.6, "epoch": 0.048, "grad_norm": 0.025120964273810387, "learning_rate": 9.374999999999999e-07, "loss": 0.0318, "num_tokens": 40133187.0, "reward": 0.583056640625, "reward_std": 0.2110624998807907, "rewards/accuracy_reward": 0.24814453125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.91796875, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0048828125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1017.2, "completions/mean_length": 87.47548828125, "completions/mean_terminated_length": 80.37409057617188, "completions/min_length": 8.8, "completions/min_terminated_length": 8.8, "epoch": 0.064, "grad_norm": 0.00622530234977603, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 52803656.0, "reward": 0.6568359375, "reward_std": 0.15337491929531097, "rewards/accuracy_reward": 0.3328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.980859375, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0005859375, "completions/max_length": 1171.8, "completions/max_terminated_length": 593.8, "completions/mean_length": 73.31455078125, "completions/mean_terminated_length": 72.45734252929688, "completions/min_length": 13.6, "completions/min_terminated_length": 13.6, "epoch": 0.08, "grad_norm": 0.01711602509021759, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 65343869.0, "reward": 0.697216796875, "reward_std": 0.1176684021949768, "rewards/accuracy_reward": 0.39697265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9974609375, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00029296875, "completions/max_length": 1257.4, "completions/max_terminated_length": 598.0, "completions/mean_length": 72.3849609375, "completions/mean_terminated_length": 71.9559326171875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.096, "grad_norm": 0.0022409269586205482, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 77986019.0, "reward": 0.699267578125, "reward_std": 0.10926563590765, "rewards/accuracy_reward": 0.4005859375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99794921875, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1141.6, "completions/max_terminated_length": 687.8, "completions/mean_length": 75.8130859375, "completions/mean_terminated_length": 75.24237976074218, "completions/min_length": 19.8, "completions/min_terminated_length": 19.8, "epoch": 0.112, "grad_norm": 0.002861637622117996, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 90728137.0, "reward": 0.71552734375, "reward_std": 0.1081365168094635, "rewards/accuracy_reward": 0.43212890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99892578125, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1178.2, "completions/max_terminated_length": 563.4, "completions/mean_length": 79.030859375, "completions/mean_terminated_length": 78.31913757324219, "completions/min_length": 20.8, "completions/min_terminated_length": 20.8, "epoch": 0.128, "grad_norm": 0.0015531065873801708, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 103310405.0, "reward": 0.713525390625, "reward_std": 0.09464964717626571, "rewards/accuracy_reward": 0.4279296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99912109375, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1106.2, "completions/max_terminated_length": 417.8, "completions/mean_length": 77.76083984375, "completions/mean_terminated_length": 77.0487274169922, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.144, "grad_norm": 0.0022233380004763603, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 115913428.0, "reward": 0.76220703125, "reward_std": 0.09807199090719224, "rewards/accuracy_reward": 0.52548828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99892578125, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 853.4, "completions/max_terminated_length": 390.8, "completions/mean_length": 79.07109375, "completions/mean_terminated_length": 78.78670043945313, "completions/min_length": 23.6, "completions/min_terminated_length": 23.6, "epoch": 0.16, "grad_norm": 0.0017749707913026214, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 128600364.0, "reward": 0.7380859375, "reward_std": 0.09451625794172287, "rewards/accuracy_reward": 0.4765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "epoch": 0.16, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 232.5, "eval_completions/max_terminated_length": 232.5, "eval_completions/mean_length": 82.95326042175293, "eval_completions/mean_terminated_length": 82.95326042175293, "eval_completions/min_length": 28.25, "eval_completions/min_terminated_length": 28.25, "eval_loss": 0.0, "eval_num_tokens": 128600364.0, "eval_reward": 0.69140625, "eval_reward_std": 0.24272222816944122, "eval_rewards/accuracy_reward": 0.3828125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 13.311, "eval_samples_per_second": 37.563, "eval_steps_per_second": 0.301, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00029296875, "completions/max_length": 834.6, "completions/max_terminated_length": 366.6, "completions/mean_length": 83.1912109375, "completions/mean_terminated_length": 82.76549530029297, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.176, "grad_norm": 0.0019512384897097945, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 141545682.0, "reward": 0.733056640625, "reward_std": 0.095227712392807, "rewards/accuracy_reward": 0.46650390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1311.4, "completions/max_terminated_length": 485.4, "completions/mean_length": 86.99189453125, "completions/mean_terminated_length": 86.28425750732421, "completions/min_length": 22.6, "completions/min_terminated_length": 22.6, "epoch": 0.192, "grad_norm": 0.0016324262833222747, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 154107615.0, "reward": 0.74697265625, "reward_std": 0.08921304196119309, "rewards/accuracy_reward": 0.4947265625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1086.2, "completions/max_terminated_length": 592.6, "completions/mean_length": 90.79443359375, "completions/mean_terminated_length": 90.22920989990234, "completions/min_length": 28.2, "completions/min_terminated_length": 28.2, "epoch": 0.208, "grad_norm": 0.0016499038320034742, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 166925894.0, "reward": 0.77353515625, "reward_std": 0.08650225400924683, "rewards/accuracy_reward": 0.54765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9994140625, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1338.6, "completions/max_terminated_length": 731.0, "completions/mean_length": 96.20087890625, "completions/mean_terminated_length": 94.7965301513672, "completions/min_length": 28.6, "completions/min_terminated_length": 28.6, "epoch": 0.224, "grad_norm": 0.001517058233730495, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 179920495.0, "reward": 0.74716796875, "reward_std": 0.08538677096366883, "rewards/accuracy_reward": 0.49560546875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99873046875, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00107421875, "completions/max_length": 1536.0, "completions/max_terminated_length": 574.4, "completions/mean_length": 97.76240234375, "completions/mean_terminated_length": 96.21627960205078, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.24, "grad_norm": 0.0017738911556079984, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 193029582.0, "reward": 0.77578125, "reward_std": 0.09508010596036912, "rewards/accuracy_reward": 0.55283203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99873046875, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1319.4, "completions/max_terminated_length": 417.2, "completions/mean_length": 96.47294921875, "completions/mean_terminated_length": 95.06647644042968, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.256, "grad_norm": 0.0015991576947271824, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 205928601.0, "reward": 0.75869140625, "reward_std": 0.08824991285800934, "rewards/accuracy_reward": 0.518359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00087890625, "completions/max_length": 1536.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 99.259765625, "completions/mean_terminated_length": 97.99633331298828, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.272, "grad_norm": 0.0014954438665881753, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 218767037.0, "reward": 0.7568359375, "reward_std": 0.08405127227306367, "rewards/accuracy_reward": 0.51484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998828125, "rewards/mean_confidence_reward": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00068359375, "completions/max_length": 1149.6, "completions/max_terminated_length": 682.4, "completions/mean_length": 94.7064453125, "completions/mean_terminated_length": 93.72140197753906, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.288, "grad_norm": 0.0017134748632088304, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 231551327.0, "reward": 0.7654296875, "reward_std": 0.08652912825345993, "rewards/accuracy_reward": 0.53154296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99931640625, "rewards/mean_confidence_reward": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1091.0, "completions/max_terminated_length": 413.2, "completions/mean_length": 94.5794921875, "completions/mean_terminated_length": 93.45206604003906, "completions/min_length": 34.4, "completions/min_terminated_length": 34.4, "epoch": 0.304, "grad_norm": 0.002114097587764263, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 244306093.0, "reward": 0.762939453125, "reward_std": 0.08613481372594833, "rewards/accuracy_reward": 0.52705078125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998828125, "rewards/mean_confidence_reward": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0005859375, "completions/max_length": 1399.2, "completions/max_terminated_length": 458.8, "completions/mean_length": 91.70283203125, "completions/mean_terminated_length": 90.85653686523438, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.32, "grad_norm": 0.00154271034989506, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 257190154.0, "reward": 0.77060546875, "reward_std": 0.06635084152221679, "rewards/accuracy_reward": 0.5419921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 100 }, { "epoch": 0.32, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 232.5, "eval_completions/max_terminated_length": 232.5, "eval_completions/mean_length": 93.95743560791016, "eval_completions/mean_terminated_length": 93.95743560791016, "eval_completions/min_length": 41.75, "eval_completions/min_terminated_length": 41.75, "eval_loss": 0.0, "eval_num_tokens": 257190154.0, "eval_reward": 0.712890625, "eval_reward_std": 0.2481144778430462, "eval_rewards/accuracy_reward": 0.42578125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 14.235, "eval_samples_per_second": 35.125, "eval_steps_per_second": 0.281, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00146484375, "completions/max_length": 1536.0, "completions/max_terminated_length": 624.2, "completions/mean_length": 94.64306640625, "completions/mean_terminated_length": 92.5290313720703, "completions/min_length": 30.4, "completions/min_terminated_length": 30.4, "epoch": 0.336, "grad_norm": 0.001522217644378543, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 269738051.0, "reward": 0.77568359375, "reward_std": 0.07683707624673844, "rewards/accuracy_reward": 0.5529296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9984375, "rewards/mean_confidence_reward": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001171875, "completions/max_length": 1307.4, "completions/max_terminated_length": 435.8, "completions/mean_length": 94.1294921875, "completions/mean_terminated_length": 92.43776245117188, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.352, "grad_norm": 0.001526491018012166, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 282818673.0, "reward": 0.745751953125, "reward_std": 0.08200332224369049, "rewards/accuracy_reward": 0.49306640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9984375, "rewards/mean_confidence_reward": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013671875, "completions/max_length": 1536.0, "completions/max_terminated_length": 588.4, "completions/mean_length": 92.6953125, "completions/mean_terminated_length": 90.7186294555664, "completions/min_length": 30.6, "completions/min_terminated_length": 30.6, "epoch": 0.368, "grad_norm": 0.0013903952203691006, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 295689665.0, "reward": 0.754443359375, "reward_std": 0.07026491463184356, "rewards/accuracy_reward": 0.5103515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99853515625, "rewards/mean_confidence_reward": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1296.6, "completions/max_terminated_length": 561.2, "completions/mean_length": 91.94736328125, "completions/mean_terminated_length": 90.53576354980468, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.384, "grad_norm": 0.0018187090754508972, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 308344038.0, "reward": 0.7708984375, "reward_std": 0.07117158472537995, "rewards/accuracy_reward": 0.54296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998828125, "rewards/mean_confidence_reward": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001171875, "completions/max_length": 1305.0, "completions/max_terminated_length": 520.2, "completions/mean_length": 89.5787109375, "completions/mean_terminated_length": 87.88221435546875, "completions/min_length": 29.4, "completions/min_terminated_length": 29.4, "epoch": 0.4, "grad_norm": 0.0016340231522917747, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 321154092.0, "reward": 0.76103515625, "reward_std": 0.07767283618450165, "rewards/accuracy_reward": 0.52333984375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99873046875, "rewards/mean_confidence_reward": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00068359375, "completions/max_length": 1244.4, "completions/max_terminated_length": 590.0, "completions/mean_length": 89.6955078125, "completions/mean_terminated_length": 88.70513153076172, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.416, "grad_norm": 0.0015821981942281127, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 333810078.0, "reward": 0.764208984375, "reward_std": 0.06756853386759758, "rewards/accuracy_reward": 0.5291015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99931640625, "rewards/mean_confidence_reward": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1174.4, "completions/max_terminated_length": 537.0, "completions/mean_length": 90.36181640625, "completions/mean_terminated_length": 89.65568542480469, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.432, "grad_norm": 0.0014525202568620443, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 346606039.0, "reward": 0.7806640625, "reward_std": 0.06507465690374374, "rewards/accuracy_reward": 0.56181640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99951171875, "rewards/mean_confidence_reward": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00068359375, "completions/max_length": 1301.0, "completions/max_terminated_length": 503.6, "completions/mean_length": 94.90302734375, "completions/mean_terminated_length": 93.91733856201172, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "epoch": 0.448, "grad_norm": 0.0015080425655469298, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 359386966.0, "reward": 0.76650390625, "reward_std": 0.069513601064682, "rewards/accuracy_reward": 0.53369140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99931640625, "rewards/mean_confidence_reward": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00185546875, "completions/max_length": 1536.0, "completions/max_terminated_length": 541.6, "completions/mean_length": 99.85595703125, "completions/mean_terminated_length": 97.19031372070313, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.464, "grad_norm": 0.0010223939316347241, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 372436627.0, "reward": 0.7345703125, "reward_std": 0.05768234580755234, "rewards/accuracy_reward": 0.47109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00068359375, "completions/max_length": 1303.2, "completions/max_terminated_length": 405.6, "completions/mean_length": 98.14833984375, "completions/mean_terminated_length": 97.16446533203126, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "epoch": 0.48, "grad_norm": 0.0016905076336115599, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 385346018.0, "reward": 0.769873046875, "reward_std": 0.07736360728740692, "rewards/accuracy_reward": 0.5408203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99892578125, "rewards/mean_confidence_reward": 0.0, "step": 150 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 582.5, "eval_completions/max_terminated_length": 264.0, "eval_completions/mean_length": 101.61126136779785, "eval_completions/mean_terminated_length": 98.80445098876953, "eval_completions/min_length": 51.25, "eval_completions/min_terminated_length": 51.25, "eval_loss": 0.0, "eval_num_tokens": 385346018.0, "eval_reward": 0.7265625, "eval_reward_std": 0.2514254078269005, "eval_rewards/accuracy_reward": 0.455078125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 24.3286, "eval_samples_per_second": 20.552, "eval_steps_per_second": 0.164, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1536.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 97.898046875, "completions/mean_terminated_length": 96.7735092163086, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.496, "grad_norm": 0.001528796274214983, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 398512654.0, "reward": 0.7744140625, "reward_std": 0.0720748171210289, "rewards/accuracy_reward": 0.549609375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 1317.0, "completions/max_terminated_length": 399.8, "completions/mean_length": 95.37158203125, "completions/mean_terminated_length": 94.66787719726562, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.512, "grad_norm": 0.00132983538787812, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 411491243.0, "reward": 0.779931640625, "reward_std": 0.06670133695006371, "rewards/accuracy_reward": 0.56044921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9994140625, "rewards/mean_confidence_reward": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 1341.4, "completions/max_terminated_length": 662.8, "completions/mean_length": 96.72470703125, "completions/mean_terminated_length": 95.31717834472656, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.528, "grad_norm": 0.001326797646470368, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 424367560.0, "reward": 0.77578125, "reward_std": 0.06582950651645661, "rewards/accuracy_reward": 0.5525390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 699.4, "completions/max_terminated_length": 456.6, "completions/mean_length": 93.36513671875, "completions/mean_terminated_length": 93.0830810546875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.544, "grad_norm": 0.0017570438794791698, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 437343523.0, "reward": 0.796533203125, "reward_std": 0.07356481105089188, "rewards/accuracy_reward": 0.593359375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.8, "completions/max_terminated_length": 444.8, "completions/mean_length": 91.45390625, "completions/mean_terminated_length": 91.45390625, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "epoch": 0.56, "grad_norm": 0.0014896654756739736, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 449957739.0, "reward": 0.76611328125, "reward_std": 0.060523012280464174, "rewards/accuracy_reward": 0.53232421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1289.4, "completions/max_terminated_length": 382.2, "completions/mean_length": 90.7419921875, "completions/mean_terminated_length": 90.17736206054687, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.576, "grad_norm": 0.0016016842564567924, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 462929881.0, "reward": 0.764111328125, "reward_std": 0.05608753189444542, "rewards/accuracy_reward": 0.52861328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 603.8, "completions/max_terminated_length": 356.0, "completions/mean_length": 91.03701171875, "completions/mean_terminated_length": 90.8958251953125, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.592, "grad_norm": 0.0013300231657922268, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 475886132.0, "reward": 0.763671875, "reward_std": 0.06121245920658112, "rewards/accuracy_reward": 0.52744140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00029296875, "completions/max_length": 831.0, "completions/max_terminated_length": 363.8, "completions/mean_length": 94.77607421875, "completions/mean_terminated_length": 94.35356140136719, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.608, "grad_norm": 0.001449022558517754, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 488712447.0, "reward": 0.776220703125, "reward_std": 0.055175574868917464, "rewards/accuracy_reward": 0.55283203125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 719.8, "completions/max_terminated_length": 511.4, "completions/mean_length": 95.49326171875, "completions/mean_terminated_length": 95.3530044555664, "completions/min_length": 42.2, "completions/min_terminated_length": 42.2, "epoch": 0.624, "grad_norm": 0.0016013348940759897, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 501890522.0, "reward": 0.770703125, "reward_std": 0.06375713348388672, "rewards/accuracy_reward": 0.54150390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001171875, "completions/max_length": 1310.4, "completions/max_terminated_length": 450.2, "completions/mean_length": 97.82939453125, "completions/mean_terminated_length": 96.14148406982422, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "epoch": 0.64, "grad_norm": 0.00144854630343616, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 515091303.0, "reward": 0.79462890625, "reward_std": 0.05500866025686264, "rewards/accuracy_reward": 0.5904296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998828125, "rewards/mean_confidence_reward": 0.0, "step": 200 }, { "epoch": 0.64, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 226.5, "eval_completions/max_terminated_length": 226.5, "eval_completions/mean_length": 97.45433807373047, "eval_completions/mean_terminated_length": 97.45433807373047, "eval_completions/min_length": 48.5, "eval_completions/min_terminated_length": 48.5, "eval_loss": 0.0, "eval_num_tokens": 515091303.0, "eval_reward": 0.7216796875, "eval_reward_std": 0.24690637737512589, "eval_rewards/accuracy_reward": 0.443359375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 14.5533, "eval_samples_per_second": 34.357, "eval_steps_per_second": 0.275, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 809.0, "completions/max_terminated_length": 348.2, "completions/mean_length": 95.0333984375, "completions/mean_terminated_length": 94.75228271484374, "completions/min_length": 42.4, "completions/min_terminated_length": 42.4, "epoch": 0.656, "grad_norm": 0.0017163316952064633, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 527777309.0, "reward": 0.75673828125, "reward_std": 0.060856021195650103, "rewards/accuracy_reward": 0.513671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 828.2, "completions/max_terminated_length": 388.2, "completions/mean_length": 95.4216796875, "completions/mean_terminated_length": 94.85912628173828, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "epoch": 0.672, "grad_norm": 0.0013491360004991293, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 540524187.0, "reward": 0.768115234375, "reward_std": 0.05722193792462349, "rewards/accuracy_reward": 0.53662109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1050.6, "completions/max_terminated_length": 373.2, "completions/mean_length": 95.3826171875, "completions/mean_terminated_length": 94.82014465332031, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "epoch": 0.688, "grad_norm": 0.0017077566590160131, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 553311145.0, "reward": 0.779443359375, "reward_std": 0.06311368122696877, "rewards/accuracy_reward": 0.55927734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 810.0, "completions/max_terminated_length": 377.8, "completions/mean_length": 90.67568359375, "completions/mean_terminated_length": 90.11159210205078, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.704, "grad_norm": 0.0018953669350594282, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 565962128.0, "reward": 0.78388671875, "reward_std": 0.05257489308714867, "rewards/accuracy_reward": 0.5681640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0005859375, "completions/max_length": 1065.6, "completions/max_terminated_length": 403.6, "completions/mean_length": 91.39638671875, "completions/mean_terminated_length": 90.54814147949219, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "epoch": 0.72, "grad_norm": 0.0019243984716013074, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 578764203.0, "reward": 0.790576171875, "reward_std": 0.05948638021945953, "rewards/accuracy_reward": 0.58173828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9994140625, "rewards/mean_confidence_reward": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1299.8, "completions/max_terminated_length": 342.0, "completions/mean_length": 91.9408203125, "completions/mean_terminated_length": 90.81210021972656, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.736, "grad_norm": 0.0013582052197307348, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 591501581.0, "reward": 0.79150390625, "reward_std": 0.05623424053192139, "rewards/accuracy_reward": 0.5837890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99921875, "rewards/mean_confidence_reward": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 565.0, "completions/max_terminated_length": 327.2, "completions/mean_length": 91.51416015625, "completions/mean_terminated_length": 91.23197326660156, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "epoch": 0.752, "grad_norm": 0.0015649450942873955, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 604522206.0, "reward": 0.7857421875, "reward_std": 0.05413587838411331, "rewards/accuracy_reward": 0.5716796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 646.2, "completions/max_terminated_length": 607.4, "completions/mean_length": 94.9529296875, "completions/mean_terminated_length": 94.39071350097656, "completions/min_length": 42.4, "completions/min_terminated_length": 42.4, "epoch": 0.768, "grad_norm": 0.0017648260109126568, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 617283548.0, "reward": 0.761279296875, "reward_std": 0.057571640610694884, "rewards/accuracy_reward": 0.52294921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.6, "completions/max_terminated_length": 413.6, "completions/mean_length": 93.616015625, "completions/mean_terminated_length": 93.616015625, "completions/min_length": 43.8, "completions/min_terminated_length": 43.8, "epoch": 0.784, "grad_norm": 0.0011762650683522224, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 630272864.0, "reward": 0.7892578125, "reward_std": 0.05626345500349998, "rewards/accuracy_reward": 0.578515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.8, "completions/max_terminated_length": 326.8, "completions/mean_length": 92.14990234375, "completions/mean_terminated_length": 92.14990234375, "completions/min_length": 44.6, "completions/min_terminated_length": 44.6, "epoch": 0.8, "grad_norm": 0.0013073732843622565, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 643083359.0, "reward": 0.804052734375, "reward_std": 0.05071377567946911, "rewards/accuracy_reward": 0.60810546875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 250 }, { "epoch": 0.8, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 236.75, "eval_completions/max_terminated_length": 236.75, "eval_completions/mean_length": 91.57341003417969, "eval_completions/mean_terminated_length": 91.57341003417969, "eval_completions/min_length": 49.5, "eval_completions/min_terminated_length": 49.5, "eval_loss": 0.0, "eval_num_tokens": 643083359.0, "eval_reward": 0.7265625, "eval_reward_std": 0.24742106348276138, "eval_rewards/accuracy_reward": 0.453125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 14.2768, "eval_samples_per_second": 35.022, "eval_steps_per_second": 0.28, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 588.8, "completions/max_terminated_length": 456.2, "completions/mean_length": 89.20595703125, "completions/mean_terminated_length": 88.92289428710937, "completions/min_length": 43.2, "completions/min_terminated_length": 43.2, "epoch": 0.816, "grad_norm": 0.0019169868901371956, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 655952316.0, "reward": 0.805029296875, "reward_std": 0.054716046899557114, "rewards/accuracy_reward": 0.61025390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.6, "completions/max_terminated_length": 462.6, "completions/mean_length": 91.78896484375, "completions/mean_terminated_length": 91.78896484375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.832, "grad_norm": 0.0014281836338341236, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 668756907.0, "reward": 0.790087890625, "reward_std": 0.0539084292948246, "rewards/accuracy_reward": 0.58017578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 663.6, "completions/max_terminated_length": 417.8, "completions/mean_length": 91.91787109375, "completions/mean_terminated_length": 91.77665557861329, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "epoch": 0.848, "grad_norm": 0.001417965511791408, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 681568834.0, "reward": 0.77236328125, "reward_std": 0.0536438025534153, "rewards/accuracy_reward": 0.54482421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1059.4, "completions/max_terminated_length": 416.2, "completions/mean_length": 94.8748046875, "completions/mean_terminated_length": 94.31214294433593, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.864, "grad_norm": 0.0025267351884394884, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 694383488.0, "reward": 0.809716796875, "reward_std": 0.05303701683878899, "rewards/accuracy_reward": 0.61982421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 592.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 94.92939453125, "completions/mean_terminated_length": 94.78859100341796, "completions/min_length": 45.4, "completions/min_terminated_length": 45.4, "epoch": 0.88, "grad_norm": 0.001522132777608931, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 707358957.0, "reward": 0.760546875, "reward_std": 0.055512601137161256, "rewards/accuracy_reward": 0.52119140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 572.6, "completions/max_terminated_length": 337.0, "completions/mean_length": 97.50546875, "completions/mean_terminated_length": 97.36535034179687, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.896, "grad_norm": 0.0011945873266085982, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 720324581.0, "reward": 0.784912109375, "reward_std": 0.04334753602743149, "rewards/accuracy_reward": 0.569921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 960.6, "completions/max_terminated_length": 469.8, "completions/mean_length": 97.699609375, "completions/mean_terminated_length": 97.41859893798828, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.912, "grad_norm": 0.0012196388561278582, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 733232641.0, "reward": 0.781201171875, "reward_std": 0.0503702849149704, "rewards/accuracy_reward": 0.56259765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 527.2, "completions/max_terminated_length": 509.0, "completions/mean_length": 94.194921875, "completions/mean_terminated_length": 94.05436248779297, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.928, "grad_norm": 0.0010627944720909, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 746080333.0, "reward": 0.77783203125, "reward_std": 0.0477489285171032, "rewards/accuracy_reward": 0.55576171875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 830.8, "completions/max_terminated_length": 384.8, "completions/mean_length": 96.19130859375, "completions/mean_terminated_length": 95.91045227050782, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.944, "grad_norm": 0.0012808856554329395, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 758897076.0, "reward": 0.774853515625, "reward_std": 0.06085398942232132, "rewards/accuracy_reward": 0.54990234375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 618.6, "completions/max_terminated_length": 385.6, "completions/mean_length": 95.019921875, "completions/mean_terminated_length": 94.87908020019532, "completions/min_length": 41.6, "completions/min_terminated_length": 41.6, "epoch": 0.96, "grad_norm": 0.001602579141035676, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 771666720.0, "reward": 0.77060546875, "reward_std": 0.047741709649562834, "rewards/accuracy_reward": 0.54130859375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.0, "step": 300 }, { "epoch": 0.96, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 212.5, "eval_completions/max_terminated_length": 212.5, "eval_completions/mean_length": 94.41190719604492, "eval_completions/mean_terminated_length": 94.41190719604492, "eval_completions/min_length": 51.0, "eval_completions/min_terminated_length": 51.0, "eval_loss": 0.0, "eval_num_tokens": 771666720.0, "eval_reward": 0.7158203125, "eval_reward_std": 0.24653732031583786, "eval_rewards/accuracy_reward": 0.431640625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 13.6738, "eval_samples_per_second": 36.566, "eval_steps_per_second": 0.293, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0005859375, "completions/max_length": 1112.6, "completions/max_terminated_length": 416.8, "completions/mean_length": 95.6703125, "completions/mean_terminated_length": 94.82695617675782, "completions/min_length": 39.8, "completions/min_terminated_length": 39.8, "epoch": 0.976, "grad_norm": 0.0012756388168781996, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 784363824.0, "reward": 0.789599609375, "reward_std": 0.05751314386725426, "rewards/accuracy_reward": 0.57978515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9994140625, "rewards/mean_confidence_reward": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00029296875, "completions/max_length": 813.6, "completions/max_terminated_length": 335.2, "completions/mean_length": 93.523046875, "completions/mean_terminated_length": 93.10098266601562, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "epoch": 0.992, "grad_norm": 0.0012513543479144573, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 797306300.0, "reward": 0.778857421875, "reward_std": 0.04584160037338734, "rewards/accuracy_reward": 0.5580078125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 92.21929550170898, "completions/mean_terminated_length": 92.21929550170898, "completions/min_length": 40.5, "completions/min_terminated_length": 40.5, "epoch": 0.9984, "num_tokens": 802441496.0, "reward": 0.784912109375, "reward_std": 0.055518221110105515, "rewards/accuracy_reward": 0.570556640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.999267578125, "rewards/mean_confidence_reward": 0.0, "step": 312, "total_flos": 0.0, "train_loss": 0.0036987085283889873, "train_runtime": 71191.4821, "train_samples_per_second": 0.281, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 802441496, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }