{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 50, "global_step": 164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24140625, "completions/max_length": 64.6, "completions/max_terminated_length": 64.4, "completions/mean_length": 63.24580078125, "completions/mean_terminated_length": 63.16332855224609, "completions/min_length": 16.2, "completions/min_terminated_length": 16.2, "epoch": 0.030476190476190476, "grad_norm": 0.4937044382095337, "learning_rate": 5.555555555555555e-07, "loss": 0.0283, "num_tokens": 1958357.0, "reward": 0.147213414311409, "reward_std": 0.6661659717559815, "rewards/accuracy_reward": 0.01796875, "rewards/brier_reward": 0.02096688412129879, "rewards/confidence_one_or_zero": 0.0072265625, "rewards/format_reward": 0.0279296875, "rewards/log_2_reward": 0.24852838814258577, "rewards/mean_confidence_reward": 0.03470296487212181, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.366015625, "completions/max_length": 64.6, "completions/max_terminated_length": 64.4, "completions/mean_length": 61.88505859375, "completions/mean_terminated_length": 60.428192138671875, "completions/min_length": 11.2, "completions/min_terminated_length": 11.2, "epoch": 0.06095238095238095, "grad_norm": 0.24706685543060303, "learning_rate": 1e-06, "loss": 0.0898, "num_tokens": 3902780.0, "reward": 0.45152418315410614, "reward_std": 1.2864935278892518, "rewards/accuracy_reward": 0.0533203125, "rewards/brier_reward": 0.06290790475904942, "rewards/confidence_one_or_zero": 0.02109375, "rewards/format_reward": 0.0875, "rewards/log_2_reward": 0.7622280478477478, "rewards/mean_confidence_reward": 0.08969703018665313, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0689453125, "completions/max_length": 64.2, "completions/max_terminated_length": 64.2, "completions/mean_length": 47.2318359375, "completions/mean_terminated_length": 46.269396209716795, "completions/min_length": 17.6, "completions/min_terminated_length": 17.6, "epoch": 0.09142857142857143, "grad_norm": 0.0876019150018692, "learning_rate": 1e-06, "loss": 0.2805, "num_tokens": 5697154.0, "reward": 3.8407562732696534, "reward_std": 2.1547468423843386, "rewards/accuracy_reward": 0.47509765625, "rewards/brier_reward": 0.5359858989715576, "rewards/confidence_one_or_zero": 0.1521484375, "rewards/format_reward": 0.7376953125, "rewards/log_2_reward": 6.46871919631958, "rewards/mean_confidence_reward": 0.6385979354381561, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00185546875, "completions/max_length": 63.4, "completions/max_terminated_length": 61.6, "completions/mean_length": 37.20126953125, "completions/mean_terminated_length": 37.15373306274414, "completions/min_length": 19.6, "completions/min_terminated_length": 19.6, "epoch": 0.1219047619047619, "grad_norm": 0.01609966531395912, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 7388815.0, "reward": 5.555976676940918, "reward_std": 0.33366540521383287, "rewards/accuracy_reward": 0.71943359375, "rewards/brier_reward": 0.8196830987930298, "rewards/confidence_one_or_zero": 0.07275390625, "rewards/format_reward": 0.9970703125, "rewards/log_2_reward": 9.395448875427245, "rewards/mean_confidence_reward": 0.7144028902053833, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 35.7521484375, "completions/mean_terminated_length": 35.7521484375, "completions/min_length": 28.8, "completions/min_terminated_length": 28.8, "epoch": 0.1523809523809524, "grad_norm": 0.007558116689324379, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 9065637.0, "reward": 5.614911460876465, "reward_std": 0.0848025493323803, "rewards/accuracy_reward": 0.7482421875, "rewards/brier_reward": 0.8309232592582703, "rewards/confidence_one_or_zero": 0.02490234375, "rewards/format_reward": 0.99921875, "rewards/log_2_reward": 9.482362365722656, "rewards/mean_confidence_reward": 0.7317079305648804, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.4, "completions/max_terminated_length": 51.4, "completions/mean_length": 34.6220703125, "completions/mean_terminated_length": 34.6220703125, "completions/min_length": 32.4, "completions/min_terminated_length": 32.4, "epoch": 0.18285714285714286, "grad_norm": 0.0032027510460466146, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 10730887.0, "reward": 5.600866794586182, "reward_std": 0.02974188979715109, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.831008231639862, "rewards/confidence_one_or_zero": 0.01875, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.48698787689209, "rewards/mean_confidence_reward": 0.7338953256607056, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 36.87431640625, "completions/mean_terminated_length": 36.87431640625, "completions/min_length": 32.2, "completions/min_terminated_length": 32.2, "epoch": 0.21333333333333335, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 12419200.0, "reward": 5.571425914764404, "reward_std": 0.005977154635274928, "rewards/accuracy_reward": 0.71533203125, "rewards/brier_reward": 0.807461929321289, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.427520751953125, "rewards/mean_confidence_reward": 0.756035315990448, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 37.90732421875, "completions/mean_terminated_length": 37.90732421875, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.2438095238095238, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 14118091.0, "reward": 5.616148853302002, "reward_std": 3.089594500238491e-08, "rewards/accuracy_reward": 0.75, "rewards/brier_reward": 0.8339757204055787, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.482297897338867, "rewards/mean_confidence_reward": 0.7595968842506409, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 37.988671875, "completions/mean_terminated_length": 37.988671875, "completions/min_length": 34.2, "completions/min_terminated_length": 34.2, "epoch": 0.2742857142857143, "grad_norm": 1.1408615563368585e-07, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 15817815.0, "reward": 5.657996559143067, "reward_std": 8.469001613775617e-07, "rewards/accuracy_reward": 0.7875, "rewards/brier_reward": 0.8462499141693115, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.528493499755859, "rewards/mean_confidence_reward": 0.7478750228881836, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.4, "completions/max_terminated_length": 51.4, "completions/mean_length": 37.9708984375, "completions/mean_terminated_length": 37.9708984375, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.3047619047619048, "grad_norm": 5.573377848122618e-07, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 17517357.0, "reward": 5.583265399932861, "reward_std": 0.000277167129360123, "rewards/accuracy_reward": 0.72177734375, "rewards/brier_reward": 0.8111650466918945, "rewards/confidence_one_or_zero": 0.009375, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.444753646850586, "rewards/mean_confidence_reward": 0.756777310371399, "step": 50 }, { "epoch": 0.3047619047619048, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 47.75, "eval_completions/max_terminated_length": 47.75, "eval_completions/mean_length": 38.02060890197754, "eval_completions/mean_terminated_length": 38.02060890197754, "eval_completions/min_length": 36.0, "eval_completions/min_terminated_length": 36.0, "eval_loss": 0.0, "eval_num_tokens": 17517357.0, "eval_reward": 5.624788165092468, "eval_reward_std": 0.46847544610500336, "eval_rewards/accuracy_reward": 0.76953125, "eval_rewards/brier_reward": 0.8338424563407898, "eval_rewards/confidence_one_or_zero": 0.021484375, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.480044841766357, "eval_rewards/mean_confidence_reward": 0.7464218735694885, "eval_runtime": 6.6713, "eval_samples_per_second": 74.948, "eval_steps_per_second": 0.6, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.0583984375, "completions/mean_terminated_length": 38.0583984375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3352380952380952, "grad_norm": 2.0944073142459274e-08, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 19217795.0, "reward": 5.633004188537598, "reward_std": 4.244793103680422e-08, "rewards/accuracy_reward": 0.74375, "rewards/brier_reward": 0.8407172799110413, "rewards/confidence_one_or_zero": 0.025, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.522258758544922, "rewards/mean_confidence_reward": 0.7552843928337097, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 38.00673828125, "completions/mean_terminated_length": 38.00673828125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.3657142857142857, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 20917704.0, "reward": 5.66101713180542, "reward_std": 0.0026967732763623076, "rewards/accuracy_reward": 0.7875, "rewards/brier_reward": 0.8482979655265808, "rewards/confidence_one_or_zero": 0.0125, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.534632110595703, "rewards/mean_confidence_reward": 0.7581686615943909, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 38.03154296875, "completions/mean_terminated_length": 38.03154296875, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.3961904761904762, "grad_norm": 9.332956096841372e-08, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 22617867.0, "reward": 5.594816780090332, "reward_std": 0.00027667832823539127, "rewards/accuracy_reward": 0.72197265625, "rewards/brier_reward": 0.8180258274078369, "rewards/confidence_one_or_zero": 0.025, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.467660713195801, "rewards/mean_confidence_reward": 0.7601582884788514, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 53.8, "completions/max_terminated_length": 51.6, "completions/mean_length": 38.0232421875, "completions/mean_terminated_length": 38.020800018310545, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.4266666666666667, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 24317945.0, "reward": 5.6432279586792, "reward_std": 0.0026955028995871546, "rewards/accuracy_reward": 0.76875, "rewards/brier_reward": 0.8412735939025879, "rewards/confidence_one_or_zero": 0.009375, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.517803573608399, "rewards/mean_confidence_reward": 0.7524774312973023, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.0142578125, "completions/mean_terminated_length": 38.0142578125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.45714285714285713, "grad_norm": 2.269921139941289e-08, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 26017931.0, "reward": 5.630081748962402, "reward_std": 4.863502539365072e-07, "rewards/accuracy_reward": 0.74375, "rewards/brier_reward": 0.8380238056182862, "rewards/confidence_one_or_zero": 0.00625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.516414070129395, "rewards/mean_confidence_reward": 0.7339439868927002, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.0216796875, "completions/mean_terminated_length": 38.0216796875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.4876190476190476, "grad_norm": 1.2988358832899394e-07, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 27717993.0, "reward": 5.639082336425782, "reward_std": 1.4793427283166238e-07, "rewards/accuracy_reward": 0.759375, "rewards/brier_reward": 0.8395630717277527, "rewards/confidence_one_or_zero": 0.01884765625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.51879005432129, "rewards/mean_confidence_reward": 0.760356330871582, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.04814453125, "completions/mean_terminated_length": 38.04814453125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.518095238095238, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 29418326.0, "reward": 5.621638393402099, "reward_std": 5.215314899942314e-07, "rewards/accuracy_reward": 0.740625, "rewards/brier_reward": 0.8350816965103149, "rewards/confidence_one_or_zero": 0.015625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.502651786804199, "rewards/mean_confidence_reward": 0.7518718242645264, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.00283203125, "completions/mean_terminated_length": 38.00283203125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.5485714285714286, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 31118195.0, "reward": 5.64435043334961, "reward_std": 9.689834064374736e-06, "rewards/accuracy_reward": 0.771875, "rewards/brier_reward": 0.8399171829223633, "rewards/confidence_one_or_zero": 0.01875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.516826057434082, "rewards/mean_confidence_reward": 0.7615166783332825, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 37.99736328125, "completions/mean_terminated_length": 37.99736328125, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.579047619047619, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 32818008.0, "reward": 5.644574451446533, "reward_std": 2.637261195559404e-06, "rewards/accuracy_reward": 0.771875, "rewards/brier_reward": 0.8405626773834228, "rewards/confidence_one_or_zero": 0.0125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.517273902893066, "rewards/mean_confidence_reward": 0.7512939691543579, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 37.9896484375, "completions/mean_terminated_length": 37.9896484375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6095238095238096, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 34517742.0, "reward": 5.623265933990479, "reward_std": 2.0761127643709188e-07, "rewards/accuracy_reward": 0.7375, "rewards/brier_reward": 0.8349212646484375, "rewards/confidence_one_or_zero": 0.040625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.509031677246094, "rewards/mean_confidence_reward": 0.7413062334060669, "step": 100 }, { "epoch": 0.6095238095238096, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 41.5, "eval_completions/max_terminated_length": 41.5, "eval_completions/mean_length": 37.94639015197754, "eval_completions/mean_terminated_length": 37.94639015197754, "eval_completions/min_length": 36.0, "eval_completions/min_terminated_length": 36.0, "eval_loss": 0.0, "eval_num_tokens": 34517742.0, "eval_reward": 5.625764727592468, "eval_reward_std": 0.46847712993621826, "eval_rewards/accuracy_reward": 0.771484375, "eval_rewards/brier_reward": 0.8338424563407898, "eval_rewards/confidence_one_or_zero": 0.021484375, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.480044841766357, "eval_rewards/mean_confidence_reward": 0.7464453130960464, "eval_runtime": 6.321, "eval_samples_per_second": 79.101, "eval_steps_per_second": 0.633, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.013671875, "completions/mean_terminated_length": 38.013671875, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.64, "grad_norm": 2.458902770285931e-07, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 36217722.0, "reward": 5.606796550750732, "reward_std": 4.680077182683817e-07, "rewards/accuracy_reward": 0.73125, "rewards/brier_reward": 0.8244146585464478, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.482342720031738, "rewards/mean_confidence_reward": 0.7441876769065857, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 38.02109375, "completions/mean_terminated_length": 38.02109375, "completions/min_length": 35.4, "completions/min_terminated_length": 35.4, "epoch": 0.6704761904761904, "grad_norm": 4.270423517027666e-07, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 37917778.0, "reward": 5.600750255584717, "reward_std": 1.2465623044022323e-06, "rewards/accuracy_reward": 0.725, "rewards/brier_reward": 0.8319555759429932, "rewards/confidence_one_or_zero": 0.00625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.476500701904296, "rewards/mean_confidence_reward": 0.7388657212257386, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.03330078125, "completions/mean_terminated_length": 38.03330078125, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.700952380952381, "grad_norm": 1.4557041794205361e-08, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 39617959.0, "reward": 5.6239110946655275, "reward_std": 1.9214322506400093e-07, "rewards/accuracy_reward": 0.74375, "rewards/brier_reward": 0.8344948053359985, "rewards/confidence_one_or_zero": 0.028125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.504072189331055, "rewards/mean_confidence_reward": 0.761590576171875, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 37.98671875, "completions/mean_terminated_length": 37.98671875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.7314285714285714, "grad_norm": 3.9047936439828845e-08, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 41317663.0, "reward": 5.612829780578613, "reward_std": 5.51469579335162e-08, "rewards/accuracy_reward": 0.740625, "rewards/brier_reward": 0.8286247611045837, "rewards/confidence_one_or_zero": 0.0125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.48503475189209, "rewards/mean_confidence_reward": 0.7615031599998474, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.0029296875, "completions/mean_terminated_length": 38.0029296875, "completions/min_length": 36.2, "completions/min_terminated_length": 36.2, "epoch": 0.7619047619047619, "grad_norm": 9.443181170354364e-08, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 43017533.0, "reward": 5.626171684265136, "reward_std": 5.714372832699155e-07, "rewards/accuracy_reward": 0.74375, "rewards/brier_reward": 0.834185004234314, "rewards/confidence_one_or_zero": 0.01875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.50859317779541, "rewards/mean_confidence_reward": 0.768284547328949, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.01806640625, "completions/mean_terminated_length": 38.01806640625, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.7923809523809524, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 44717558.0, "reward": 5.594818878173828, "reward_std": 2.063908056015862e-07, "rewards/accuracy_reward": 0.71875, "rewards/brier_reward": 0.8205490350723267, "rewards/confidence_one_or_zero": 0.028125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.47088794708252, "rewards/mean_confidence_reward": 0.7437156438827515, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.01826171875, "completions/mean_terminated_length": 38.01826171875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.8228571428571428, "grad_norm": 0.002601601416245103, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 46417585.0, "reward": 5.612351322174073, "reward_std": 0.0027639744678197077, "rewards/accuracy_reward": 0.746875, "rewards/brier_reward": 0.8323373436927796, "rewards/confidence_one_or_zero": 0.0375, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.47782859802246, "rewards/mean_confidence_reward": 0.7525945901870728, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.6, "completions/max_terminated_length": 51.6, "completions/mean_length": 38.02470703125, "completions/mean_terminated_length": 38.02470703125, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.8533333333333334, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 48117678.0, "reward": 5.624975490570068, "reward_std": 5.118799759173954e-07, "rewards/accuracy_reward": 0.765625, "rewards/brier_reward": 0.8380842685699463, "rewards/confidence_one_or_zero": 0.028125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.484325790405274, "rewards/mean_confidence_reward": 0.746293580532074, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.101171875, "completions/mean_terminated_length": 38.101171875, "completions/min_length": 36.8, "completions/min_terminated_length": 36.8, "epoch": 0.8838095238095238, "grad_norm": 1.7589671941209417e-08, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 49818554.0, "reward": 5.583425998687744, "reward_std": 0.0031354847874361267, "rewards/accuracy_reward": 0.70927734375, "rewards/brier_reward": 0.81463543176651, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.457672691345214, "rewards/mean_confidence_reward": 0.7387593507766723, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 38.06513671875, "completions/mean_terminated_length": 38.06513671875, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.9142857142857143, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 51519061.0, "reward": 5.636477375030518, "reward_std": 0.00027642484747048, "rewards/accuracy_reward": 0.76552734375, "rewards/brier_reward": 0.8363968968391419, "rewards/confidence_one_or_zero": 0.034375, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.507427406311034, "rewards/mean_confidence_reward": 0.7564831972122192, "step": 150 }, { "epoch": 0.9142857142857143, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 41.25, "eval_completions/max_terminated_length": 41.25, "eval_completions/mean_length": 37.91904640197754, "eval_completions/mean_terminated_length": 37.91904640197754, "eval_completions/min_length": 36.0, "eval_completions/min_terminated_length": 36.0, "eval_loss": 0.0, "eval_num_tokens": 51519061.0, "eval_reward": 5.625764727592468, "eval_reward_std": 0.46847712993621826, "eval_rewards/accuracy_reward": 0.771484375, "eval_rewards/brier_reward": 0.8338424563407898, "eval_rewards/confidence_one_or_zero": 0.021484375, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.480044841766357, "eval_rewards/mean_confidence_reward": 0.7464453130960464, "eval_runtime": 6.2742, "eval_samples_per_second": 79.692, "eval_steps_per_second": 0.638, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.05771484375, "completions/mean_terminated_length": 38.05771484375, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.9447619047619048, "grad_norm": 1.0517731396930685e-07, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 53219492.0, "reward": 5.621528244018554, "reward_std": 8.822739161473692e-07, "rewards/accuracy_reward": 0.740625, "rewards/brier_reward": 0.8325549244880677, "rewards/confidence_one_or_zero": 0.021875, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.502431297302246, "rewards/mean_confidence_reward": 0.7421627283096314, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.8, "completions/max_terminated_length": 51.8, "completions/mean_length": 38.035546875, "completions/mean_terminated_length": 38.035546875, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.9752380952380952, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 54919696.0, "reward": 5.657506370544434, "reward_std": 8.810078497845097e-08, "rewards/accuracy_reward": 0.775, "rewards/brier_reward": 0.8494255661964416, "rewards/confidence_one_or_zero": 0.0125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.540012550354003, "rewards/mean_confidence_reward": 0.7592812776565552, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.75, "completions/max_terminated_length": 51.75, "completions/mean_length": 38.09580993652344, "completions/mean_terminated_length": 38.09580993652344, "completions/min_length": 36.75, "completions/min_terminated_length": 36.75, "epoch": 0.9996190476190476, "num_tokens": 56280078.0, "reward": 5.648329019546509, "reward_std": 7.243279220858767e-08, "rewards/accuracy_reward": 0.765625, "rewards/brier_reward": 0.8462725430727005, "rewards/confidence_one_or_zero": 0.03125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.531033277511597, "rewards/mean_confidence_reward": 0.7498086243867874, "step": 164, "total_flos": 0.0, "train_loss": 0.012317530474541073, "train_runtime": 10837.5902, "train_samples_per_second": 0.969, "train_steps_per_second": 0.015 } ], "logging_steps": 5, "max_steps": 164, "num_input_tokens_seen": 56280078, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }