| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9996190476190476, | |
| "eval_steps": 50, | |
| "global_step": 164, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.24140625, | |
| "completions/max_length": 64.6, | |
| "completions/max_terminated_length": 64.4, | |
| "completions/mean_length": 63.24580078125, | |
| "completions/mean_terminated_length": 63.16332855224609, | |
| "completions/min_length": 16.2, | |
| "completions/min_terminated_length": 16.2, | |
| "epoch": 0.030476190476190476, | |
| "grad_norm": 0.4937044382095337, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 0.0283, | |
| "num_tokens": 1958357.0, | |
| "reward": 0.147213414311409, | |
| "reward_std": 0.6661659717559815, | |
| "rewards/accuracy_reward": 0.01796875, | |
| "rewards/brier_reward": 0.02096688412129879, | |
| "rewards/confidence_one_or_zero": 0.0072265625, | |
| "rewards/format_reward": 0.0279296875, | |
| "rewards/log_2_reward": 0.24852838814258577, | |
| "rewards/mean_confidence_reward": 0.03470296487212181, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.366015625, | |
| "completions/max_length": 64.6, | |
| "completions/max_terminated_length": 64.4, | |
| "completions/mean_length": 61.88505859375, | |
| "completions/mean_terminated_length": 60.428192138671875, | |
| "completions/min_length": 11.2, | |
| "completions/min_terminated_length": 11.2, | |
| "epoch": 0.06095238095238095, | |
| "grad_norm": 0.24706685543060303, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0898, | |
| "num_tokens": 3902780.0, | |
| "reward": 0.45152418315410614, | |
| "reward_std": 1.2864935278892518, | |
| "rewards/accuracy_reward": 0.0533203125, | |
| "rewards/brier_reward": 0.06290790475904942, | |
| "rewards/confidence_one_or_zero": 0.02109375, | |
| "rewards/format_reward": 0.0875, | |
| "rewards/log_2_reward": 0.7622280478477478, | |
| "rewards/mean_confidence_reward": 0.08969703018665313, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0689453125, | |
| "completions/max_length": 64.2, | |
| "completions/max_terminated_length": 64.2, | |
| "completions/mean_length": 47.2318359375, | |
| "completions/mean_terminated_length": 46.269396209716795, | |
| "completions/min_length": 17.6, | |
| "completions/min_terminated_length": 17.6, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.0876019150018692, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2805, | |
| "num_tokens": 5697154.0, | |
| "reward": 3.8407562732696534, | |
| "reward_std": 2.1547468423843386, | |
| "rewards/accuracy_reward": 0.47509765625, | |
| "rewards/brier_reward": 0.5359858989715576, | |
| "rewards/confidence_one_or_zero": 0.1521484375, | |
| "rewards/format_reward": 0.7376953125, | |
| "rewards/log_2_reward": 6.46871919631958, | |
| "rewards/mean_confidence_reward": 0.6385979354381561, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00185546875, | |
| "completions/max_length": 63.4, | |
| "completions/max_terminated_length": 61.6, | |
| "completions/mean_length": 37.20126953125, | |
| "completions/mean_terminated_length": 37.15373306274414, | |
| "completions/min_length": 19.6, | |
| "completions/min_terminated_length": 19.6, | |
| "epoch": 0.1219047619047619, | |
| "grad_norm": 0.01609966531395912, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0059, | |
| "num_tokens": 7388815.0, | |
| "reward": 5.555976676940918, | |
| "reward_std": 0.33366540521383287, | |
| "rewards/accuracy_reward": 0.71943359375, | |
| "rewards/brier_reward": 0.8196830987930298, | |
| "rewards/confidence_one_or_zero": 0.07275390625, | |
| "rewards/format_reward": 0.9970703125, | |
| "rewards/log_2_reward": 9.395448875427245, | |
| "rewards/mean_confidence_reward": 0.7144028902053833, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 54.0, | |
| "completions/max_terminated_length": 54.0, | |
| "completions/mean_length": 35.7521484375, | |
| "completions/mean_terminated_length": 35.7521484375, | |
| "completions/min_length": 28.8, | |
| "completions/min_terminated_length": 28.8, | |
| "epoch": 0.1523809523809524, | |
| "grad_norm": 0.007558116689324379, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0006, | |
| "num_tokens": 9065637.0, | |
| "reward": 5.614911460876465, | |
| "reward_std": 0.0848025493323803, | |
| "rewards/accuracy_reward": 0.7482421875, | |
| "rewards/brier_reward": 0.8309232592582703, | |
| "rewards/confidence_one_or_zero": 0.02490234375, | |
| "rewards/format_reward": 0.99921875, | |
| "rewards/log_2_reward": 9.482362365722656, | |
| "rewards/mean_confidence_reward": 0.7317079305648804, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.4, | |
| "completions/max_terminated_length": 51.4, | |
| "completions/mean_length": 34.6220703125, | |
| "completions/mean_terminated_length": 34.6220703125, | |
| "completions/min_length": 32.4, | |
| "completions/min_terminated_length": 32.4, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.0032027510460466146, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 10730887.0, | |
| "reward": 5.600866794586182, | |
| "reward_std": 0.02974188979715109, | |
| "rewards/accuracy_reward": 0.71484375, | |
| "rewards/brier_reward": 0.831008231639862, | |
| "rewards/confidence_one_or_zero": 0.01875, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.48698787689209, | |
| "rewards/mean_confidence_reward": 0.7338953256607056, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 36.87431640625, | |
| "completions/mean_terminated_length": 36.87431640625, | |
| "completions/min_length": 32.2, | |
| "completions/min_terminated_length": 32.2, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 12419200.0, | |
| "reward": 5.571425914764404, | |
| "reward_std": 0.005977154635274928, | |
| "rewards/accuracy_reward": 0.71533203125, | |
| "rewards/brier_reward": 0.807461929321289, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.427520751953125, | |
| "rewards/mean_confidence_reward": 0.756035315990448, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 37.90732421875, | |
| "completions/mean_terminated_length": 37.90732421875, | |
| "completions/min_length": 33.4, | |
| "completions/min_terminated_length": 33.4, | |
| "epoch": 0.2438095238095238, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 14118091.0, | |
| "reward": 5.616148853302002, | |
| "reward_std": 3.089594500238491e-08, | |
| "rewards/accuracy_reward": 0.75, | |
| "rewards/brier_reward": 0.8339757204055787, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.482297897338867, | |
| "rewards/mean_confidence_reward": 0.7595968842506409, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 37.988671875, | |
| "completions/mean_terminated_length": 37.988671875, | |
| "completions/min_length": 34.2, | |
| "completions/min_terminated_length": 34.2, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 1.1408615563368585e-07, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 15817815.0, | |
| "reward": 5.657996559143067, | |
| "reward_std": 8.469001613775617e-07, | |
| "rewards/accuracy_reward": 0.7875, | |
| "rewards/brier_reward": 0.8462499141693115, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.528493499755859, | |
| "rewards/mean_confidence_reward": 0.7478750228881836, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.4, | |
| "completions/max_terminated_length": 51.4, | |
| "completions/mean_length": 37.9708984375, | |
| "completions/mean_terminated_length": 37.9708984375, | |
| "completions/min_length": 34.6, | |
| "completions/min_terminated_length": 34.6, | |
| "epoch": 0.3047619047619048, | |
| "grad_norm": 5.573377848122618e-07, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 17517357.0, | |
| "reward": 5.583265399932861, | |
| "reward_std": 0.000277167129360123, | |
| "rewards/accuracy_reward": 0.72177734375, | |
| "rewards/brier_reward": 0.8111650466918945, | |
| "rewards/confidence_one_or_zero": 0.009375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.444753646850586, | |
| "rewards/mean_confidence_reward": 0.756777310371399, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3047619047619048, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 47.75, | |
| "eval_completions/max_terminated_length": 47.75, | |
| "eval_completions/mean_length": 38.02060890197754, | |
| "eval_completions/mean_terminated_length": 38.02060890197754, | |
| "eval_completions/min_length": 36.0, | |
| "eval_completions/min_terminated_length": 36.0, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 17517357.0, | |
| "eval_reward": 5.624788165092468, | |
| "eval_reward_std": 0.46847544610500336, | |
| "eval_rewards/accuracy_reward": 0.76953125, | |
| "eval_rewards/brier_reward": 0.8338424563407898, | |
| "eval_rewards/confidence_one_or_zero": 0.021484375, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.480044841766357, | |
| "eval_rewards/mean_confidence_reward": 0.7464218735694885, | |
| "eval_runtime": 6.6713, | |
| "eval_samples_per_second": 74.948, | |
| "eval_steps_per_second": 0.6, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.0583984375, | |
| "completions/mean_terminated_length": 38.0583984375, | |
| "completions/min_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "epoch": 0.3352380952380952, | |
| "grad_norm": 2.0944073142459274e-08, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 19217795.0, | |
| "reward": 5.633004188537598, | |
| "reward_std": 4.244793103680422e-08, | |
| "rewards/accuracy_reward": 0.74375, | |
| "rewards/brier_reward": 0.8407172799110413, | |
| "rewards/confidence_one_or_zero": 0.025, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.522258758544922, | |
| "rewards/mean_confidence_reward": 0.7552843928337097, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 38.00673828125, | |
| "completions/mean_terminated_length": 38.00673828125, | |
| "completions/min_length": 35.0, | |
| "completions/min_terminated_length": 35.0, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 20917704.0, | |
| "reward": 5.66101713180542, | |
| "reward_std": 0.0026967732763623076, | |
| "rewards/accuracy_reward": 0.7875, | |
| "rewards/brier_reward": 0.8482979655265808, | |
| "rewards/confidence_one_or_zero": 0.0125, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.534632110595703, | |
| "rewards/mean_confidence_reward": 0.7581686615943909, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 38.03154296875, | |
| "completions/mean_terminated_length": 38.03154296875, | |
| "completions/min_length": 35.6, | |
| "completions/min_terminated_length": 35.6, | |
| "epoch": 0.3961904761904762, | |
| "grad_norm": 9.332956096841372e-08, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 22617867.0, | |
| "reward": 5.594816780090332, | |
| "reward_std": 0.00027667832823539127, | |
| "rewards/accuracy_reward": 0.72197265625, | |
| "rewards/brier_reward": 0.8180258274078369, | |
| "rewards/confidence_one_or_zero": 0.025, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.467660713195801, | |
| "rewards/mean_confidence_reward": 0.7601582884788514, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 53.8, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 38.0232421875, | |
| "completions/mean_terminated_length": 38.020800018310545, | |
| "completions/min_length": 35.6, | |
| "completions/min_terminated_length": 35.6, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 24317945.0, | |
| "reward": 5.6432279586792, | |
| "reward_std": 0.0026955028995871546, | |
| "rewards/accuracy_reward": 0.76875, | |
| "rewards/brier_reward": 0.8412735939025879, | |
| "rewards/confidence_one_or_zero": 0.009375, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.517803573608399, | |
| "rewards/mean_confidence_reward": 0.7524774312973023, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.0142578125, | |
| "completions/mean_terminated_length": 38.0142578125, | |
| "completions/min_length": 36.6, | |
| "completions/min_terminated_length": 36.6, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 2.269921139941289e-08, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 26017931.0, | |
| "reward": 5.630081748962402, | |
| "reward_std": 4.863502539365072e-07, | |
| "rewards/accuracy_reward": 0.74375, | |
| "rewards/brier_reward": 0.8380238056182862, | |
| "rewards/confidence_one_or_zero": 0.00625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.516414070129395, | |
| "rewards/mean_confidence_reward": 0.7339439868927002, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.0216796875, | |
| "completions/mean_terminated_length": 38.0216796875, | |
| "completions/min_length": 36.2, | |
| "completions/min_terminated_length": 36.2, | |
| "epoch": 0.4876190476190476, | |
| "grad_norm": 1.2988358832899394e-07, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 27717993.0, | |
| "reward": 5.639082336425782, | |
| "reward_std": 1.4793427283166238e-07, | |
| "rewards/accuracy_reward": 0.759375, | |
| "rewards/brier_reward": 0.8395630717277527, | |
| "rewards/confidence_one_or_zero": 0.01884765625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.51879005432129, | |
| "rewards/mean_confidence_reward": 0.760356330871582, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.04814453125, | |
| "completions/mean_terminated_length": 38.04814453125, | |
| "completions/min_length": 35.8, | |
| "completions/min_terminated_length": 35.8, | |
| "epoch": 0.518095238095238, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 29418326.0, | |
| "reward": 5.621638393402099, | |
| "reward_std": 5.215314899942314e-07, | |
| "rewards/accuracy_reward": 0.740625, | |
| "rewards/brier_reward": 0.8350816965103149, | |
| "rewards/confidence_one_or_zero": 0.015625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.502651786804199, | |
| "rewards/mean_confidence_reward": 0.7518718242645264, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.00283203125, | |
| "completions/mean_terminated_length": 38.00283203125, | |
| "completions/min_length": 36.4, | |
| "completions/min_terminated_length": 36.4, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 31118195.0, | |
| "reward": 5.64435043334961, | |
| "reward_std": 9.689834064374736e-06, | |
| "rewards/accuracy_reward": 0.771875, | |
| "rewards/brier_reward": 0.8399171829223633, | |
| "rewards/confidence_one_or_zero": 0.01875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.516826057434082, | |
| "rewards/mean_confidence_reward": 0.7615166783332825, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 37.99736328125, | |
| "completions/mean_terminated_length": 37.99736328125, | |
| "completions/min_length": 36.6, | |
| "completions/min_terminated_length": 36.6, | |
| "epoch": 0.579047619047619, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 32818008.0, | |
| "reward": 5.644574451446533, | |
| "reward_std": 2.637261195559404e-06, | |
| "rewards/accuracy_reward": 0.771875, | |
| "rewards/brier_reward": 0.8405626773834228, | |
| "rewards/confidence_one_or_zero": 0.0125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.517273902893066, | |
| "rewards/mean_confidence_reward": 0.7512939691543579, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 37.9896484375, | |
| "completions/mean_terminated_length": 37.9896484375, | |
| "completions/min_length": 36.0, | |
| "completions/min_terminated_length": 36.0, | |
| "epoch": 0.6095238095238096, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 34517742.0, | |
| "reward": 5.623265933990479, | |
| "reward_std": 2.0761127643709188e-07, | |
| "rewards/accuracy_reward": 0.7375, | |
| "rewards/brier_reward": 0.8349212646484375, | |
| "rewards/confidence_one_or_zero": 0.040625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.509031677246094, | |
| "rewards/mean_confidence_reward": 0.7413062334060669, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6095238095238096, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 41.5, | |
| "eval_completions/max_terminated_length": 41.5, | |
| "eval_completions/mean_length": 37.94639015197754, | |
| "eval_completions/mean_terminated_length": 37.94639015197754, | |
| "eval_completions/min_length": 36.0, | |
| "eval_completions/min_terminated_length": 36.0, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 34517742.0, | |
| "eval_reward": 5.625764727592468, | |
| "eval_reward_std": 0.46847712993621826, | |
| "eval_rewards/accuracy_reward": 0.771484375, | |
| "eval_rewards/brier_reward": 0.8338424563407898, | |
| "eval_rewards/confidence_one_or_zero": 0.021484375, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.480044841766357, | |
| "eval_rewards/mean_confidence_reward": 0.7464453130960464, | |
| "eval_runtime": 6.321, | |
| "eval_samples_per_second": 79.101, | |
| "eval_steps_per_second": 0.633, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.013671875, | |
| "completions/mean_terminated_length": 38.013671875, | |
| "completions/min_length": 36.6, | |
| "completions/min_terminated_length": 36.6, | |
| "epoch": 0.64, | |
| "grad_norm": 2.458902770285931e-07, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 36217722.0, | |
| "reward": 5.606796550750732, | |
| "reward_std": 4.680077182683817e-07, | |
| "rewards/accuracy_reward": 0.73125, | |
| "rewards/brier_reward": 0.8244146585464478, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.482342720031738, | |
| "rewards/mean_confidence_reward": 0.7441876769065857, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 38.02109375, | |
| "completions/mean_terminated_length": 38.02109375, | |
| "completions/min_length": 35.4, | |
| "completions/min_terminated_length": 35.4, | |
| "epoch": 0.6704761904761904, | |
| "grad_norm": 4.270423517027666e-07, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 37917778.0, | |
| "reward": 5.600750255584717, | |
| "reward_std": 1.2465623044022323e-06, | |
| "rewards/accuracy_reward": 0.725, | |
| "rewards/brier_reward": 0.8319555759429932, | |
| "rewards/confidence_one_or_zero": 0.00625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.476500701904296, | |
| "rewards/mean_confidence_reward": 0.7388657212257386, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.03330078125, | |
| "completions/mean_terminated_length": 38.03330078125, | |
| "completions/min_length": 36.4, | |
| "completions/min_terminated_length": 36.4, | |
| "epoch": 0.700952380952381, | |
| "grad_norm": 1.4557041794205361e-08, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 39617959.0, | |
| "reward": 5.6239110946655275, | |
| "reward_std": 1.9214322506400093e-07, | |
| "rewards/accuracy_reward": 0.74375, | |
| "rewards/brier_reward": 0.8344948053359985, | |
| "rewards/confidence_one_or_zero": 0.028125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.504072189331055, | |
| "rewards/mean_confidence_reward": 0.761590576171875, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 37.98671875, | |
| "completions/mean_terminated_length": 37.98671875, | |
| "completions/min_length": 35.8, | |
| "completions/min_terminated_length": 35.8, | |
| "epoch": 0.7314285714285714, | |
| "grad_norm": 3.9047936439828845e-08, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 41317663.0, | |
| "reward": 5.612829780578613, | |
| "reward_std": 5.51469579335162e-08, | |
| "rewards/accuracy_reward": 0.740625, | |
| "rewards/brier_reward": 0.8286247611045837, | |
| "rewards/confidence_one_or_zero": 0.0125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.48503475189209, | |
| "rewards/mean_confidence_reward": 0.7615031599998474, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.0029296875, | |
| "completions/mean_terminated_length": 38.0029296875, | |
| "completions/min_length": 36.2, | |
| "completions/min_terminated_length": 36.2, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 9.443181170354364e-08, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 43017533.0, | |
| "reward": 5.626171684265136, | |
| "reward_std": 5.714372832699155e-07, | |
| "rewards/accuracy_reward": 0.74375, | |
| "rewards/brier_reward": 0.834185004234314, | |
| "rewards/confidence_one_or_zero": 0.01875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.50859317779541, | |
| "rewards/mean_confidence_reward": 0.768284547328949, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.01806640625, | |
| "completions/mean_terminated_length": 38.01806640625, | |
| "completions/min_length": 36.6, | |
| "completions/min_terminated_length": 36.6, | |
| "epoch": 0.7923809523809524, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 44717558.0, | |
| "reward": 5.594818878173828, | |
| "reward_std": 2.063908056015862e-07, | |
| "rewards/accuracy_reward": 0.71875, | |
| "rewards/brier_reward": 0.8205490350723267, | |
| "rewards/confidence_one_or_zero": 0.028125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.47088794708252, | |
| "rewards/mean_confidence_reward": 0.7437156438827515, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.01826171875, | |
| "completions/mean_terminated_length": 38.01826171875, | |
| "completions/min_length": 36.8, | |
| "completions/min_terminated_length": 36.8, | |
| "epoch": 0.8228571428571428, | |
| "grad_norm": 0.002601601416245103, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 46417585.0, | |
| "reward": 5.612351322174073, | |
| "reward_std": 0.0027639744678197077, | |
| "rewards/accuracy_reward": 0.746875, | |
| "rewards/brier_reward": 0.8323373436927796, | |
| "rewards/confidence_one_or_zero": 0.0375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.47782859802246, | |
| "rewards/mean_confidence_reward": 0.7525945901870728, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.6, | |
| "completions/max_terminated_length": 51.6, | |
| "completions/mean_length": 38.02470703125, | |
| "completions/mean_terminated_length": 38.02470703125, | |
| "completions/min_length": 35.8, | |
| "completions/min_terminated_length": 35.8, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 48117678.0, | |
| "reward": 5.624975490570068, | |
| "reward_std": 5.118799759173954e-07, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/brier_reward": 0.8380842685699463, | |
| "rewards/confidence_one_or_zero": 0.028125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.484325790405274, | |
| "rewards/mean_confidence_reward": 0.746293580532074, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.101171875, | |
| "completions/mean_terminated_length": 38.101171875, | |
| "completions/min_length": 36.8, | |
| "completions/min_terminated_length": 36.8, | |
| "epoch": 0.8838095238095238, | |
| "grad_norm": 1.7589671941209417e-08, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 49818554.0, | |
| "reward": 5.583425998687744, | |
| "reward_std": 0.0031354847874361267, | |
| "rewards/accuracy_reward": 0.70927734375, | |
| "rewards/brier_reward": 0.81463543176651, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.457672691345214, | |
| "rewards/mean_confidence_reward": 0.7387593507766723, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 52.0, | |
| "completions/max_terminated_length": 52.0, | |
| "completions/mean_length": 38.06513671875, | |
| "completions/mean_terminated_length": 38.06513671875, | |
| "completions/min_length": 35.8, | |
| "completions/min_terminated_length": 35.8, | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 51519061.0, | |
| "reward": 5.636477375030518, | |
| "reward_std": 0.00027642484747048, | |
| "rewards/accuracy_reward": 0.76552734375, | |
| "rewards/brier_reward": 0.8363968968391419, | |
| "rewards/confidence_one_or_zero": 0.034375, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.507427406311034, | |
| "rewards/mean_confidence_reward": 0.7564831972122192, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 41.25, | |
| "eval_completions/max_terminated_length": 41.25, | |
| "eval_completions/mean_length": 37.91904640197754, | |
| "eval_completions/mean_terminated_length": 37.91904640197754, | |
| "eval_completions/min_length": 36.0, | |
| "eval_completions/min_terminated_length": 36.0, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 51519061.0, | |
| "eval_reward": 5.625764727592468, | |
| "eval_reward_std": 0.46847712993621826, | |
| "eval_rewards/accuracy_reward": 0.771484375, | |
| "eval_rewards/brier_reward": 0.8338424563407898, | |
| "eval_rewards/confidence_one_or_zero": 0.021484375, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.480044841766357, | |
| "eval_rewards/mean_confidence_reward": 0.7464453130960464, | |
| "eval_runtime": 6.2742, | |
| "eval_samples_per_second": 79.692, | |
| "eval_steps_per_second": 0.638, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.05771484375, | |
| "completions/mean_terminated_length": 38.05771484375, | |
| "completions/min_length": 35.8, | |
| "completions/min_terminated_length": 35.8, | |
| "epoch": 0.9447619047619048, | |
| "grad_norm": 1.0517731396930685e-07, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 53219492.0, | |
| "reward": 5.621528244018554, | |
| "reward_std": 8.822739161473692e-07, | |
| "rewards/accuracy_reward": 0.740625, | |
| "rewards/brier_reward": 0.8325549244880677, | |
| "rewards/confidence_one_or_zero": 0.021875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.502431297302246, | |
| "rewards/mean_confidence_reward": 0.7421627283096314, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.8, | |
| "completions/max_terminated_length": 51.8, | |
| "completions/mean_length": 38.035546875, | |
| "completions/mean_terminated_length": 38.035546875, | |
| "completions/min_length": 36.4, | |
| "completions/min_terminated_length": 36.4, | |
| "epoch": 0.9752380952380952, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 54919696.0, | |
| "reward": 5.657506370544434, | |
| "reward_std": 8.810078497845097e-08, | |
| "rewards/accuracy_reward": 0.775, | |
| "rewards/brier_reward": 0.8494255661964416, | |
| "rewards/confidence_one_or_zero": 0.0125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.540012550354003, | |
| "rewards/mean_confidence_reward": 0.7592812776565552, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 51.75, | |
| "completions/max_terminated_length": 51.75, | |
| "completions/mean_length": 38.09580993652344, | |
| "completions/mean_terminated_length": 38.09580993652344, | |
| "completions/min_length": 36.75, | |
| "completions/min_terminated_length": 36.75, | |
| "epoch": 0.9996190476190476, | |
| "num_tokens": 56280078.0, | |
| "reward": 5.648329019546509, | |
| "reward_std": 7.243279220858767e-08, | |
| "rewards/accuracy_reward": 0.765625, | |
| "rewards/brier_reward": 0.8462725430727005, | |
| "rewards/confidence_one_or_zero": 0.03125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.531033277511597, | |
| "rewards/mean_confidence_reward": 0.7498086243867874, | |
| "step": 164, | |
| "total_flos": 0.0, | |
| "train_loss": 0.012317530474541073, | |
| "train_runtime": 10837.5902, | |
| "train_samples_per_second": 0.969, | |
| "train_steps_per_second": 0.015 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 164, | |
| "num_input_tokens_seen": 56280078, | |
| "num_train_epochs": 1, | |
| "save_steps": 60, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |