| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9984, | |
| "eval_steps": 50, | |
| "global_step": 312, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03251953125, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 1501.4, | |
| "completions/mean_length": 261.44921875, | |
| "completions/mean_terminated_length": 218.59018249511718, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.016, | |
| "grad_norm": 0.04527007043361664, | |
| "learning_rate": 3.1249999999999997e-07, | |
| "loss": 0.0817, | |
| "num_tokens": 17521272.0, | |
| "reward": 0.6406179428100586, | |
| "reward_std": 0.49268757104873656, | |
| "rewards/accuracy_reward": 0.2216796875, | |
| "rewards/brier_reward": 0.37469087839126586, | |
| "rewards/confidence_one_or_zero": 0.26728515625, | |
| "rewards/format_reward": 0.68486328125, | |
| "rewards/mean_confidence_reward": 0.7439393520355224, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03271484375, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 1502.0, | |
| "completions/mean_length": 256.41748046875, | |
| "completions/mean_terminated_length": 213.14424743652344, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.032, | |
| "grad_norm": 0.13754118978977203, | |
| "learning_rate": 6.249999999999999e-07, | |
| "loss": 0.0794, | |
| "num_tokens": 35247339.0, | |
| "reward": 0.6558520913124084, | |
| "reward_std": 0.46240503787994386, | |
| "rewards/accuracy_reward": 0.2123046875, | |
| "rewards/brier_reward": 0.37713180780410765, | |
| "rewards/confidence_one_or_zero": 0.27294921875, | |
| "rewards/format_reward": 0.722265625, | |
| "rewards/mean_confidence_reward": 0.7526058673858642, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.023046875, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 1481.6, | |
| "completions/mean_length": 217.0734375, | |
| "completions/mean_terminated_length": 186.05869750976564, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.048, | |
| "grad_norm": 0.07561526447534561, | |
| "learning_rate": 9.374999999999999e-07, | |
| "loss": 0.074, | |
| "num_tokens": 52518907.0, | |
| "reward": 0.7986767530441284, | |
| "reward_std": 0.38383460640907285, | |
| "rewards/accuracy_reward": 0.26533203125, | |
| "rewards/brier_reward": 0.4729374527931213, | |
| "rewards/confidence_one_or_zero": 0.252734375, | |
| "rewards/format_reward": 0.85908203125, | |
| "rewards/mean_confidence_reward": 0.7597096920013428, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0064453125, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 1263.0, | |
| "completions/mean_length": 151.86923828125, | |
| "completions/mean_terminated_length": 142.8934539794922, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.064, | |
| "grad_norm": 0.016455456614494324, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0127, | |
| "num_tokens": 68992448.0, | |
| "reward": 0.935137140750885, | |
| "reward_std": 0.29728189706802366, | |
| "rewards/accuracy_reward": 0.33779296875, | |
| "rewards/brier_reward": 0.5799404501914978, | |
| "rewards/confidence_one_or_zero": 0.1966796875, | |
| "rewards/format_reward": 0.9525390625, | |
| "rewards/mean_confidence_reward": 0.7396757483482361, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0021484375, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 923.6, | |
| "completions/mean_length": 128.238671875, | |
| "completions/mean_terminated_length": 125.20802764892578, | |
| "completions/min_length": 11.4, | |
| "completions/min_terminated_length": 11.4, | |
| "epoch": 0.08, | |
| "grad_norm": 0.05269403010606766, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 85238764.0, | |
| "reward": 1.0060295939445496, | |
| "reward_std": 0.2248561441898346, | |
| "rewards/accuracy_reward": 0.35712890625, | |
| "rewards/brier_reward": 0.6683077096939087, | |
| "rewards/confidence_one_or_zero": 0.09072265625, | |
| "rewards/format_reward": 0.98662109375, | |
| "rewards/mean_confidence_reward": 0.6470906853675842, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00224609375, | |
| "completions/max_length": 1306.2, | |
| "completions/max_terminated_length": 937.4, | |
| "completions/mean_length": 133.3306640625, | |
| "completions/mean_terminated_length": 130.1755615234375, | |
| "completions/min_length": 10.8, | |
| "completions/min_terminated_length": 10.8, | |
| "epoch": 0.096, | |
| "grad_norm": 0.11787062138319016, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0043, | |
| "num_tokens": 101648678.0, | |
| "reward": 1.0414881467819215, | |
| "reward_std": 0.18446856439113618, | |
| "rewards/accuracy_reward": 0.3625, | |
| "rewards/brier_reward": 0.7302408814430237, | |
| "rewards/confidence_one_or_zero": 0.04970703125, | |
| "rewards/format_reward": 0.990234375, | |
| "rewards/mean_confidence_reward": 0.5119157314300538, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00146484375, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 1072.2, | |
| "completions/mean_length": 143.08466796875, | |
| "completions/mean_terminated_length": 141.0427459716797, | |
| "completions/min_length": 2.6, | |
| "completions/min_terminated_length": 2.6, | |
| "epoch": 0.112, | |
| "grad_norm": 0.03638599067926407, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "num_tokens": 118223337.0, | |
| "reward": 1.0774429082870483, | |
| "reward_std": 0.13774871528148652, | |
| "rewards/accuracy_reward": 0.39619140625, | |
| "rewards/brier_reward": 0.7643576622009277, | |
| "rewards/confidence_one_or_zero": 0.05927734375, | |
| "rewards/format_reward": 0.9943359375, | |
| "rewards/mean_confidence_reward": 0.37070313692092893, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 913.2, | |
| "completions/max_terminated_length": 723.4, | |
| "completions/mean_length": 151.192578125, | |
| "completions/mean_terminated_length": 150.51485290527344, | |
| "completions/min_length": 43.2, | |
| "completions/min_terminated_length": 43.2, | |
| "epoch": 0.128, | |
| "grad_norm": 0.008543482981622219, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "num_tokens": 134688221.0, | |
| "reward": 1.0826910495758058, | |
| "reward_std": 0.10313712060451508, | |
| "rewards/accuracy_reward": 0.405078125, | |
| "rewards/brier_reward": 0.7624517321586609, | |
| "rewards/confidence_one_or_zero": 0.0619140625, | |
| "rewards/format_reward": 0.9978515625, | |
| "rewards/mean_confidence_reward": 0.31029855012893676, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 899.6, | |
| "completions/max_terminated_length": 555.0, | |
| "completions/mean_length": 155.73017578125, | |
| "completions/mean_terminated_length": 155.19070434570312, | |
| "completions/min_length": 57.0, | |
| "completions/min_terminated_length": 57.0, | |
| "epoch": 0.144, | |
| "grad_norm": 0.010719305835664272, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "num_tokens": 151233330.0, | |
| "reward": 1.1194574117660523, | |
| "reward_std": 0.10182622820138931, | |
| "rewards/accuracy_reward": 0.4982421875, | |
| "rewards/brier_reward": 0.7418437123298645, | |
| "rewards/confidence_one_or_zero": 0.04111328125, | |
| "rewards/format_reward": 0.998828125, | |
| "rewards/mean_confidence_reward": 0.3395689487457275, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00078125, | |
| "completions/max_length": 635.4, | |
| "completions/max_terminated_length": 433.6, | |
| "completions/mean_length": 156.94189453125, | |
| "completions/mean_terminated_length": 155.86400146484374, | |
| "completions/min_length": 61.0, | |
| "completions/min_terminated_length": 61.0, | |
| "epoch": 0.16, | |
| "grad_norm": 0.02116214483976364, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0021, | |
| "num_tokens": 167861343.0, | |
| "reward": 1.1079134941101074, | |
| "reward_std": 0.10681741833686828, | |
| "rewards/accuracy_reward": 0.45244140625, | |
| "rewards/brier_reward": 0.7644589900970459, | |
| "rewards/confidence_one_or_zero": 0.02294921875, | |
| "rewards/format_reward": 0.99892578125, | |
| "rewards/mean_confidence_reward": 0.3910071313381195, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 351.75, | |
| "eval_completions/max_terminated_length": 351.75, | |
| "eval_completions/mean_length": 158.1922149658203, | |
| "eval_completions/mean_terminated_length": 158.1922149658203, | |
| "eval_completions/min_length": 87.0, | |
| "eval_completions/min_terminated_length": 87.0, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 167861343.0, | |
| "eval_reward": 1.0662100315093994, | |
| "eval_reward_std": 0.21462075412273407, | |
| "eval_rewards/accuracy_reward": 0.357421875, | |
| "eval_rewards/brier_reward": 0.774997279047966, | |
| "eval_rewards/confidence_one_or_zero": 0.013671875, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.4146093651652336, | |
| "eval_runtime": 21.8666, | |
| "eval_samples_per_second": 22.866, | |
| "eval_steps_per_second": 0.183, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 915.6, | |
| "completions/max_terminated_length": 474.8, | |
| "completions/mean_length": 159.457421875, | |
| "completions/mean_terminated_length": 158.91932067871093, | |
| "completions/min_length": 45.8, | |
| "completions/min_terminated_length": 45.8, | |
| "epoch": 0.176, | |
| "grad_norm": 0.009603990241885185, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 184731307.0, | |
| "reward": 1.1049310684204101, | |
| "reward_std": 0.11344930976629257, | |
| "rewards/accuracy_reward": 0.441796875, | |
| "rewards/brier_reward": 0.769236147403717, | |
| "rewards/confidence_one_or_zero": 0.01845703125, | |
| "rewards/format_reward": 0.998828125, | |
| "rewards/mean_confidence_reward": 0.4387405276298523, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0005859375, | |
| "completions/max_length": 995.2, | |
| "completions/max_terminated_length": 543.0, | |
| "completions/mean_length": 163.6873046875, | |
| "completions/mean_terminated_length": 162.88194580078124, | |
| "completions/min_length": 77.4, | |
| "completions/min_terminated_length": 77.4, | |
| "epoch": 0.192, | |
| "grad_norm": 0.007208758499473333, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0024, | |
| "num_tokens": 201222281.0, | |
| "reward": 1.117084002494812, | |
| "reward_std": 0.11234763264656067, | |
| "rewards/accuracy_reward": 0.4650390625, | |
| "rewards/brier_reward": 0.7699091911315918, | |
| "rewards/confidence_one_or_zero": 0.0126953125, | |
| "rewards/format_reward": 0.99921875, | |
| "rewards/mean_confidence_reward": 0.47076983451843263, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 695.4, | |
| "completions/max_terminated_length": 507.2, | |
| "completions/mean_length": 167.3060546875, | |
| "completions/mean_terminated_length": 166.7700225830078, | |
| "completions/min_length": 62.8, | |
| "completions/min_terminated_length": 62.8, | |
| "epoch": 0.208, | |
| "grad_norm": 0.010833042673766613, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 217967719.0, | |
| "reward": 1.1401428937911988, | |
| "reward_std": 0.11306387037038804, | |
| "rewards/accuracy_reward": 0.5169921875, | |
| "rewards/brier_reward": 0.7639761686325073, | |
| "rewards/confidence_one_or_zero": 0.00966796875, | |
| "rewards/format_reward": 0.99931640625, | |
| "rewards/mean_confidence_reward": 0.48724169135093687, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 730.4, | |
| "completions/max_terminated_length": 518.4, | |
| "completions/mean_length": 166.8869140625, | |
| "completions/mean_terminated_length": 166.7537078857422, | |
| "completions/min_length": 81.8, | |
| "completions/min_terminated_length": 81.8, | |
| "epoch": 0.224, | |
| "grad_norm": 0.00990777276456356, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 234829825.0, | |
| "reward": 1.117465901374817, | |
| "reward_std": 0.11148046851158142, | |
| "rewards/accuracy_reward": 0.4650390625, | |
| "rewards/brier_reward": 0.7703800439834595, | |
| "rewards/confidence_one_or_zero": 0.0109375, | |
| "rewards/format_reward": 0.99951171875, | |
| "rewards/mean_confidence_reward": 0.48857617378234863, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0005859375, | |
| "completions/max_length": 1110.6, | |
| "completions/max_terminated_length": 485.6, | |
| "completions/mean_length": 173.951171875, | |
| "completions/mean_terminated_length": 173.1517333984375, | |
| "completions/min_length": 81.6, | |
| "completions/min_terminated_length": 81.6, | |
| "epoch": 0.24, | |
| "grad_norm": 0.012188726104795933, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0026, | |
| "num_tokens": 251862765.0, | |
| "reward": 1.1460721731185912, | |
| "reward_std": 0.1271799236536026, | |
| "rewards/accuracy_reward": 0.52431640625, | |
| "rewards/brier_reward": 0.7685105323791503, | |
| "rewards/confidence_one_or_zero": 0.00810546875, | |
| "rewards/format_reward": 0.99931640625, | |
| "rewards/mean_confidence_reward": 0.5015208303928376, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 779.0, | |
| "completions/max_terminated_length": 570.2, | |
| "completions/mean_length": 171.87490234375, | |
| "completions/mean_terminated_length": 171.741259765625, | |
| "completions/min_length": 83.8, | |
| "completions/min_terminated_length": 83.8, | |
| "epoch": 0.256, | |
| "grad_norm": 0.00730907404795289, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 268677580.0, | |
| "reward": 1.13645658493042, | |
| "reward_std": 0.11724818050861359, | |
| "rewards/accuracy_reward": 0.49716796875, | |
| "rewards/brier_reward": 0.7759395122528077, | |
| "rewards/confidence_one_or_zero": 0.0126953125, | |
| "rewards/format_reward": 0.9998046875, | |
| "rewards/mean_confidence_reward": 0.5036328196525574, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 863.2, | |
| "completions/max_terminated_length": 460.8, | |
| "completions/mean_length": 177.0607421875, | |
| "completions/mean_terminated_length": 176.3964630126953, | |
| "completions/min_length": 57.6, | |
| "completions/min_terminated_length": 57.6, | |
| "epoch": 0.272, | |
| "grad_norm": 0.009313421323895454, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 285456378.0, | |
| "reward": 1.1312544345855713, | |
| "reward_std": 0.11844458729028702, | |
| "rewards/accuracy_reward": 0.49052734375, | |
| "rewards/brier_reward": 0.7730547547340393, | |
| "rewards/confidence_one_or_zero": 0.0130859375, | |
| "rewards/format_reward": 0.99892578125, | |
| "rewards/mean_confidence_reward": 0.4925888657569885, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0001953125, | |
| "completions/max_length": 890.4, | |
| "completions/max_terminated_length": 446.2, | |
| "completions/mean_length": 173.14130859375, | |
| "completions/mean_terminated_length": 172.87517395019532, | |
| "completions/min_length": 66.2, | |
| "completions/min_terminated_length": 66.2, | |
| "epoch": 0.288, | |
| "grad_norm": 0.021724838763475418, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 302187521.0, | |
| "reward": 1.1318594217300415, | |
| "reward_std": 0.11806258261203766, | |
| "rewards/accuracy_reward": 0.4912109375, | |
| "rewards/brier_reward": 0.7728975296020508, | |
| "rewards/confidence_one_or_zero": 0.01240234375, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.4906367301940918, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0005859375, | |
| "completions/max_length": 1014.6, | |
| "completions/max_terminated_length": 857.8, | |
| "completions/mean_length": 176.462890625, | |
| "completions/mean_terminated_length": 175.66776123046876, | |
| "completions/min_length": 62.4, | |
| "completions/min_terminated_length": 62.4, | |
| "epoch": 0.304, | |
| "grad_norm": 0.006397055462002754, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0018, | |
| "num_tokens": 318924453.0, | |
| "reward": 1.1344104766845704, | |
| "reward_std": 0.11458559930324555, | |
| "rewards/accuracy_reward": 0.4953125, | |
| "rewards/brier_reward": 0.7745817184448243, | |
| "rewards/confidence_one_or_zero": 0.01767578125, | |
| "rewards/format_reward": 0.99892578125, | |
| "rewards/mean_confidence_reward": 0.47802149057388305, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0009765625, | |
| "completions/max_length": 1325.6, | |
| "completions/max_terminated_length": 647.8, | |
| "completions/mean_length": 176.624609375, | |
| "completions/mean_terminated_length": 175.29497375488282, | |
| "completions/min_length": 66.0, | |
| "completions/min_terminated_length": 66.0, | |
| "epoch": 0.32, | |
| "grad_norm": 0.13792400062084198, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 335821793.0, | |
| "reward": 1.1494127273559571, | |
| "reward_std": 0.10425310283899307, | |
| "rewards/accuracy_reward": 0.51708984375, | |
| "rewards/brier_reward": 0.7831019043922425, | |
| "rewards/confidence_one_or_zero": 0.01220703125, | |
| "rewards/format_reward": 0.9986328125, | |
| "rewards/mean_confidence_reward": 0.4873457133769989, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 408.0, | |
| "eval_completions/max_terminated_length": 408.0, | |
| "eval_completions/mean_length": 175.2724952697754, | |
| "eval_completions/mean_terminated_length": 175.2724952697754, | |
| "eval_completions/min_length": 94.75, | |
| "eval_completions/min_terminated_length": 94.75, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 335821793.0, | |
| "eval_reward": 1.0855459570884705, | |
| "eval_reward_std": 0.25649269297719, | |
| "eval_rewards/accuracy_reward": 0.392578125, | |
| "eval_rewards/brier_reward": 0.7785128951072693, | |
| "eval_rewards/confidence_one_or_zero": 0.021484375, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.47496095299720764, | |
| "eval_runtime": 22.7529, | |
| "eval_samples_per_second": 21.975, | |
| "eval_steps_per_second": 0.176, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.001171875, | |
| "completions/max_length": 1462.2, | |
| "completions/max_terminated_length": 1011.0, | |
| "completions/mean_length": 175.45908203125, | |
| "completions/mean_terminated_length": 173.86156005859374, | |
| "completions/min_length": 49.6, | |
| "completions/min_terminated_length": 49.6, | |
| "epoch": 0.336, | |
| "grad_norm": 0.017879005521535873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 352340926.0, | |
| "reward": 1.1430244445800781, | |
| "reward_std": 0.12031411826610565, | |
| "rewards/accuracy_reward": 0.51044921875, | |
| "rewards/brier_reward": 0.7780401229858398, | |
| "rewards/confidence_one_or_zero": 0.015625, | |
| "rewards/format_reward": 0.99755859375, | |
| "rewards/mean_confidence_reward": 0.4848623156547546, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0017578125, | |
| "completions/max_length": 1461.4, | |
| "completions/max_terminated_length": 807.2, | |
| "completions/mean_length": 180.69208984375, | |
| "completions/mean_terminated_length": 178.3111328125, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "epoch": 0.352, | |
| "grad_norm": 0.039632029831409454, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0025, | |
| "num_tokens": 369451629.0, | |
| "reward": 1.1116411447525025, | |
| "reward_std": 0.12863886207342148, | |
| "rewards/accuracy_reward": 0.4513671875, | |
| "rewards/brier_reward": 0.7790430426597595, | |
| "rewards/confidence_one_or_zero": 0.01787109375, | |
| "rewards/format_reward": 0.99287109375, | |
| "rewards/mean_confidence_reward": 0.45101758241653445, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 1182.2, | |
| "completions/max_terminated_length": 745.6, | |
| "completions/mean_length": 183.9287109375, | |
| "completions/mean_terminated_length": 183.26663513183593, | |
| "completions/min_length": 7.2, | |
| "completions/min_terminated_length": 7.2, | |
| "epoch": 0.368, | |
| "grad_norm": 0.03524937480688095, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 386400531.0, | |
| "reward": 1.1271000146865844, | |
| "reward_std": 0.11355644613504409, | |
| "rewards/accuracy_reward": 0.475, | |
| "rewards/brier_reward": 0.7824217438697815, | |
| "rewards/confidence_one_or_zero": 0.01640625, | |
| "rewards/format_reward": 0.99677734375, | |
| "rewards/mean_confidence_reward": 0.465801864862442, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.001171875, | |
| "completions/max_length": 1536.0, | |
| "completions/max_terminated_length": 745.8, | |
| "completions/mean_length": 184.33974609375, | |
| "completions/mean_terminated_length": 182.75419921875, | |
| "completions/min_length": 62.2, | |
| "completions/min_terminated_length": 62.2, | |
| "epoch": 0.384, | |
| "grad_norm": 0.017022427171468735, | |
| "learning_rate": 1e-06, | |
| "loss": 0.004, | |
| "num_tokens": 403144682.0, | |
| "reward": 1.1463651657104492, | |
| "reward_std": 0.12106073200702668, | |
| "rewards/accuracy_reward": 0.506640625, | |
| "rewards/brier_reward": 0.7902879238128662, | |
| "rewards/confidence_one_or_zero": 0.02177734375, | |
| "rewards/format_reward": 0.99580078125, | |
| "rewards/mean_confidence_reward": 0.46812206506729126, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00126953125, | |
| "completions/max_length": 1335.0, | |
| "completions/max_terminated_length": 753.8, | |
| "completions/mean_length": 184.96728515625, | |
| "completions/mean_terminated_length": 183.24875183105468, | |
| "completions/min_length": 60.2, | |
| "completions/min_terminated_length": 60.2, | |
| "epoch": 0.4, | |
| "grad_norm": 0.017970727756619453, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 420075195.0, | |
| "reward": 1.1312126159667968, | |
| "reward_std": 0.12662589848041533, | |
| "rewards/accuracy_reward": 0.48759765625, | |
| "rewards/brier_reward": 0.7792211294174194, | |
| "rewards/confidence_one_or_zero": 0.0115234375, | |
| "rewards/format_reward": 0.99560546875, | |
| "rewards/mean_confidence_reward": 0.5128134965896607, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.001171875, | |
| "completions/max_length": 1323.8, | |
| "completions/max_terminated_length": 799.8, | |
| "completions/mean_length": 190.72177734375, | |
| "completions/mean_terminated_length": 189.14300231933595, | |
| "completions/min_length": 83.2, | |
| "completions/min_terminated_length": 83.2, | |
| "epoch": 0.416, | |
| "grad_norm": 0.01795245334506035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0039, | |
| "num_tokens": 436909370.0, | |
| "reward": 1.1423231840133667, | |
| "reward_std": 0.12082898765802383, | |
| "rewards/accuracy_reward": 0.49794921875, | |
| "rewards/brier_reward": 0.7881609320640564, | |
| "rewards/confidence_one_or_zero": 0.00966796875, | |
| "rewards/format_reward": 0.99853515625, | |
| "rewards/mean_confidence_reward": 0.5266152441501617, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0005859375, | |
| "completions/max_length": 1316.6, | |
| "completions/max_terminated_length": 564.4, | |
| "completions/mean_length": 189.984765625, | |
| "completions/mean_terminated_length": 189.19561157226562, | |
| "completions/min_length": 65.4, | |
| "completions/min_terminated_length": 65.4, | |
| "epoch": 0.432, | |
| "grad_norm": 0.12305645644664764, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 453869150.0, | |
| "reward": 1.167378830909729, | |
| "reward_std": 0.11343645602464676, | |
| "rewards/accuracy_reward": 0.53447265625, | |
| "rewards/brier_reward": 0.8010651707649231, | |
| "rewards/confidence_one_or_zero": 0.00986328125, | |
| "rewards/format_reward": 0.99921875, | |
| "rewards/mean_confidence_reward": 0.5294726729393006, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 1255.2, | |
| "completions/max_terminated_length": 826.8, | |
| "completions/mean_length": 201.4958984375, | |
| "completions/mean_terminated_length": 200.97523803710936, | |
| "completions/min_length": 76.6, | |
| "completions/min_terminated_length": 76.6, | |
| "epoch": 0.448, | |
| "grad_norm": 0.01579131931066513, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0017, | |
| "num_tokens": 470885268.0, | |
| "reward": 1.1469329595565796, | |
| "reward_std": 0.1118901401758194, | |
| "rewards/accuracy_reward": 0.49921875, | |
| "rewards/brier_reward": 0.7957204103469848, | |
| "rewards/confidence_one_or_zero": 0.0072265625, | |
| "rewards/format_reward": 0.99892578125, | |
| "rewards/mean_confidence_reward": 0.5328691601753235, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0001953125, | |
| "completions/max_length": 736.6, | |
| "completions/max_terminated_length": 513.0, | |
| "completions/mean_length": 211.05078125, | |
| "completions/mean_terminated_length": 210.79130859375, | |
| "completions/min_length": 81.8, | |
| "completions/min_terminated_length": 81.8, | |
| "epoch": 0.464, | |
| "grad_norm": 0.011122009716928005, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 488217244.0, | |
| "reward": 1.1156266927719116, | |
| "reward_std": 0.10946736484766006, | |
| "rewards/accuracy_reward": 0.4498046875, | |
| "rewards/brier_reward": 0.781838345527649, | |
| "rewards/confidence_one_or_zero": 0.009765625, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5123164117336273, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0001953125, | |
| "completions/max_length": 999.8, | |
| "completions/max_terminated_length": 586.0, | |
| "completions/mean_length": 209.76787109375, | |
| "completions/mean_terminated_length": 209.50829467773437, | |
| "completions/min_length": 94.0, | |
| "completions/min_terminated_length": 94.0, | |
| "epoch": 0.48, | |
| "grad_norm": 0.011441366747021675, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "num_tokens": 505413299.0, | |
| "reward": 1.1479370832443236, | |
| "reward_std": 0.10615950524806976, | |
| "rewards/accuracy_reward": 0.5056640625, | |
| "rewards/brier_reward": 0.7905020475387573, | |
| "rewards/confidence_one_or_zero": 0.01357421875, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/mean_confidence_reward": 0.49698535799980165, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 454.75, | |
| "eval_completions/max_terminated_length": 454.75, | |
| "eval_completions/mean_length": 213.07852935791016, | |
| "eval_completions/mean_terminated_length": 213.07852935791016, | |
| "eval_completions/min_length": 122.75, | |
| "eval_completions/min_terminated_length": 122.75, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 505413299.0, | |
| "eval_reward": 1.1056718528270721, | |
| "eval_reward_std": 0.2651190534234047, | |
| "eval_rewards/accuracy_reward": 0.41015625, | |
| "eval_rewards/brier_reward": 0.8011865168809891, | |
| "eval_rewards/confidence_one_or_zero": 0.005859375, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.4764648526906967, | |
| "eval_runtime": 25.1582, | |
| "eval_samples_per_second": 19.874, | |
| "eval_steps_per_second": 0.159, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 575.8, | |
| "completions/max_terminated_length": 575.8, | |
| "completions/mean_length": 212.034765625, | |
| "completions/mean_terminated_length": 212.034765625, | |
| "completions/min_length": 104.2, | |
| "completions/min_terminated_length": 104.2, | |
| "epoch": 0.496, | |
| "grad_norm": 0.0033662207424640656, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 522892375.0, | |
| "reward": 1.1667110204696656, | |
| "reward_std": 0.1078746810555458, | |
| "rewards/accuracy_reward": 0.5400390625, | |
| "rewards/brier_reward": 0.7934796333312988, | |
| "rewards/confidence_one_or_zero": 0.01025390625, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.5157265901565552, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0001953125, | |
| "completions/max_length": 924.6, | |
| "completions/max_terminated_length": 532.0, | |
| "completions/mean_length": 210.298828125, | |
| "completions/mean_terminated_length": 210.0409729003906, | |
| "completions/min_length": 97.4, | |
| "completions/min_terminated_length": 97.4, | |
| "epoch": 0.512, | |
| "grad_norm": 0.012034610845148563, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 540191499.0, | |
| "reward": 1.1708500623703002, | |
| "reward_std": 0.11257555186748505, | |
| "rewards/accuracy_reward": 0.5369140625, | |
| "rewards/brier_reward": 0.8050778865814209, | |
| "rewards/confidence_one_or_zero": 0.00703125, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/mean_confidence_reward": 0.5358349800109863, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00029296875, | |
| "completions/max_length": 1004.8, | |
| "completions/max_terminated_length": 661.4, | |
| "completions/mean_length": 207.5166015625, | |
| "completions/mean_terminated_length": 207.12822265625, | |
| "completions/min_length": 95.0, | |
| "completions/min_terminated_length": 95.0, | |
| "epoch": 0.528, | |
| "grad_norm": 0.015975475311279297, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "num_tokens": 557346005.0, | |
| "reward": 1.1701508998870849, | |
| "reward_std": 0.10177138149738311, | |
| "rewards/accuracy_reward": 0.5318359375, | |
| "rewards/brier_reward": 0.8088555216789246, | |
| "rewards/confidence_one_or_zero": 0.00927734375, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5180878877639771, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 768.2, | |
| "completions/max_terminated_length": 551.8, | |
| "completions/mean_length": 209.73466796875, | |
| "completions/mean_terminated_length": 209.60549926757812, | |
| "completions/min_length": 101.0, | |
| "completions/min_terminated_length": 101.0, | |
| "epoch": 0.544, | |
| "grad_norm": 0.0062843854539096355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 574657272.0, | |
| "reward": 1.1808686017990113, | |
| "reward_std": 0.10841628313064575, | |
| "rewards/accuracy_reward": 0.561328125, | |
| "rewards/brier_reward": 0.8007986664772033, | |
| "rewards/confidence_one_or_zero": 0.00556640625, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5436318397521973, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00029296875, | |
| "completions/max_length": 918.4, | |
| "completions/max_terminated_length": 479.4, | |
| "completions/mean_length": 208.92236328125, | |
| "completions/mean_terminated_length": 208.53166809082032, | |
| "completions/min_length": 104.0, | |
| "completions/min_terminated_length": 104.0, | |
| "epoch": 0.56, | |
| "grad_norm": 0.012134869582951069, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 591618045.0, | |
| "reward": 1.1645673036575317, | |
| "reward_std": 0.09899833053350449, | |
| "rewards/accuracy_reward": 0.51962890625, | |
| "rewards/brier_reward": 0.809797728061676, | |
| "rewards/confidence_one_or_zero": 0.0025390625, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/mean_confidence_reward": 0.5343828201293945, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 1103.4, | |
| "completions/max_terminated_length": 482.4, | |
| "completions/mean_length": 208.23701171875, | |
| "completions/mean_terminated_length": 207.72026672363282, | |
| "completions/min_length": 101.0, | |
| "completions/min_terminated_length": 101.0, | |
| "epoch": 0.576, | |
| "grad_norm": 0.014361800625920296, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 608937016.0, | |
| "reward": 1.153262686729431, | |
| "reward_std": 0.09391747117042541, | |
| "rewards/accuracy_reward": 0.50830078125, | |
| "rewards/brier_reward": 0.7986142754554748, | |
| "rewards/confidence_one_or_zero": 0.0078125, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5241601765155792, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 1119.4, | |
| "completions/max_terminated_length": 530.6, | |
| "completions/mean_length": 206.19326171875, | |
| "completions/mean_terminated_length": 205.6750518798828, | |
| "completions/min_length": 102.8, | |
| "completions/min_terminated_length": 102.8, | |
| "epoch": 0.592, | |
| "grad_norm": 0.008724602870643139, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "num_tokens": 626216147.0, | |
| "reward": 1.160974383354187, | |
| "reward_std": 0.0895046427845955, | |
| "rewards/accuracy_reward": 0.51416015625, | |
| "rewards/brier_reward": 0.8081783413887024, | |
| "rewards/confidence_one_or_zero": 0.0060546875, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5143349647521973, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 663.0, | |
| "completions/max_terminated_length": 556.8, | |
| "completions/mean_length": 205.84892578125, | |
| "completions/mean_terminated_length": 205.7189514160156, | |
| "completions/min_length": 100.0, | |
| "completions/min_terminated_length": 100.0, | |
| "epoch": 0.608, | |
| "grad_norm": 0.008571009151637554, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0005, | |
| "num_tokens": 643323528.0, | |
| "reward": 1.170970630645752, | |
| "reward_std": 0.0794813334941864, | |
| "rewards/accuracy_reward": 0.5240234375, | |
| "rewards/brier_reward": 0.8181121468544006, | |
| "rewards/confidence_one_or_zero": 0.00751953125, | |
| "rewards/format_reward": 0.9998046875, | |
| "rewards/mean_confidence_reward": 0.48931640982627866, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 573.0, | |
| "completions/max_terminated_length": 573.0, | |
| "completions/mean_length": 204.29423828125, | |
| "completions/mean_terminated_length": 204.29423828125, | |
| "completions/min_length": 99.4, | |
| "completions/min_terminated_length": 99.4, | |
| "epoch": 0.624, | |
| "grad_norm": 0.006929480005055666, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 660759405.0, | |
| "reward": 1.1707647800445558, | |
| "reward_std": 0.08714393228292465, | |
| "rewards/accuracy_reward": 0.5267578125, | |
| "rewards/brier_reward": 0.8147708535194397, | |
| "rewards/confidence_one_or_zero": 0.00556640625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.4799599587917328, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 876.8, | |
| "completions/max_terminated_length": 465.2, | |
| "completions/mean_length": 204.7708984375, | |
| "completions/mean_terminated_length": 204.12095947265624, | |
| "completions/min_length": 101.4, | |
| "completions/min_terminated_length": 101.4, | |
| "epoch": 0.64, | |
| "grad_norm": 0.006568757817149162, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "num_tokens": 678198947.0, | |
| "reward": 1.188051414489746, | |
| "reward_std": 0.08363442420959473, | |
| "rewards/accuracy_reward": 0.56201171875, | |
| "rewards/brier_reward": 0.8145784258842468, | |
| "rewards/confidence_one_or_zero": 0.0029296875, | |
| "rewards/format_reward": 0.99951171875, | |
| "rewards/mean_confidence_reward": 0.5187148451805115, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 362.5, | |
| "eval_completions/max_terminated_length": 362.5, | |
| "eval_completions/mean_length": 202.62493133544922, | |
| "eval_completions/mean_terminated_length": 202.62493133544922, | |
| "eval_completions/min_length": 116.25, | |
| "eval_completions/min_terminated_length": 116.25, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 678198947.0, | |
| "eval_reward": 1.1124199032783508, | |
| "eval_reward_std": 0.2762097716331482, | |
| "eval_rewards/accuracy_reward": 0.41796875, | |
| "eval_rewards/brier_reward": 0.8068701177835464, | |
| "eval_rewards/confidence_one_or_zero": 0.001953125, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.5002929642796516, | |
| "eval_runtime": 20.9461, | |
| "eval_samples_per_second": 23.871, | |
| "eval_steps_per_second": 0.191, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 1170.6, | |
| "completions/max_terminated_length": 699.2, | |
| "completions/mean_length": 200.96318359375, | |
| "completions/mean_terminated_length": 200.4418518066406, | |
| "completions/min_length": 99.0, | |
| "completions/min_terminated_length": 99.0, | |
| "epoch": 0.656, | |
| "grad_norm": 0.0061631170101463795, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0011, | |
| "num_tokens": 695113354.0, | |
| "reward": 1.1497117042541505, | |
| "reward_std": 0.09324304014444351, | |
| "rewards/accuracy_reward": 0.50517578125, | |
| "rewards/brier_reward": 0.7946373224258423, | |
| "rewards/confidence_one_or_zero": 0.00361328125, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5214970707893372, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 666.0, | |
| "completions/max_terminated_length": 434.4, | |
| "completions/mean_length": 199.86826171875, | |
| "completions/mean_terminated_length": 199.73834228515625, | |
| "completions/min_length": 101.8, | |
| "completions/min_terminated_length": 101.8, | |
| "epoch": 0.672, | |
| "grad_norm": 0.005808599293231964, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0006, | |
| "num_tokens": 712073445.0, | |
| "reward": 1.168108344078064, | |
| "reward_std": 0.08731473982334137, | |
| "rewards/accuracy_reward": 0.5232421875, | |
| "rewards/brier_reward": 0.8130711436271667, | |
| "rewards/confidence_one_or_zero": 0.003515625, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.5128886938095093, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 675.2, | |
| "completions/max_terminated_length": 629.4, | |
| "completions/mean_length": 205.4484375, | |
| "completions/mean_terminated_length": 205.31842651367188, | |
| "completions/min_length": 104.0, | |
| "completions/min_terminated_length": 104.0, | |
| "epoch": 0.688, | |
| "grad_norm": 0.005605604965239763, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 729131157.0, | |
| "reward": 1.1653831958770753, | |
| "reward_std": 0.09237445890903473, | |
| "rewards/accuracy_reward": 0.5251953125, | |
| "rewards/brier_reward": 0.8056677460670472, | |
| "rewards/confidence_one_or_zero": 0.0005859375, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.5064306616783142, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 411.0, | |
| "completions/max_terminated_length": 411.0, | |
| "completions/mean_length": 202.72197265625, | |
| "completions/mean_terminated_length": 202.72197265625, | |
| "completions/min_length": 104.2, | |
| "completions/min_terminated_length": 104.2, | |
| "epoch": 0.704, | |
| "grad_norm": 0.0074613383039832115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 746073174.0, | |
| "reward": 1.176201581954956, | |
| "reward_std": 0.07743649333715438, | |
| "rewards/accuracy_reward": 0.53837890625, | |
| "rewards/brier_reward": 0.8140231966972351, | |
| "rewards/confidence_one_or_zero": 0.00517578125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.5200888633728027, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 656.2, | |
| "completions/max_terminated_length": 656.2, | |
| "completions/mean_length": 202.00751953125, | |
| "completions/mean_terminated_length": 202.00751953125, | |
| "completions/min_length": 99.4, | |
| "completions/min_terminated_length": 99.4, | |
| "epoch": 0.72, | |
| "grad_norm": 0.008289781399071217, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 763151587.0, | |
| "reward": 1.186832022666931, | |
| "reward_std": 0.08442019075155258, | |
| "rewards/accuracy_reward": 0.55576171875, | |
| "rewards/brier_reward": 0.817901360988617, | |
| "rewards/confidence_one_or_zero": 0.0025390625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.5230859518051147, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0005859375, | |
| "completions/max_length": 647.4, | |
| "completions/max_terminated_length": 516.6, | |
| "completions/mean_length": 205.50849609375, | |
| "completions/mean_terminated_length": 204.73146057128906, | |
| "completions/min_length": 97.6, | |
| "completions/min_terminated_length": 97.6, | |
| "epoch": 0.736, | |
| "grad_norm": 0.008065270259976387, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0016, | |
| "num_tokens": 780195578.0, | |
| "reward": 1.1865583658218384, | |
| "reward_std": 0.08369777351617813, | |
| "rewards/accuracy_reward": 0.56474609375, | |
| "rewards/brier_reward": 0.8089555978775025, | |
| "rewards/confidence_one_or_zero": 0.003125, | |
| "rewards/format_reward": 0.9994140625, | |
| "rewards/mean_confidence_reward": 0.5270449399948121, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 606.4, | |
| "completions/max_terminated_length": 606.4, | |
| "completions/mean_length": 206.17548828125, | |
| "completions/mean_terminated_length": 206.17548828125, | |
| "completions/min_length": 102.2, | |
| "completions/min_terminated_length": 102.2, | |
| "epoch": 0.752, | |
| "grad_norm": 0.009517704136669636, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 797534015.0, | |
| "reward": 1.1857917547225951, | |
| "reward_std": 0.08198632448911666, | |
| "rewards/accuracy_reward": 0.5568359375, | |
| "rewards/brier_reward": 0.8151372194290161, | |
| "rewards/confidence_one_or_zero": 0.005859375, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5247080326080322, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.000390625, | |
| "completions/max_length": 1174.4, | |
| "completions/max_terminated_length": 875.4, | |
| "completions/mean_length": 215.8525390625, | |
| "completions/mean_terminated_length": 215.33623962402345, | |
| "completions/min_length": 103.4, | |
| "completions/min_terminated_length": 103.4, | |
| "epoch": 0.768, | |
| "grad_norm": 0.02318732999265194, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 814677049.0, | |
| "reward": 1.1634629249572754, | |
| "reward_std": 0.08838685750961303, | |
| "rewards/accuracy_reward": 0.50859375, | |
| "rewards/brier_reward": 0.8190146684646606, | |
| "rewards/confidence_one_or_zero": 0.00322265625, | |
| "rewards/format_reward": 0.99931640625, | |
| "rewards/mean_confidence_reward": 0.535248053073883, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 467.2, | |
| "completions/max_terminated_length": 467.2, | |
| "completions/mean_length": 216.6044921875, | |
| "completions/mean_terminated_length": 216.6044921875, | |
| "completions/min_length": 108.2, | |
| "completions/min_terminated_length": 108.2, | |
| "epoch": 0.784, | |
| "grad_norm": 0.02482554130256176, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 832069447.0, | |
| "reward": 1.1771643161773682, | |
| "reward_std": 0.09045170843601227, | |
| "rewards/accuracy_reward": 0.55361328125, | |
| "rewards/brier_reward": 0.8010073304176331, | |
| "rewards/confidence_one_or_zero": 0.00498046875, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/mean_confidence_reward": 0.5427734732627869, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 497.2, | |
| "completions/max_terminated_length": 497.2, | |
| "completions/mean_length": 218.06123046875, | |
| "completions/mean_terminated_length": 218.06123046875, | |
| "completions/min_length": 113.6, | |
| "completions/min_terminated_length": 113.6, | |
| "epoch": 0.8, | |
| "grad_norm": 0.005663494113832712, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 849312954.0, | |
| "reward": 1.2057418823242188, | |
| "reward_std": 0.08701288551092148, | |
| "rewards/accuracy_reward": 0.587109375, | |
| "rewards/brier_reward": 0.8243734002113342, | |
| "rewards/confidence_one_or_zero": 0.0033203125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.5567070245742798, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 408.0, | |
| "eval_completions/max_terminated_length": 408.0, | |
| "eval_completions/mean_length": 220.35998153686523, | |
| "eval_completions/mean_terminated_length": 220.35998153686523, | |
| "eval_completions/min_length": 134.5, | |
| "eval_completions/min_terminated_length": 134.5, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 849312954.0, | |
| "eval_reward": 1.118152379989624, | |
| "eval_reward_std": 0.28543028980493546, | |
| "eval_rewards/accuracy_reward": 0.435546875, | |
| "eval_rewards/brier_reward": 0.8007568567991257, | |
| "eval_rewards/confidence_one_or_zero": 0.001953125, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.5190429389476776, | |
| "eval_runtime": 22.0758, | |
| "eval_samples_per_second": 22.649, | |
| "eval_steps_per_second": 0.181, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 425.8, | |
| "completions/max_terminated_length": 425.8, | |
| "completions/mean_length": 218.22822265625, | |
| "completions/mean_terminated_length": 218.22822265625, | |
| "completions/min_length": 110.4, | |
| "completions/min_terminated_length": 110.4, | |
| "epoch": 0.816, | |
| "grad_norm": 0.021623145788908005, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 866646779.0, | |
| "reward": 1.1943198680877685, | |
| "reward_std": 0.08455176651477814, | |
| "rewards/accuracy_reward": 0.5806640625, | |
| "rewards/brier_reward": 0.808072280883789, | |
| "rewards/confidence_one_or_zero": 0.00185546875, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.5697656273841858, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 688.2, | |
| "completions/max_terminated_length": 466.0, | |
| "completions/mean_length": 219.43544921875, | |
| "completions/mean_terminated_length": 219.30695190429688, | |
| "completions/min_length": 112.6, | |
| "completions/min_terminated_length": 112.6, | |
| "epoch": 0.832, | |
| "grad_norm": 0.010291030630469322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 883902150.0, | |
| "reward": 1.1886512756347656, | |
| "reward_std": 0.08867516815662384, | |
| "rewards/accuracy_reward": 0.552734375, | |
| "rewards/brier_reward": 0.8246647596359253, | |
| "rewards/confidence_one_or_zero": 0.001953125, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.5912207126617431, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 444.2, | |
| "completions/max_terminated_length": 444.2, | |
| "completions/mean_length": 219.04970703125, | |
| "completions/mean_terminated_length": 219.04970703125, | |
| "completions/min_length": 118.6, | |
| "completions/min_terminated_length": 118.6, | |
| "epoch": 0.848, | |
| "grad_norm": 0.007072034757584333, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 901159587.0, | |
| "reward": 1.1735345602035523, | |
| "reward_std": 0.08179984986782074, | |
| "rewards/accuracy_reward": 0.53017578125, | |
| "rewards/brier_reward": 0.8168921947479248, | |
| "rewards/confidence_one_or_zero": 0.001171875, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.5902080178260803, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 655.6, | |
| "completions/max_terminated_length": 498.2, | |
| "completions/mean_length": 225.3056640625, | |
| "completions/mean_terminated_length": 225.17782287597657, | |
| "completions/min_length": 124.0, | |
| "completions/min_terminated_length": 124.0, | |
| "epoch": 0.864, | |
| "grad_norm": 0.009159094654023647, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 918453533.0, | |
| "reward": 1.199086856842041, | |
| "reward_std": 0.08750579506158829, | |
| "rewards/accuracy_reward": 0.5861328125, | |
| "rewards/brier_reward": 0.8121374368667602, | |
| "rewards/confidence_one_or_zero": 0.00283203125, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.552389633655548, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 488.2, | |
| "completions/max_terminated_length": 488.2, | |
| "completions/mean_length": 225.61953125, | |
| "completions/mean_terminated_length": 225.61953125, | |
| "completions/min_length": 117.4, | |
| "completions/min_terminated_length": 117.4, | |
| "epoch": 0.88, | |
| "grad_norm": 0.013025188818573952, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 935910949.0, | |
| "reward": 1.1634913206100463, | |
| "reward_std": 0.08096154034137726, | |
| "rewards/accuracy_reward": 0.508984375, | |
| "rewards/brier_reward": 0.8179973006248474, | |
| "rewards/confidence_one_or_zero": 0.001953125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/mean_confidence_reward": 0.5161884605884552, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00087890625, | |
| "completions/max_length": 1493.2, | |
| "completions/max_terminated_length": 1388.4, | |
| "completions/mean_length": 230.47041015625, | |
| "completions/mean_terminated_length": 229.32456970214844, | |
| "completions/min_length": 111.4, | |
| "completions/min_terminated_length": 111.4, | |
| "epoch": 0.896, | |
| "grad_norm": 0.013893580064177513, | |
| "learning_rate": 1e-06, | |
| "loss": 0.002, | |
| "num_tokens": 953381814.0, | |
| "reward": 1.171729063987732, | |
| "reward_std": 0.07993723750114441, | |
| "rewards/accuracy_reward": 0.53193359375, | |
| "rewards/brier_reward": 0.8125000953674316, | |
| "rewards/confidence_one_or_zero": 0.00244140625, | |
| "rewards/format_reward": 0.9990234375, | |
| "rewards/mean_confidence_reward": 0.5306591987609863, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00185546875, | |
| "completions/max_length": 1531.0, | |
| "completions/max_terminated_length": 1445.0, | |
| "completions/mean_length": 230.8126953125, | |
| "completions/mean_terminated_length": 228.377685546875, | |
| "completions/min_length": 82.0, | |
| "completions/min_terminated_length": 82.0, | |
| "epoch": 0.912, | |
| "grad_norm": 0.036593444645404816, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0051, | |
| "num_tokens": 970796632.0, | |
| "reward": 1.1708195447921752, | |
| "reward_std": 0.09443000853061675, | |
| "rewards/accuracy_reward": 0.529296875, | |
| "rewards/brier_reward": 0.8148803591728211, | |
| "rewards/confidence_one_or_zero": 0.00146484375, | |
| "rewards/format_reward": 0.9974609375, | |
| "rewards/mean_confidence_reward": 0.5297421932220459, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 1076.0, | |
| "completions/max_terminated_length": 947.4, | |
| "completions/mean_length": 224.44541015625, | |
| "completions/mean_terminated_length": 223.80473937988282, | |
| "completions/min_length": 112.2, | |
| "completions/min_terminated_length": 112.2, | |
| "epoch": 0.928, | |
| "grad_norm": 0.032313406467437744, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0014, | |
| "num_tokens": 988121769.0, | |
| "reward": 1.1626015663146974, | |
| "reward_std": 0.08359554558992385, | |
| "rewards/accuracy_reward": 0.52421875, | |
| "rewards/brier_reward": 0.8017645239830017, | |
| "rewards/confidence_one_or_zero": 9.765625e-05, | |
| "rewards/format_reward": 0.99921875, | |
| "rewards/mean_confidence_reward": 0.5579541087150574, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 1049.2, | |
| "completions/max_terminated_length": 1025.4, | |
| "completions/mean_length": 229.90673828125, | |
| "completions/mean_terminated_length": 229.7791961669922, | |
| "completions/min_length": 100.6, | |
| "completions/min_terminated_length": 100.6, | |
| "epoch": 0.944, | |
| "grad_norm": 0.03256411850452423, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 1005451438.0, | |
| "reward": 1.1718948125839233, | |
| "reward_std": 0.09756192564964294, | |
| "rewards/accuracy_reward": 0.53125, | |
| "rewards/brier_reward": 0.8127339124679566, | |
| "rewards/confidence_one_or_zero": 0.00029296875, | |
| "rewards/format_reward": 0.9998046875, | |
| "rewards/mean_confidence_reward": 0.5793359398841857, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 664.8, | |
| "completions/max_terminated_length": 664.8, | |
| "completions/mean_length": 237.31884765625, | |
| "completions/mean_terminated_length": 237.31884765625, | |
| "completions/min_length": 110.4, | |
| "completions/min_terminated_length": 110.4, | |
| "epoch": 0.96, | |
| "grad_norm": 0.005703456234186888, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 1022821903.0, | |
| "reward": 1.1647626161575317, | |
| "reward_std": 0.07187836617231369, | |
| "rewards/accuracy_reward": 0.51796875, | |
| "rewards/brier_reward": 0.8118483901023865, | |
| "rewards/confidence_one_or_zero": 0.00029296875, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/mean_confidence_reward": 0.5718408465385437, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 407.0, | |
| "eval_completions/max_terminated_length": 407.0, | |
| "eval_completions/mean_length": 245.04660415649414, | |
| "eval_completions/mean_terminated_length": 245.04660415649414, | |
| "eval_completions/min_length": 155.25, | |
| "eval_completions/min_terminated_length": 155.25, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 1022821903.0, | |
| "eval_reward": 1.12132129073143, | |
| "eval_reward_std": 0.27718404680490494, | |
| "eval_rewards/accuracy_reward": 0.43359375, | |
| "eval_rewards/brier_reward": 0.8090478628873825, | |
| "eval_rewards/confidence_one_or_zero": 0.001953125, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/mean_confidence_reward": 0.508496105670929, | |
| "eval_runtime": 22.3953, | |
| "eval_samples_per_second": 22.326, | |
| "eval_steps_per_second": 0.179, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0001953125, | |
| "completions/max_length": 980.4, | |
| "completions/max_terminated_length": 564.8, | |
| "completions/mean_length": 244.62080078125, | |
| "completions/mean_terminated_length": 244.36865234375, | |
| "completions/min_length": 118.8, | |
| "completions/min_terminated_length": 118.8, | |
| "epoch": 0.976, | |
| "grad_norm": 0.01138161402195692, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0007, | |
| "num_tokens": 1040187940.0, | |
| "reward": 1.1814841508865357, | |
| "reward_std": 0.07296017110347748, | |
| "rewards/accuracy_reward": 0.5513671875, | |
| "rewards/brier_reward": 0.8119907259941102, | |
| "rewards/confidence_one_or_zero": 0.0001953125, | |
| "rewards/format_reward": 0.999609375, | |
| "rewards/mean_confidence_reward": 0.5193847775459289, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 682.0, | |
| "completions/max_terminated_length": 682.0, | |
| "completions/mean_length": 238.13291015625, | |
| "completions/mean_terminated_length": 238.13291015625, | |
| "completions/min_length": 127.2, | |
| "completions/min_terminated_length": 127.2, | |
| "epoch": 0.992, | |
| "grad_norm": 0.006512052845209837, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 1057754901.0, | |
| "reward": 1.1691525459289551, | |
| "reward_std": 0.06929974779486656, | |
| "rewards/accuracy_reward": 0.5232421875, | |
| "rewards/brier_reward": 0.8151596069335938, | |
| "rewards/confidence_one_or_zero": 0.00087890625, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/mean_confidence_reward": 0.4931289255619049, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 559.5, | |
| "completions/max_terminated_length": 559.5, | |
| "completions/mean_length": 235.83663940429688, | |
| "completions/mean_terminated_length": 235.83663940429688, | |
| "completions/min_length": 122.0, | |
| "completions/min_terminated_length": 122.0, | |
| "epoch": 0.9984, | |
| "num_tokens": 1064720670.0, | |
| "reward": 1.1648695468902588, | |
| "reward_std": 0.073847196996212, | |
| "rewards/accuracy_reward": 0.54248046875, | |
| "rewards/brier_reward": 0.787746012210846, | |
| "rewards/confidence_one_or_zero": 0.000244140625, | |
| "rewards/format_reward": 0.99951171875, | |
| "rewards/mean_confidence_reward": 0.5165649354457855, | |
| "step": 312, | |
| "total_flos": 0.0, | |
| "train_loss": 0.005198169167087121, | |
| "train_runtime": 92786.4477, | |
| "train_samples_per_second": 0.216, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 312, | |
| "num_input_tokens_seen": 1064720670, | |
| "num_train_epochs": 1, | |
| "save_steps": 60, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |