| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9996190476190476, | |
| "eval_steps": 50, | |
| "global_step": 164, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.77177734375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 128.0, | |
| "completions/mean_length": 116.989453125, | |
| "completions/mean_terminated_length": 79.75372467041015, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "epoch": 0.030476190476190476, | |
| "grad_norm": 0.044517017900943756, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 0.0351, | |
| "num_tokens": 3786900.0, | |
| "reward": 0.1602703720331192, | |
| "reward_std": 0.3331510126590729, | |
| "rewards/accuracy_reward": 0.02919921875, | |
| "rewards/brier_reward": 0.12679073810577393, | |
| "rewards/confidence_one_or_zero": 0.11162109375, | |
| "rewards/format_reward": 0.16455078125, | |
| "rewards/log_2_reward": 1.5036844968795777, | |
| "rewards/mean_confidence_reward": 0.10052483528852463, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4662109375, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 128.0, | |
| "completions/mean_length": 100.52607421875, | |
| "completions/mean_terminated_length": 77.6335220336914, | |
| "completions/min_length": 2.2, | |
| "completions/min_terminated_length": 2.2, | |
| "epoch": 0.06095238095238095, | |
| "grad_norm": 0.05621691420674324, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0748, | |
| "num_tokens": 7405151.0, | |
| "reward": 0.4929037868976593, | |
| "reward_std": 0.4538139343261719, | |
| "rewards/accuracy_reward": 0.1025390625, | |
| "rewards/brier_reward": 0.38190129995346067, | |
| "rewards/confidence_one_or_zero": 0.20380859375, | |
| "rewards/format_reward": 0.5013671875, | |
| "rewards/log_2_reward": 4.606710624694824, | |
| "rewards/mean_confidence_reward": 0.24942362308502197, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.04775390625, | |
| "completions/max_length": 128.0, | |
| "completions/max_terminated_length": 126.2, | |
| "completions/mean_length": 64.80859375, | |
| "completions/mean_terminated_length": 61.81011734008789, | |
| "completions/min_length": 16.0, | |
| "completions/min_terminated_length": 16.0, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.007118818815797567, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0258, | |
| "num_tokens": 10656791.0, | |
| "reward": 0.914728832244873, | |
| "reward_std": 0.2808063507080078, | |
| "rewards/accuracy_reward": 0.1908203125, | |
| "rewards/brier_reward": 0.6954732894897461, | |
| "rewards/confidence_one_or_zero": 0.298828125, | |
| "rewards/format_reward": 0.9431640625, | |
| "rewards/log_2_reward": 8.584617424011231, | |
| "rewards/mean_confidence_reward": 0.4371844470500946, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00048828125, | |
| "completions/max_length": 119.2, | |
| "completions/max_terminated_length": 109.2, | |
| "completions/mean_length": 43.25625, | |
| "completions/mean_terminated_length": 43.215232849121094, | |
| "completions/min_length": 19.0, | |
| "completions/min_terminated_length": 19.0, | |
| "epoch": 0.1219047619047619, | |
| "grad_norm": 0.0038136562798172235, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 13686359.0, | |
| "reward": 0.9965023279190064, | |
| "reward_std": 0.15809117555618285, | |
| "rewards/accuracy_reward": 0.21064453125, | |
| "rewards/brier_reward": 0.7860710978507995, | |
| "rewards/confidence_one_or_zero": 0.33076171875, | |
| "rewards/format_reward": 0.9962890625, | |
| "rewards/log_2_reward": 9.255117416381836, | |
| "rewards/mean_confidence_reward": 0.3694546759128571, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 110.8, | |
| "completions/max_terminated_length": 103.0, | |
| "completions/mean_length": 41.52744140625, | |
| "completions/mean_terminated_length": 41.51896286010742, | |
| "completions/min_length": 21.4, | |
| "completions/min_terminated_length": 21.4, | |
| "epoch": 0.1523809523809524, | |
| "grad_norm": 0.003012983128428459, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0001, | |
| "num_tokens": 16700880.0, | |
| "reward": 1.0461133480072022, | |
| "reward_std": 0.10825497806072235, | |
| "rewards/accuracy_reward": 0.26142578125, | |
| "rewards/brier_reward": 0.8320704221725463, | |
| "rewards/confidence_one_or_zero": 0.34912109375, | |
| "rewards/format_reward": 0.99873046875, | |
| "rewards/log_2_reward": 9.48110408782959, | |
| "rewards/mean_confidence_reward": 0.26596843302249906, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 82.8, | |
| "completions/max_terminated_length": 82.8, | |
| "completions/mean_length": 39.5283203125, | |
| "completions/mean_terminated_length": 39.5283203125, | |
| "completions/min_length": 21.8, | |
| "completions/min_terminated_length": 21.8, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.0036328076384961605, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 19692754.0, | |
| "reward": 1.048519992828369, | |
| "reward_std": 0.0714333064854145, | |
| "rewards/accuracy_reward": 0.271875, | |
| "rewards/brier_reward": 0.8257508277893066, | |
| "rewards/confidence_one_or_zero": 0.299609375, | |
| "rewards/format_reward": 0.9994140625, | |
| "rewards/log_2_reward": 9.493891334533691, | |
| "rewards/mean_confidence_reward": 0.22483715116977693, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 69.8, | |
| "completions/max_terminated_length": 69.8, | |
| "completions/mean_length": 35.08447265625, | |
| "completions/mean_terminated_length": 35.08447265625, | |
| "completions/min_length": 21.0, | |
| "completions/min_terminated_length": 21.0, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.0022133691236376762, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0002, | |
| "num_tokens": 22637203.0, | |
| "reward": 1.0370203256607056, | |
| "reward_std": 0.05964324176311493, | |
| "rewards/accuracy_reward": 0.26513671875, | |
| "rewards/brier_reward": 0.8091967940330506, | |
| "rewards/confidence_one_or_zero": 0.1693359375, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/log_2_reward": 9.430614471435547, | |
| "rewards/mean_confidence_reward": 0.2505485266447067, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 66.2, | |
| "completions/max_terminated_length": 66.2, | |
| "completions/mean_length": 31.94296875, | |
| "completions/mean_terminated_length": 31.94296875, | |
| "completions/min_length": 24.0, | |
| "completions/min_terminated_length": 24.0, | |
| "epoch": 0.2438095238095238, | |
| "grad_norm": 0.0017201935406774282, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 25555211.0, | |
| "reward": 1.0471293449401855, | |
| "reward_std": 0.05364943891763687, | |
| "rewards/accuracy_reward": 0.3115234375, | |
| "rewards/brier_reward": 0.7830281615257263, | |
| "rewards/confidence_one_or_zero": 0.04189453125, | |
| "rewards/format_reward": 0.99970703125, | |
| "rewards/log_2_reward": 9.361624145507813, | |
| "rewards/mean_confidence_reward": 0.28543118238449094, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 9.765625e-05, | |
| "completions/max_length": 69.2, | |
| "completions/max_terminated_length": 54.0, | |
| "completions/mean_length": 30.14716796875, | |
| "completions/mean_terminated_length": 30.13763847351074, | |
| "completions/min_length": 25.6, | |
| "completions/min_terminated_length": 25.6, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.0046602096408605576, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 28453838.0, | |
| "reward": 1.0581303119659424, | |
| "reward_std": 0.04390195086598396, | |
| "rewards/accuracy_reward": 0.357421875, | |
| "rewards/brier_reward": 0.7590341210365296, | |
| "rewards/confidence_one_or_zero": 0.00849609375, | |
| "rewards/format_reward": 0.9998046875, | |
| "rewards/log_2_reward": 9.308100318908691, | |
| "rewards/mean_confidence_reward": 0.28627516627311705, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 47.2, | |
| "completions/max_terminated_length": 47.2, | |
| "completions/mean_length": 29.5861328125, | |
| "completions/mean_terminated_length": 29.5861328125, | |
| "completions/min_length": 26.0, | |
| "completions/min_terminated_length": 26.0, | |
| "epoch": 0.3047619047619048, | |
| "grad_norm": 0.0013400259194895625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 31340320.0, | |
| "reward": 1.05360004901886, | |
| "reward_std": 0.044026906788349154, | |
| "rewards/accuracy_reward": 0.3212890625, | |
| "rewards/brier_reward": 0.785910964012146, | |
| "rewards/confidence_one_or_zero": 0.0009765625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.37942886352539, | |
| "rewards/mean_confidence_reward": 0.28688542246818544, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3047619047619048, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 36.25, | |
| "eval_completions/max_terminated_length": 36.25, | |
| "eval_completions/mean_length": 29.30576515197754, | |
| "eval_completions/mean_terminated_length": 29.30576515197754, | |
| "eval_completions/min_length": 27.75, | |
| "eval_completions/min_terminated_length": 27.75, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 31340320.0, | |
| "eval_reward": 1.0531243979930878, | |
| "eval_reward_std": 0.14904534071683884, | |
| "eval_rewards/accuracy_reward": 0.32421875, | |
| "eval_rewards/brier_reward": 0.7820300608873367, | |
| "eval_rewards/confidence_one_or_zero": 0.0, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.371180772781372, | |
| "eval_rewards/mean_confidence_reward": 0.29496094584465027, | |
| "eval_runtime": 6.873, | |
| "eval_samples_per_second": 72.748, | |
| "eval_steps_per_second": 0.582, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 48.6, | |
| "completions/max_terminated_length": 48.6, | |
| "completions/mean_length": 29.42314453125, | |
| "completions/mean_terminated_length": 29.42314453125, | |
| "completions/min_length": 26.8, | |
| "completions/min_terminated_length": 26.8, | |
| "epoch": 0.3352380952380952, | |
| "grad_norm": 0.005661148112267256, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 34228909.0, | |
| "reward": 1.0620482921600343, | |
| "reward_std": 0.03885223716497421, | |
| "rewards/accuracy_reward": 0.35185546875, | |
| "rewards/brier_reward": 0.7722411632537842, | |
| "rewards/confidence_one_or_zero": 0.000390625, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.348680305480958, | |
| "rewards/mean_confidence_reward": 0.29889385104179383, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 44.0, | |
| "completions/max_terminated_length": 44.0, | |
| "completions/mean_length": 29.43212890625, | |
| "completions/mean_terminated_length": 29.43212890625, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.0018649547128006816, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 37120310.0, | |
| "reward": 1.0558890104293823, | |
| "reward_std": 0.04158634543418884, | |
| "rewards/accuracy_reward": 0.340625, | |
| "rewards/brier_reward": 0.7711530804634095, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.347337532043458, | |
| "rewards/mean_confidence_reward": 0.31556963324546816, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 46.4, | |
| "completions/max_terminated_length": 46.4, | |
| "completions/mean_length": 29.49033203125, | |
| "completions/mean_terminated_length": 29.49033203125, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.3961904761904762, | |
| "grad_norm": 0.0009951787069439888, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 40010803.0, | |
| "reward": 1.0620671272277833, | |
| "reward_std": 0.03282982967793942, | |
| "rewards/accuracy_reward": 0.33857421875, | |
| "rewards/brier_reward": 0.7855600833892822, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.380914878845214, | |
| "rewards/mean_confidence_reward": 0.32113115191459657, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 48.0, | |
| "completions/max_terminated_length": 48.0, | |
| "completions/mean_length": 29.60234375, | |
| "completions/mean_terminated_length": 29.60234375, | |
| "completions/min_length": 26.6, | |
| "completions/min_terminated_length": 26.6, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.0017366721294820309, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 42903595.0, | |
| "reward": 1.0687182664871215, | |
| "reward_std": 0.03353095762431622, | |
| "rewards/accuracy_reward": 0.36728515625, | |
| "rewards/brier_reward": 0.7702489614486694, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.347804069519043, | |
| "rewards/mean_confidence_reward": 0.33598633408546447, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 47.6, | |
| "completions/max_terminated_length": 47.6, | |
| "completions/mean_length": 29.707421875, | |
| "completions/mean_terminated_length": 29.707421875, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.0029517621733248234, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 45793111.0, | |
| "reward": 1.0516783237457275, | |
| "reward_std": 0.030857810378074647, | |
| "rewards/accuracy_reward": 0.30888671875, | |
| "rewards/brier_reward": 0.7945676207542419, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.399543380737304, | |
| "rewards/mean_confidence_reward": 0.34110644459724426, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 46.6, | |
| "completions/max_terminated_length": 46.6, | |
| "completions/mean_length": 29.79677734375, | |
| "completions/mean_terminated_length": 29.79677734375, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.4876190476190476, | |
| "grad_norm": 0.0009430632926523685, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 48682550.0, | |
| "reward": 1.073946213722229, | |
| "reward_std": 0.03284625560045242, | |
| "rewards/accuracy_reward": 0.37626953125, | |
| "rewards/brier_reward": 0.7716229557991028, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.351155662536621, | |
| "rewards/mean_confidence_reward": 0.344040048122406, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 43.8, | |
| "completions/max_terminated_length": 43.8, | |
| "completions/mean_length": 29.8119140625, | |
| "completions/mean_terminated_length": 29.8119140625, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.518095238095238, | |
| "grad_norm": 0.0008346997783519328, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 51573360.0, | |
| "reward": 1.0541439533233643, | |
| "reward_std": 0.030336325988173485, | |
| "rewards/accuracy_reward": 0.33095703125, | |
| "rewards/brier_reward": 0.7773308038711548, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.363373184204102, | |
| "rewards/mean_confidence_reward": 0.36398438215255735, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 43.2, | |
| "completions/max_terminated_length": 43.2, | |
| "completions/mean_length": 30.00830078125, | |
| "completions/mean_terminated_length": 30.00830078125, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.001722269575111568, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 54467493.0, | |
| "reward": 1.0457963228225708, | |
| "reward_std": 0.027028150111436843, | |
| "rewards/accuracy_reward": 0.2970703125, | |
| "rewards/brier_reward": 0.7945223331451416, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.400186538696289, | |
| "rewards/mean_confidence_reward": 0.3450214922428131, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 43.4, | |
| "completions/max_terminated_length": 43.4, | |
| "completions/mean_length": 30.32822265625, | |
| "completions/mean_terminated_length": 30.32822265625, | |
| "completions/min_length": 26.8, | |
| "completions/min_terminated_length": 26.8, | |
| "epoch": 0.579047619047619, | |
| "grad_norm": 0.0012162128696218133, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 57363686.0, | |
| "reward": 1.0605377197265624, | |
| "reward_std": 0.02524040825664997, | |
| "rewards/accuracy_reward": 0.3380859375, | |
| "rewards/brier_reward": 0.7829894661903382, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.374969482421875, | |
| "rewards/mean_confidence_reward": 0.31404297351837157, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 41.2, | |
| "completions/max_terminated_length": 41.2, | |
| "completions/mean_length": 30.5470703125, | |
| "completions/mean_terminated_length": 30.5470703125, | |
| "completions/min_length": 26.8, | |
| "completions/min_terminated_length": 26.8, | |
| "epoch": 0.6095238095238096, | |
| "grad_norm": 0.0015784272691234946, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 60264456.0, | |
| "reward": 1.0614788293838502, | |
| "reward_std": 0.024412815272808076, | |
| "rewards/accuracy_reward": 0.34287109375, | |
| "rewards/brier_reward": 0.7800864815711975, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.369251251220703, | |
| "rewards/mean_confidence_reward": 0.30150097608566284, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6095238095238096, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 38.5, | |
| "eval_completions/max_terminated_length": 38.5, | |
| "eval_completions/mean_length": 30.642173767089844, | |
| "eval_completions/mean_terminated_length": 30.642173767089844, | |
| "eval_completions/min_length": 27.5, | |
| "eval_completions/min_terminated_length": 27.5, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 60264456.0, | |
| "eval_reward": 1.0593858063220978, | |
| "eval_reward_std": 0.14802763611078262, | |
| "eval_rewards/accuracy_reward": 0.3359375, | |
| "eval_rewards/brier_reward": 0.7828342020511627, | |
| "eval_rewards/confidence_one_or_zero": 0.0, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.374672651290894, | |
| "eval_rewards/mean_confidence_reward": 0.3010351434350014, | |
| "eval_runtime": 6.9233, | |
| "eval_samples_per_second": 72.22, | |
| "eval_steps_per_second": 0.578, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 42.0, | |
| "completions/max_terminated_length": 42.0, | |
| "completions/mean_length": 30.66103515625, | |
| "completions/mean_terminated_length": 30.66103515625, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.64, | |
| "grad_norm": 0.0011995319509878755, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 63169977.0, | |
| "reward": 1.0606130838394165, | |
| "reward_std": 0.026628994569182395, | |
| "rewards/accuracy_reward": 0.3349609375, | |
| "rewards/brier_reward": 0.7862652897834778, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.382536125183105, | |
| "rewards/mean_confidence_reward": 0.30452245473861694, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 43.4, | |
| "completions/max_terminated_length": 43.4, | |
| "completions/mean_length": 30.59970703125, | |
| "completions/mean_terminated_length": 30.59970703125, | |
| "completions/min_length": 27.0, | |
| "completions/min_terminated_length": 27.0, | |
| "epoch": 0.6704761904761904, | |
| "grad_norm": 0.0009227950358763337, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 66071414.0, | |
| "reward": 1.051579189300537, | |
| "reward_std": 0.021834611520171167, | |
| "rewards/accuracy_reward": 0.313671875, | |
| "rewards/brier_reward": 0.789486539363861, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.389471054077148, | |
| "rewards/mean_confidence_reward": 0.31677929162979124, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 45.6, | |
| "completions/max_terminated_length": 45.6, | |
| "completions/mean_length": 30.77373046875, | |
| "completions/mean_terminated_length": 30.77373046875, | |
| "completions/min_length": 27.6, | |
| "completions/min_terminated_length": 27.6, | |
| "epoch": 0.700952380952381, | |
| "grad_norm": 0.0009503339533694088, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 68973321.0, | |
| "reward": 1.0458170413970946, | |
| "reward_std": 0.02407231219112873, | |
| "rewards/accuracy_reward": 0.3013671875, | |
| "rewards/brier_reward": 0.790267014503479, | |
| "rewards/confidence_one_or_zero": 9.765625e-05, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.389829635620117, | |
| "rewards/mean_confidence_reward": 0.318223637342453, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 44.6, | |
| "completions/max_terminated_length": 44.6, | |
| "completions/mean_length": 30.7498046875, | |
| "completions/mean_terminated_length": 30.7498046875, | |
| "completions/min_length": 26.8, | |
| "completions/min_terminated_length": 26.8, | |
| "epoch": 0.7314285714285714, | |
| "grad_norm": 0.0011252901749685407, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 71876487.0, | |
| "reward": 1.0573019981384277, | |
| "reward_std": 0.020349294319748877, | |
| "rewards/accuracy_reward": 0.33916015625, | |
| "rewards/brier_reward": 0.7754438757896424, | |
| "rewards/confidence_one_or_zero": 0.0001953125, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.358468627929687, | |
| "rewards/mean_confidence_reward": 0.3264623939990997, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 42.2, | |
| "completions/max_terminated_length": 42.2, | |
| "completions/mean_length": 30.65869140625, | |
| "completions/mean_terminated_length": 30.65869140625, | |
| "completions/min_length": 27.4, | |
| "completions/min_terminated_length": 27.4, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.0011465527350082994, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 74779456.0, | |
| "reward": 1.0477258205413817, | |
| "reward_std": 0.019322525709867477, | |
| "rewards/accuracy_reward": 0.3068359375, | |
| "rewards/brier_reward": 0.7886156797409057, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.386696052551269, | |
| "rewards/mean_confidence_reward": 0.3149255871772766, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 42.0, | |
| "completions/max_terminated_length": 42.0, | |
| "completions/mean_length": 30.628125, | |
| "completions/mean_terminated_length": 30.628125, | |
| "completions/min_length": 27.4, | |
| "completions/min_terminated_length": 27.4, | |
| "epoch": 0.7923809523809524, | |
| "grad_norm": 0.00031550656422041357, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 77682176.0, | |
| "reward": 1.0575794219970702, | |
| "reward_std": 0.017232473567128183, | |
| "rewards/accuracy_reward": 0.330078125, | |
| "rewards/brier_reward": 0.7850807905197144, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.379909324645997, | |
| "rewards/mean_confidence_reward": 0.3171168327331543, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 41.2, | |
| "completions/max_terminated_length": 41.2, | |
| "completions/mean_length": 30.61865234375, | |
| "completions/mean_terminated_length": 30.61865234375, | |
| "completions/min_length": 27.6, | |
| "completions/min_terminated_length": 27.6, | |
| "epoch": 0.8228571428571428, | |
| "grad_norm": 0.0007638483075425029, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 80578847.0, | |
| "reward": 1.046373963356018, | |
| "reward_std": 0.0160041656345129, | |
| "rewards/accuracy_reward": 0.29697265625, | |
| "rewards/brier_reward": 0.795775318145752, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.402251243591309, | |
| "rewards/mean_confidence_reward": 0.30945849418640137, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 41.2, | |
| "completions/max_terminated_length": 41.2, | |
| "completions/mean_length": 30.74453125, | |
| "completions/mean_terminated_length": 30.74453125, | |
| "completions/min_length": 28.0, | |
| "completions/min_terminated_length": 28.0, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.0011101987911388278, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 83481287.0, | |
| "reward": 1.0471273899078368, | |
| "reward_std": 0.01917775347828865, | |
| "rewards/accuracy_reward": 0.305859375, | |
| "rewards/brier_reward": 0.7883954048156738, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.386114120483398, | |
| "rewards/mean_confidence_reward": 0.30657861828804017, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 42.4, | |
| "completions/max_terminated_length": 42.4, | |
| "completions/mean_length": 30.97451171875, | |
| "completions/mean_terminated_length": 30.97451171875, | |
| "completions/min_length": 27.8, | |
| "completions/min_terminated_length": 27.8, | |
| "epoch": 0.8838095238095238, | |
| "grad_norm": 0.00115675397682935, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 86392354.0, | |
| "reward": 1.05747447013855, | |
| "reward_std": 0.017029477655887602, | |
| "rewards/accuracy_reward": 0.3435546875, | |
| "rewards/brier_reward": 0.7713942408561707, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.349239540100097, | |
| "rewards/mean_confidence_reward": 0.30917675495147706, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 38.6, | |
| "completions/max_terminated_length": 38.6, | |
| "completions/mean_length": 31.1416015625, | |
| "completions/mean_terminated_length": 31.1416015625, | |
| "completions/min_length": 27.6, | |
| "completions/min_terminated_length": 27.6, | |
| "epoch": 0.9142857142857143, | |
| "grad_norm": 0.0005026182625442743, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 89297004.0, | |
| "reward": 1.0541616201400756, | |
| "reward_std": 0.014724909700453282, | |
| "rewards/accuracy_reward": 0.33291015625, | |
| "rewards/brier_reward": 0.7755107522010803, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 0.99990234375, | |
| "rewards/log_2_reward": 9.356867027282714, | |
| "rewards/mean_confidence_reward": 0.29771387577056885, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9142857142857143, | |
| "eval_completions/clipped_ratio": 0.0, | |
| "eval_completions/max_length": 37.0, | |
| "eval_completions/max_terminated_length": 37.0, | |
| "eval_completions/mean_length": 31.187903881072998, | |
| "eval_completions/mean_terminated_length": 31.187903881072998, | |
| "eval_completions/min_length": 28.0, | |
| "eval_completions/min_terminated_length": 28.0, | |
| "eval_loss": 0.0, | |
| "eval_num_tokens": 89297004.0, | |
| "eval_reward": 1.0590836107730865, | |
| "eval_reward_std": 0.14529624581336975, | |
| "eval_rewards/accuracy_reward": 0.337890625, | |
| "eval_rewards/brier_reward": 0.7802765667438507, | |
| "eval_rewards/confidence_one_or_zero": 0.0, | |
| "eval_rewards/format_reward": 1.0, | |
| "eval_rewards/log_2_reward": 9.368727684020996, | |
| "eval_rewards/mean_confidence_reward": 0.2976953163743019, | |
| "eval_runtime": 6.9998, | |
| "eval_samples_per_second": 71.431, | |
| "eval_steps_per_second": 0.571, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 40.0, | |
| "completions/max_terminated_length": 40.0, | |
| "completions/mean_length": 31.29521484375, | |
| "completions/mean_terminated_length": 31.29521484375, | |
| "completions/min_length": 27.8, | |
| "completions/min_terminated_length": 27.8, | |
| "epoch": 0.9447619047619048, | |
| "grad_norm": 0.0007179967360571027, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 92205211.0, | |
| "reward": 1.0541197299957275, | |
| "reward_std": 0.013267694972455502, | |
| "rewards/accuracy_reward": 0.325390625, | |
| "rewards/brier_reward": 0.7828488230705262, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.37431640625, | |
| "rewards/mean_confidence_reward": 0.29936914443969725, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 38.0, | |
| "completions/max_terminated_length": 38.0, | |
| "completions/mean_length": 31.15537109375, | |
| "completions/mean_terminated_length": 31.15537109375, | |
| "completions/min_length": 27.4, | |
| "completions/min_terminated_length": 27.4, | |
| "epoch": 0.9752380952380952, | |
| "grad_norm": 0.000989082851447165, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "num_tokens": 95112946.0, | |
| "reward": 1.0628147840499877, | |
| "reward_std": 0.015992810018360615, | |
| "rewards/accuracy_reward": 0.34912109375, | |
| "rewards/brier_reward": 0.7765084505081177, | |
| "rewards/confidence_one_or_zero": 9.765625e-05, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.361271095275878, | |
| "rewards/mean_confidence_reward": 0.3030927777290344, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 39.75, | |
| "completions/max_terminated_length": 39.75, | |
| "completions/mean_length": 30.896739959716797, | |
| "completions/mean_terminated_length": 30.896739959716797, | |
| "completions/min_length": 27.75, | |
| "completions/min_terminated_length": 27.75, | |
| "epoch": 0.9996190476190476, | |
| "num_tokens": 97437501.0, | |
| "reward": 1.0623335242271423, | |
| "reward_std": 0.014995867619290948, | |
| "rewards/accuracy_reward": 0.351318359375, | |
| "rewards/brier_reward": 0.7733487039804459, | |
| "rewards/confidence_one_or_zero": 0.0, | |
| "rewards/format_reward": 1.0, | |
| "rewards/log_2_reward": 9.354329109191895, | |
| "rewards/mean_confidence_reward": 0.3190649375319481, | |
| "step": 164, | |
| "total_flos": 0.0, | |
| "train_loss": 0.004139085497216711, | |
| "train_runtime": 15544.1305, | |
| "train_samples_per_second": 0.675, | |
| "train_steps_per_second": 0.011 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 164, | |
| "num_input_tokens_seen": 97437501, | |
| "num_train_epochs": 1, | |
| "save_steps": 60, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |