{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 50, "global_step": 164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.77177734375, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 116.989453125, "completions/mean_terminated_length": 79.75372467041015, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.030476190476190476, "grad_norm": 0.044517017900943756, "learning_rate": 5.555555555555555e-07, "loss": 0.0351, "num_tokens": 3786900.0, "reward": 0.1602703720331192, "reward_std": 0.3331510126590729, "rewards/accuracy_reward": 0.02919921875, "rewards/brier_reward": 0.12679073810577393, "rewards/confidence_one_or_zero": 0.11162109375, "rewards/format_reward": 0.16455078125, "rewards/log_2_reward": 1.5036844968795777, "rewards/mean_confidence_reward": 0.10052483528852463, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4662109375, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 100.52607421875, "completions/mean_terminated_length": 77.6335220336914, "completions/min_length": 2.2, "completions/min_terminated_length": 2.2, "epoch": 0.06095238095238095, "grad_norm": 0.05621691420674324, "learning_rate": 1e-06, "loss": 0.0748, "num_tokens": 7405151.0, "reward": 0.4929037868976593, "reward_std": 0.4538139343261719, "rewards/accuracy_reward": 0.1025390625, "rewards/brier_reward": 0.38190129995346067, "rewards/confidence_one_or_zero": 0.20380859375, "rewards/format_reward": 0.5013671875, "rewards/log_2_reward": 4.606710624694824, "rewards/mean_confidence_reward": 0.24942362308502197, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04775390625, "completions/max_length": 128.0, "completions/max_terminated_length": 126.2, "completions/mean_length": 64.80859375, "completions/mean_terminated_length": 61.81011734008789, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.09142857142857143, "grad_norm": 0.007118818815797567, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 10656791.0, "reward": 0.914728832244873, "reward_std": 0.2808063507080078, "rewards/accuracy_reward": 0.1908203125, "rewards/brier_reward": 0.6954732894897461, "rewards/confidence_one_or_zero": 0.298828125, "rewards/format_reward": 0.9431640625, "rewards/log_2_reward": 8.584617424011231, "rewards/mean_confidence_reward": 0.4371844470500946, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00048828125, "completions/max_length": 119.2, "completions/max_terminated_length": 109.2, "completions/mean_length": 43.25625, "completions/mean_terminated_length": 43.215232849121094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1219047619047619, "grad_norm": 0.0038136562798172235, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 13686359.0, "reward": 0.9965023279190064, "reward_std": 0.15809117555618285, "rewards/accuracy_reward": 0.21064453125, "rewards/brier_reward": 0.7860710978507995, "rewards/confidence_one_or_zero": 0.33076171875, "rewards/format_reward": 0.9962890625, "rewards/log_2_reward": 9.255117416381836, "rewards/mean_confidence_reward": 0.3694546759128571, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 110.8, "completions/max_terminated_length": 103.0, "completions/mean_length": 41.52744140625, "completions/mean_terminated_length": 41.51896286010742, "completions/min_length": 21.4, "completions/min_terminated_length": 21.4, "epoch": 0.1523809523809524, "grad_norm": 0.003012983128428459, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 16700880.0, "reward": 1.0461133480072022, "reward_std": 0.10825497806072235, "rewards/accuracy_reward": 0.26142578125, "rewards/brier_reward": 0.8320704221725463, "rewards/confidence_one_or_zero": 0.34912109375, "rewards/format_reward": 0.99873046875, "rewards/log_2_reward": 9.48110408782959, "rewards/mean_confidence_reward": 0.26596843302249906, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.8, "completions/max_terminated_length": 82.8, "completions/mean_length": 39.5283203125, "completions/mean_terminated_length": 39.5283203125, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.18285714285714286, "grad_norm": 0.0036328076384961605, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 19692754.0, "reward": 1.048519992828369, "reward_std": 0.0714333064854145, "rewards/accuracy_reward": 0.271875, "rewards/brier_reward": 0.8257508277893066, "rewards/confidence_one_or_zero": 0.299609375, "rewards/format_reward": 0.9994140625, "rewards/log_2_reward": 9.493891334533691, "rewards/mean_confidence_reward": 0.22483715116977693, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.8, "completions/max_terminated_length": 69.8, "completions/mean_length": 35.08447265625, "completions/mean_terminated_length": 35.08447265625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.21333333333333335, "grad_norm": 0.0022133691236376762, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 22637203.0, "reward": 1.0370203256607056, "reward_std": 0.05964324176311493, "rewards/accuracy_reward": 0.26513671875, "rewards/brier_reward": 0.8091967940330506, "rewards/confidence_one_or_zero": 0.1693359375, "rewards/format_reward": 0.99970703125, "rewards/log_2_reward": 9.430614471435547, "rewards/mean_confidence_reward": 0.2505485266447067, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.2, "completions/max_terminated_length": 66.2, "completions/mean_length": 31.94296875, "completions/mean_terminated_length": 31.94296875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.2438095238095238, "grad_norm": 0.0017201935406774282, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 25555211.0, "reward": 1.0471293449401855, "reward_std": 0.05364943891763687, "rewards/accuracy_reward": 0.3115234375, "rewards/brier_reward": 0.7830281615257263, "rewards/confidence_one_or_zero": 0.04189453125, "rewards/format_reward": 0.99970703125, "rewards/log_2_reward": 9.361624145507813, "rewards/mean_confidence_reward": 0.28543118238449094, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 69.2, "completions/max_terminated_length": 54.0, "completions/mean_length": 30.14716796875, "completions/mean_terminated_length": 30.13763847351074, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.2742857142857143, "grad_norm": 0.0046602096408605576, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 28453838.0, "reward": 1.0581303119659424, "reward_std": 0.04390195086598396, "rewards/accuracy_reward": 0.357421875, "rewards/brier_reward": 0.7590341210365296, "rewards/confidence_one_or_zero": 0.00849609375, "rewards/format_reward": 0.9998046875, "rewards/log_2_reward": 9.308100318908691, "rewards/mean_confidence_reward": 0.28627516627311705, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.2, "completions/max_terminated_length": 47.2, "completions/mean_length": 29.5861328125, "completions/mean_terminated_length": 29.5861328125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3047619047619048, "grad_norm": 0.0013400259194895625, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 31340320.0, "reward": 1.05360004901886, "reward_std": 0.044026906788349154, "rewards/accuracy_reward": 0.3212890625, "rewards/brier_reward": 0.785910964012146, "rewards/confidence_one_or_zero": 0.0009765625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.37942886352539, "rewards/mean_confidence_reward": 0.28688542246818544, "step": 50 }, { "epoch": 0.3047619047619048, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 36.25, "eval_completions/max_terminated_length": 36.25, "eval_completions/mean_length": 29.30576515197754, "eval_completions/mean_terminated_length": 29.30576515197754, "eval_completions/min_length": 27.75, "eval_completions/min_terminated_length": 27.75, "eval_loss": 0.0, "eval_num_tokens": 31340320.0, "eval_reward": 1.0531243979930878, "eval_reward_std": 0.14904534071683884, "eval_rewards/accuracy_reward": 0.32421875, "eval_rewards/brier_reward": 0.7820300608873367, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.371180772781372, "eval_rewards/mean_confidence_reward": 0.29496094584465027, "eval_runtime": 6.873, "eval_samples_per_second": 72.748, "eval_steps_per_second": 0.582, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.6, "completions/max_terminated_length": 48.6, "completions/mean_length": 29.42314453125, "completions/mean_terminated_length": 29.42314453125, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.3352380952380952, "grad_norm": 0.005661148112267256, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 34228909.0, "reward": 1.0620482921600343, "reward_std": 0.03885223716497421, "rewards/accuracy_reward": 0.35185546875, "rewards/brier_reward": 0.7722411632537842, "rewards/confidence_one_or_zero": 0.000390625, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.348680305480958, "rewards/mean_confidence_reward": 0.29889385104179383, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.0, "completions/max_terminated_length": 44.0, "completions/mean_length": 29.43212890625, "completions/mean_terminated_length": 29.43212890625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3657142857142857, "grad_norm": 0.0018649547128006816, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 37120310.0, "reward": 1.0558890104293823, "reward_std": 0.04158634543418884, "rewards/accuracy_reward": 0.340625, "rewards/brier_reward": 0.7711530804634095, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.347337532043458, "rewards/mean_confidence_reward": 0.31556963324546816, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.4, "completions/max_terminated_length": 46.4, "completions/mean_length": 29.49033203125, "completions/mean_terminated_length": 29.49033203125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3961904761904762, "grad_norm": 0.0009951787069439888, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 40010803.0, "reward": 1.0620671272277833, "reward_std": 0.03282982967793942, "rewards/accuracy_reward": 0.33857421875, "rewards/brier_reward": 0.7855600833892822, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.380914878845214, "rewards/mean_confidence_reward": 0.32113115191459657, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 29.60234375, "completions/mean_terminated_length": 29.60234375, "completions/min_length": 26.6, "completions/min_terminated_length": 26.6, "epoch": 0.4266666666666667, "grad_norm": 0.0017366721294820309, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 42903595.0, "reward": 1.0687182664871215, "reward_std": 0.03353095762431622, "rewards/accuracy_reward": 0.36728515625, "rewards/brier_reward": 0.7702489614486694, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.347804069519043, "rewards/mean_confidence_reward": 0.33598633408546447, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 47.6, "completions/max_terminated_length": 47.6, "completions/mean_length": 29.707421875, "completions/mean_terminated_length": 29.707421875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.45714285714285713, "grad_norm": 0.0029517621733248234, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 45793111.0, "reward": 1.0516783237457275, "reward_std": 0.030857810378074647, "rewards/accuracy_reward": 0.30888671875, "rewards/brier_reward": 0.7945676207542419, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.399543380737304, "rewards/mean_confidence_reward": 0.34110644459724426, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 46.6, "completions/max_terminated_length": 46.6, "completions/mean_length": 29.79677734375, "completions/mean_terminated_length": 29.79677734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4876190476190476, "grad_norm": 0.0009430632926523685, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 48682550.0, "reward": 1.073946213722229, "reward_std": 0.03284625560045242, "rewards/accuracy_reward": 0.37626953125, "rewards/brier_reward": 0.7716229557991028, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.351155662536621, "rewards/mean_confidence_reward": 0.344040048122406, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.8, "completions/max_terminated_length": 43.8, "completions/mean_length": 29.8119140625, "completions/mean_terminated_length": 29.8119140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.518095238095238, "grad_norm": 0.0008346997783519328, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 51573360.0, "reward": 1.0541439533233643, "reward_std": 0.030336325988173485, "rewards/accuracy_reward": 0.33095703125, "rewards/brier_reward": 0.7773308038711548, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.363373184204102, "rewards/mean_confidence_reward": 0.36398438215255735, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.2, "completions/max_terminated_length": 43.2, "completions/mean_length": 30.00830078125, "completions/mean_terminated_length": 30.00830078125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5485714285714286, "grad_norm": 0.001722269575111568, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 54467493.0, "reward": 1.0457963228225708, "reward_std": 0.027028150111436843, "rewards/accuracy_reward": 0.2970703125, "rewards/brier_reward": 0.7945223331451416, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.400186538696289, "rewards/mean_confidence_reward": 0.3450214922428131, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.4, "completions/max_terminated_length": 43.4, "completions/mean_length": 30.32822265625, "completions/mean_terminated_length": 30.32822265625, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.579047619047619, "grad_norm": 0.0012162128696218133, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 57363686.0, "reward": 1.0605377197265624, "reward_std": 0.02524040825664997, "rewards/accuracy_reward": 0.3380859375, "rewards/brier_reward": 0.7829894661903382, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.374969482421875, "rewards/mean_confidence_reward": 0.31404297351837157, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.2, "completions/max_terminated_length": 41.2, "completions/mean_length": 30.5470703125, "completions/mean_terminated_length": 30.5470703125, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.6095238095238096, "grad_norm": 0.0015784272691234946, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 60264456.0, "reward": 1.0614788293838502, "reward_std": 0.024412815272808076, "rewards/accuracy_reward": 0.34287109375, "rewards/brier_reward": 0.7800864815711975, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.369251251220703, "rewards/mean_confidence_reward": 0.30150097608566284, "step": 100 }, { "epoch": 0.6095238095238096, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 38.5, "eval_completions/max_terminated_length": 38.5, "eval_completions/mean_length": 30.642173767089844, "eval_completions/mean_terminated_length": 30.642173767089844, "eval_completions/min_length": 27.5, "eval_completions/min_terminated_length": 27.5, "eval_loss": 0.0, "eval_num_tokens": 60264456.0, "eval_reward": 1.0593858063220978, "eval_reward_std": 0.14802763611078262, "eval_rewards/accuracy_reward": 0.3359375, "eval_rewards/brier_reward": 0.7828342020511627, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.374672651290894, "eval_rewards/mean_confidence_reward": 0.3010351434350014, "eval_runtime": 6.9233, "eval_samples_per_second": 72.22, "eval_steps_per_second": 0.578, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 30.66103515625, "completions/mean_terminated_length": 30.66103515625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.64, "grad_norm": 0.0011995319509878755, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 63169977.0, "reward": 1.0606130838394165, "reward_std": 0.026628994569182395, "rewards/accuracy_reward": 0.3349609375, "rewards/brier_reward": 0.7862652897834778, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.382536125183105, "rewards/mean_confidence_reward": 0.30452245473861694, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 43.4, "completions/max_terminated_length": 43.4, "completions/mean_length": 30.59970703125, "completions/mean_terminated_length": 30.59970703125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6704761904761904, "grad_norm": 0.0009227950358763337, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 66071414.0, "reward": 1.051579189300537, "reward_std": 0.021834611520171167, "rewards/accuracy_reward": 0.313671875, "rewards/brier_reward": 0.789486539363861, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.389471054077148, "rewards/mean_confidence_reward": 0.31677929162979124, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 45.6, "completions/max_terminated_length": 45.6, "completions/mean_length": 30.77373046875, "completions/mean_terminated_length": 30.77373046875, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "epoch": 0.700952380952381, "grad_norm": 0.0009503339533694088, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 68973321.0, "reward": 1.0458170413970946, "reward_std": 0.02407231219112873, "rewards/accuracy_reward": 0.3013671875, "rewards/brier_reward": 0.790267014503479, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.389829635620117, "rewards/mean_confidence_reward": 0.318223637342453, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 44.6, "completions/max_terminated_length": 44.6, "completions/mean_length": 30.7498046875, "completions/mean_terminated_length": 30.7498046875, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.7314285714285714, "grad_norm": 0.0011252901749685407, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 71876487.0, "reward": 1.0573019981384277, "reward_std": 0.020349294319748877, "rewards/accuracy_reward": 0.33916015625, "rewards/brier_reward": 0.7754438757896424, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.358468627929687, "rewards/mean_confidence_reward": 0.3264623939990997, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.2, "completions/max_terminated_length": 42.2, "completions/mean_length": 30.65869140625, "completions/mean_terminated_length": 30.65869140625, "completions/min_length": 27.4, "completions/min_terminated_length": 27.4, "epoch": 0.7619047619047619, "grad_norm": 0.0011465527350082994, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 74779456.0, "reward": 1.0477258205413817, "reward_std": 0.019322525709867477, "rewards/accuracy_reward": 0.3068359375, "rewards/brier_reward": 0.7886156797409057, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.386696052551269, "rewards/mean_confidence_reward": 0.3149255871772766, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.0, "completions/max_terminated_length": 42.0, "completions/mean_length": 30.628125, "completions/mean_terminated_length": 30.628125, "completions/min_length": 27.4, "completions/min_terminated_length": 27.4, "epoch": 0.7923809523809524, "grad_norm": 0.00031550656422041357, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 77682176.0, "reward": 1.0575794219970702, "reward_std": 0.017232473567128183, "rewards/accuracy_reward": 0.330078125, "rewards/brier_reward": 0.7850807905197144, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.379909324645997, "rewards/mean_confidence_reward": 0.3171168327331543, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.2, "completions/max_terminated_length": 41.2, "completions/mean_length": 30.61865234375, "completions/mean_terminated_length": 30.61865234375, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "epoch": 0.8228571428571428, "grad_norm": 0.0007638483075425029, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 80578847.0, "reward": 1.046373963356018, "reward_std": 0.0160041656345129, "rewards/accuracy_reward": 0.29697265625, "rewards/brier_reward": 0.795775318145752, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.402251243591309, "rewards/mean_confidence_reward": 0.30945849418640137, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 41.2, "completions/max_terminated_length": 41.2, "completions/mean_length": 30.74453125, "completions/mean_terminated_length": 30.74453125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8533333333333334, "grad_norm": 0.0011101987911388278, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 83481287.0, "reward": 1.0471273899078368, "reward_std": 0.01917775347828865, "rewards/accuracy_reward": 0.305859375, "rewards/brier_reward": 0.7883954048156738, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.386114120483398, "rewards/mean_confidence_reward": 0.30657861828804017, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 42.4, "completions/max_terminated_length": 42.4, "completions/mean_length": 30.97451171875, "completions/mean_terminated_length": 30.97451171875, "completions/min_length": 27.8, "completions/min_terminated_length": 27.8, "epoch": 0.8838095238095238, "grad_norm": 0.00115675397682935, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 86392354.0, "reward": 1.05747447013855, "reward_std": 0.017029477655887602, "rewards/accuracy_reward": 0.3435546875, "rewards/brier_reward": 0.7713942408561707, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.349239540100097, "rewards/mean_confidence_reward": 0.30917675495147706, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.6, "completions/max_terminated_length": 38.6, "completions/mean_length": 31.1416015625, "completions/mean_terminated_length": 31.1416015625, "completions/min_length": 27.6, "completions/min_terminated_length": 27.6, "epoch": 0.9142857142857143, "grad_norm": 0.0005026182625442743, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 89297004.0, "reward": 1.0541616201400756, "reward_std": 0.014724909700453282, "rewards/accuracy_reward": 0.33291015625, "rewards/brier_reward": 0.7755107522010803, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.356867027282714, "rewards/mean_confidence_reward": 0.29771387577056885, "step": 150 }, { "epoch": 0.9142857142857143, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 37.0, "eval_completions/max_terminated_length": 37.0, "eval_completions/mean_length": 31.187903881072998, "eval_completions/mean_terminated_length": 31.187903881072998, "eval_completions/min_length": 28.0, "eval_completions/min_terminated_length": 28.0, "eval_loss": 0.0, "eval_num_tokens": 89297004.0, "eval_reward": 1.0590836107730865, "eval_reward_std": 0.14529624581336975, "eval_rewards/accuracy_reward": 0.337890625, "eval_rewards/brier_reward": 0.7802765667438507, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 9.368727684020996, "eval_rewards/mean_confidence_reward": 0.2976953163743019, "eval_runtime": 6.9998, "eval_samples_per_second": 71.431, "eval_steps_per_second": 0.571, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 40.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 31.29521484375, "completions/mean_terminated_length": 31.29521484375, "completions/min_length": 27.8, "completions/min_terminated_length": 27.8, "epoch": 0.9447619047619048, "grad_norm": 0.0007179967360571027, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 92205211.0, "reward": 1.0541197299957275, "reward_std": 0.013267694972455502, "rewards/accuracy_reward": 0.325390625, "rewards/brier_reward": 0.7828488230705262, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.37431640625, "rewards/mean_confidence_reward": 0.29936914443969725, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 38.0, "completions/max_terminated_length": 38.0, "completions/mean_length": 31.15537109375, "completions/mean_terminated_length": 31.15537109375, "completions/min_length": 27.4, "completions/min_terminated_length": 27.4, "epoch": 0.9752380952380952, "grad_norm": 0.000989082851447165, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 95112946.0, "reward": 1.0628147840499877, "reward_std": 0.015992810018360615, "rewards/accuracy_reward": 0.34912109375, "rewards/brier_reward": 0.7765084505081177, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.361271095275878, "rewards/mean_confidence_reward": 0.3030927777290344, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 39.75, "completions/max_terminated_length": 39.75, "completions/mean_length": 30.896739959716797, "completions/mean_terminated_length": 30.896739959716797, "completions/min_length": 27.75, "completions/min_terminated_length": 27.75, "epoch": 0.9996190476190476, "num_tokens": 97437501.0, "reward": 1.0623335242271423, "reward_std": 0.014995867619290948, "rewards/accuracy_reward": 0.351318359375, "rewards/brier_reward": 0.7733487039804459, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.354329109191895, "rewards/mean_confidence_reward": 0.3190649375319481, "step": 164, "total_flos": 0.0, "train_loss": 0.004139085497216711, "train_runtime": 15544.1305, "train_samples_per_second": 0.675, "train_steps_per_second": 0.011 } ], "logging_steps": 5, "max_steps": 164, "num_input_tokens_seen": 97437501, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }