{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996190476190476, "eval_steps": 50, "global_step": 164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.77080078125, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 116.9482421875, "completions/mean_terminated_length": 79.82200317382812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.030476190476190476, "grad_norm": 0.2547481060028076, "learning_rate": 5.555555555555555e-07, "loss": 0.1961, "num_tokens": 3786478.0, "reward": 0.8699400663375855, "reward_std": 1.7978902339935303, "rewards/accuracy_reward": 0.0283203125, "rewards/brier_reward": 0.12994426488876343, "rewards/confidence_one_or_zero": 0.1083984375, "rewards/format_reward": 0.16884765625, "rewards/log_2_reward": 1.5427121639251709, "rewards/mean_confidence_reward": 0.10250047892332077, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.44853515625, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 98.93623046875, "completions/mean_terminated_length": 76.31569213867188, "completions/min_length": 6.4, "completions/min_terminated_length": 6.4, "epoch": 0.06095238095238095, "grad_norm": 0.9249232411384583, "learning_rate": 1e-06, "loss": 0.3834, "num_tokens": 7388449.0, "reward": 2.7091397285461425, "reward_std": 2.2789249181747437, "rewards/accuracy_reward": 0.107421875, "rewards/brier_reward": 0.3979889452457428, "rewards/confidence_one_or_zero": 0.21123046875, "rewards/format_reward": 0.52158203125, "rewards/log_2_reward": 4.789275646209717, "rewards/mean_confidence_reward": 0.254657456278801, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04111328125, "completions/max_length": 128.0, "completions/max_terminated_length": 126.8, "completions/mean_length": 64.40888671875, "completions/mean_terminated_length": 61.81023941040039, "completions/min_length": 16.6, "completions/min_terminated_length": 16.6, "epoch": 0.09142857142857143, "grad_norm": 0.09482545405626297, "learning_rate": 1e-06, "loss": 0.115, "num_tokens": 10635996.0, "reward": 4.9152580261230465, "reward_std": 1.1983887672424316, "rewards/accuracy_reward": 0.1896484375, "rewards/brier_reward": 0.7029260516166687, "rewards/confidence_one_or_zero": 0.308984375, "rewards/format_reward": 0.94921875, "rewards/log_2_reward": 8.69164867401123, "rewards/mean_confidence_reward": 0.43842514157295226, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 113.0, "completions/max_terminated_length": 100.2, "completions/mean_length": 43.7119140625, "completions/mean_terminated_length": 43.69547882080078, "completions/min_length": 18.8, "completions/min_terminated_length": 18.8, "epoch": 0.1219047619047619, "grad_norm": 0.00744120217859745, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 13670230.0, "reward": 5.309306526184082, "reward_std": 0.32826481461524964, "rewards/accuracy_reward": 0.18349609375, "rewards/brier_reward": 0.8068550109863282, "rewards/confidence_one_or_zero": 0.39267578125, "rewards/format_reward": 0.99794921875, "rewards/log_2_reward": 9.437167739868164, "rewards/mean_confidence_reward": 0.35327613949775694, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 106.8, "completions/max_terminated_length": 98.0, "completions/mean_length": 42.294140625, "completions/mean_terminated_length": 42.28580551147461, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.1523809523809524, "grad_norm": 0.0060423314571380615, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 16692602.0, "reward": 5.426451015472412, "reward_std": 0.18737927973270416, "rewards/accuracy_reward": 0.16845703125, "rewards/brier_reward": 0.8912161231040955, "rewards/confidence_one_or_zero": 0.5732421875, "rewards/format_reward": 0.998828125, "rewards/log_2_reward": 9.68561668395996, "rewards/mean_confidence_reward": 0.21070439517498016, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.4, "completions/max_terminated_length": 76.4, "completions/mean_length": 41.1986328125, "completions/mean_terminated_length": 41.1986328125, "completions/min_length": 23.2, "completions/min_terminated_length": 23.2, "epoch": 0.18285714285714286, "grad_norm": 0.004254295490682125, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 19701580.0, "reward": 5.47640905380249, "reward_std": 0.0742596685886383, "rewards/accuracy_reward": 0.05986328125, "rewards/brier_reward": 0.9633856773376465, "rewards/confidence_one_or_zero": 0.86806640625, "rewards/format_reward": 0.99921875, "rewards/log_2_reward": 9.893736457824707, "rewards/mean_confidence_reward": 0.05211978480219841, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 39.89755859375, "completions/mean_terminated_length": 39.89755859375, "completions/min_length": 23.8, "completions/min_terminated_length": 23.8, "epoch": 0.21333333333333335, "grad_norm": 5.5466916819568723e-05, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 22695315.0, "reward": 5.49622688293457, "reward_std": 0.015401666914112867, "rewards/accuracy_reward": 0.0021484375, "rewards/brier_reward": 0.9974494576454163, "rewards/confidence_one_or_zero": 0.98984375, "rewards/format_reward": 0.99970703125, "rewards/log_2_reward": 9.990598487854005, "rewards/mean_confidence_reward": 0.0037255858958815226, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 39.62392578125, "completions/mean_terminated_length": 39.62392578125, "completions/min_length": 24.8, "completions/min_terminated_length": 24.8, "epoch": 0.2438095238095238, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 25691976.0, "reward": 5.499967384338379, "reward_std": 0.00018490625079721213, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.9999882698059082, "rewards/confidence_one_or_zero": 0.99970703125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.999934768676757, "rewards/mean_confidence_reward": 5.859375087311491e-05, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 39.17685546875, "completions/mean_terminated_length": 39.17685546875, "completions/min_length": 25.2, "completions/min_terminated_length": 25.2, "epoch": 0.2742857142857143, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 28683067.0, "reward": 5.499900531768799, "reward_std": 0.0005625242134556174, "rewards/accuracy_reward": 9.765625e-05, "rewards/brier_reward": 0.9999013662338256, "rewards/confidence_one_or_zero": 0.99970703125, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.999703407287598, "rewards/mean_confidence_reward": 6.83593774738256e-05, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.4, "completions/max_terminated_length": 55.4, "completions/mean_length": 38.91845703125, "completions/mean_terminated_length": 38.91845703125, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.3047619047619048, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 31665112.0, "reward": 5.499433135986328, "reward_std": 0.0030375825241208076, "rewards/accuracy_reward": 9.765625e-05, "rewards/brier_reward": 0.9998398423194885, "rewards/confidence_one_or_zero": 0.99990234375, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.998866271972656, "rewards/mean_confidence_reward": 1.9531250291038305e-05, "step": 50 }, { "epoch": 0.3047619047619048, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 46.5, "eval_completions/max_terminated_length": 46.5, "eval_completions/mean_length": 38.41298484802246, "eval_completions/mean_terminated_length": 38.41298484802246, "eval_completions/min_length": 26.5, "eval_completions/min_terminated_length": 26.5, "eval_loss": 0.0, "eval_num_tokens": 31665112.0, "eval_reward": 5.5, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/brier_reward": 1.0, "eval_rewards/confidence_one_or_zero": 1.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 10.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 8.5224, "eval_samples_per_second": 58.669, "eval_steps_per_second": 0.469, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.8, "completions/max_terminated_length": 57.8, "completions/mean_length": 38.93466796875, "completions/mean_terminated_length": 38.93466796875, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.3352380952380952, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 34651099.0, "reward": 5.4989158630371096, "reward_std": 0.006133038754342124, "rewards/accuracy_reward": 9.765625e-05, "rewards/brier_reward": 0.9997568368911743, "rewards/confidence_one_or_zero": 0.9998046875, "rewards/format_reward": 0.9998046875, "rewards/log_2_reward": 9.99792938232422, "rewards/mean_confidence_reward": 2.9296876164153218e-05, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.4, "completions/max_terminated_length": 58.4, "completions/mean_length": 39.1458984375, "completions/mean_terminated_length": 39.1458984375, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.3657142857142857, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 37641969.0, "reward": 5.499462890625, "reward_std": 0.003038349375128746, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.99990234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.6, "completions/max_terminated_length": 56.6, "completions/mean_length": 39.14443359375, "completions/mean_terminated_length": 39.14443359375, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.3961904761904762, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 40631320.0, "reward": 5.499462890625, "reward_std": 0.003038349375128746, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.99990234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 39.17412109375, "completions/mean_terminated_length": 39.17412109375, "completions/min_length": 24.6, "completions/min_terminated_length": 24.6, "epoch": 0.4266666666666667, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 43622127.0, "reward": 5.499462890625, "reward_std": 0.003038349375128746, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.99990234375, "rewards/confidence_one_or_zero": 0.99990234375, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.4, "completions/max_terminated_length": 60.4, "completions/mean_length": 38.94501953125, "completions/mean_terminated_length": 38.94501953125, "completions/min_length": 25.2, "completions/min_terminated_length": 25.2, "epoch": 0.45714285714285713, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 46606236.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.2, "completions/max_terminated_length": 57.2, "completions/mean_length": 39.04951171875, "completions/mean_terminated_length": 39.04951171875, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.4876190476190476, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 49590423.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.2, "completions/max_terminated_length": 60.2, "completions/mean_length": 39.11123046875, "completions/mean_terminated_length": 39.11123046875, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.518095238095238, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 52576458.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 39.10869140625, "completions/mean_terminated_length": 39.10869140625, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.5485714285714286, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 55563779.0, "reward": 5.499970245361328, "reward_std": 0.00016833491390570997, "rewards/accuracy_reward": 9.765625e-05, "rewards/brier_reward": 0.9999374985694885, "rewards/confidence_one_or_zero": 0.99990234375, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.999842834472656, "rewards/mean_confidence_reward": 1.9531250291038305e-05, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.2, "completions/max_terminated_length": 55.2, "completions/mean_length": 39.05234375, "completions/mean_terminated_length": 39.05234375, "completions/min_length": 26.2, "completions/min_terminated_length": 26.2, "epoch": 0.579047619047619, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 58549307.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.6, "completions/max_terminated_length": 57.6, "completions/mean_length": 39.0751953125, "completions/mean_terminated_length": 39.0751953125, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.6095238095238096, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 61537405.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 100 }, { "epoch": 0.6095238095238096, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 53.25, "eval_completions/max_terminated_length": 53.25, "eval_completions/mean_length": 39.020877838134766, "eval_completions/mean_terminated_length": 39.020877838134766, "eval_completions/min_length": 29.0, "eval_completions/min_terminated_length": 29.0, "eval_loss": 0.0, "eval_num_tokens": 61537405.0, "eval_reward": 5.5, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/brier_reward": 1.0, "eval_rewards/confidence_one_or_zero": 1.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 10.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 8.25, "eval_samples_per_second": 60.606, "eval_steps_per_second": 0.485, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 39.10244140625, "completions/mean_terminated_length": 39.10244140625, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 64529366.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.8, "completions/max_terminated_length": 55.8, "completions/mean_length": 39.0609375, "completions/mean_terminated_length": 39.0609375, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.6704761904761904, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 67517446.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.4, "completions/max_terminated_length": 56.4, "completions/mean_length": 38.946875, "completions/mean_terminated_length": 38.946875, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.700952380952381, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 70503046.0, "reward": 5.499989128112793, "reward_std": 6.163541693240404e-05, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.9999960899353028, "rewards/confidence_one_or_zero": 0.99990234375, "rewards/format_reward": 1.0, "rewards/log_2_reward": 9.999978256225585, "rewards/mean_confidence_reward": 1.9531250291038305e-05, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.4, "completions/max_terminated_length": 58.4, "completions/mean_length": 39.1416015625, "completions/mean_terminated_length": 39.1416015625, "completions/min_length": 26.2, "completions/min_terminated_length": 26.2, "epoch": 0.7314285714285714, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 73492144.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.8, "completions/max_terminated_length": 57.8, "completions/mean_length": 39.1482421875, "completions/mean_terminated_length": 39.1482421875, "completions/min_length": 26.2, "completions/min_terminated_length": 26.2, "epoch": 0.7619047619047619, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 76482046.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 39.20634765625, "completions/mean_terminated_length": 39.20634765625, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.7923809523809524, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 79472607.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.8, "completions/max_terminated_length": 57.8, "completions/mean_length": 38.862890625, "completions/mean_terminated_length": 38.862890625, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.8228571428571428, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 82453699.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.6, "completions/max_terminated_length": 56.6, "completions/mean_length": 39.03369140625, "completions/mean_terminated_length": 39.03369140625, "completions/min_length": 24.8, "completions/min_terminated_length": 24.8, "epoch": 0.8533333333333334, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 85441020.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.4, "completions/max_terminated_length": 57.4, "completions/mean_length": 39.27880859375, "completions/mean_terminated_length": 39.27880859375, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.8838095238095238, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 88437123.0, "reward": 5.499462890625, "reward_std": 0.003038349375128746, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.99990234375, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.99990234375, "rewards/log_2_reward": 9.9990234375, "rewards/mean_confidence_reward": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.4, "completions/max_terminated_length": 56.4, "completions/mean_length": 38.880078125, "completions/mean_terminated_length": 38.880078125, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.9142857142857143, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 91421015.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 150 }, { "epoch": 0.9142857142857143, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 52.5, "eval_completions/max_terminated_length": 52.5, "eval_completions/mean_length": 38.77289867401123, "eval_completions/mean_terminated_length": 38.77289867401123, "eval_completions/min_length": 28.5, "eval_completions/min_terminated_length": 28.5, "eval_loss": 0.0, "eval_num_tokens": 91421015.0, "eval_reward": 5.5, "eval_reward_std": 0.0, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/brier_reward": 1.0, "eval_rewards/confidence_one_or_zero": 1.0, "eval_rewards/format_reward": 1.0, "eval_rewards/log_2_reward": 10.0, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 9.1652, "eval_samples_per_second": 54.554, "eval_steps_per_second": 0.436, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.8, "completions/max_terminated_length": 57.8, "completions/mean_length": 38.95888671875, "completions/mean_terminated_length": 38.95888671875, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.9447619047619048, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 94407698.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.8, "completions/max_terminated_length": 56.8, "completions/mean_length": 39.01162109375, "completions/mean_terminated_length": 39.01162109375, "completions/min_length": 25.8, "completions/min_terminated_length": 25.8, "epoch": 0.9752380952380952, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 97395881.0, "reward": 5.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 1.0, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 1.0, "rewards/log_2_reward": 10.0, "rewards/mean_confidence_reward": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.75, "completions/max_terminated_length": 56.75, "completions/mean_length": 38.78663635253906, "completions/mean_terminated_length": 38.78663635253906, "completions/min_length": 25.75, "completions/min_terminated_length": 25.75, "epoch": 0.9996190476190476, "num_tokens": 99785845.0, "reward": 5.49932861328125, "reward_std": 0.0037979367189109325, "rewards/accuracy_reward": 0.0, "rewards/brier_reward": 0.9998779296875, "rewards/confidence_one_or_zero": 1.0, "rewards/format_reward": 0.9998779296875, "rewards/log_2_reward": 9.998779296875, "rewards/mean_confidence_reward": 0.0, "step": 164, "total_flos": 0.0, "train_loss": 0.021034386144844753, "train_runtime": 15577.6776, "train_samples_per_second": 0.674, "train_steps_per_second": 0.011 } ], "logging_steps": 5, "max_steps": 164, "num_input_tokens_seen": 99785845, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }