{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.006, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06598061177646741, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003923534415662289, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 102665.0, "reward": 2.355022430419922, "reward_std": 0.3552054464817047, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.501227617263794, "rewards/rollout_reward_func/std": 0.18640437722206116, "sampling/importance_sampling_ratio/max": 1.0961512327194214, "sampling/importance_sampling_ratio/mean": 0.9703092575073242, "sampling/importance_sampling_ratio/min": 0.5060414671897888, "sampling/sampling_logp_difference/max": 0.6756159067153931, "sampling/sampling_logp_difference/mean": 0.0183907151222229, "step": 1, "step_time": 29.075429260999726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06598061177646741, "epoch": 4e-05, "grad_norm": 0.003917683847248554, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0, "step": 2, "step_time": 11.468670933999988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05838002988048174, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004170146305114031, "kl": 0.0014184596652553338, "learning_rate": 4.571428571428571e-07, "loss": 0.0, "num_tokens": 205842.0, "reward": 2.2323365211486816, "reward_std": 0.41563019156455994, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.970565140247345, "rewards/probe_shaping_dominance/std": 0.11582481861114502, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.5944784879684448, "rewards/rollout_reward_func/std": 0.19796565175056458, "sampling/importance_sampling_ratio/max": 1.4160258769989014, "sampling/importance_sampling_ratio/mean": 1.0286931991577148, "sampling/importance_sampling_ratio/min": 0.8523033857345581, "sampling/sampling_logp_difference/max": 0.34715062379837036, "sampling/sampling_logp_difference/mean": 0.01565416157245636, "step": 3, "step_time": 26.976024578999954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.06273448248884961, "epoch": 8e-05, "grad_norm": 0.0025308942422270775, "kl": 0.004324701569430545, "learning_rate": 6.857142857142857e-07, "loss": 0.0, "step": 4, "step_time": 12.765090235000116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06146455561975017, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 0.010321667417883873, "kl": 0.005618094519680539, "learning_rate": 9.142857142857142e-07, "loss": 0.0, "num_tokens": 303571.0, "reward": 2.236471176147461, "reward_std": 0.5468828678131104, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9439389705657959, "rewards/probe_shaping_dominance/std": 0.15084654092788696, "rewards/probe_terminal_raw/mean": 0.0625, "rewards/probe_terminal_raw/std": 0.16800537705421448, "rewards/rollout_reward_func/mean": -0.5324676036834717, "rewards/rollout_reward_func/std": 0.24024422466754913, "sampling/importance_sampling_ratio/max": 1.3134887218475342, "sampling/importance_sampling_ratio/mean": 0.9676171541213989, "sampling/importance_sampling_ratio/min": 0.41273218393325806, "sampling/sampling_logp_difference/max": 0.8849565982818604, "sampling/sampling_logp_difference/mean": 0.026659058406949043, "step": 5, "step_time": 26.660728665999955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.06810211515403353, "epoch": 0.00012, "grad_norm": 0.007714552339166403, "kl": 0.0028154569756466685, "learning_rate": 1.1428571428571428e-06, "loss": 0.0, "step": 6, "step_time": 11.44768754599977 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04115386162811774, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 0.003323962911963463, "kl": 0.001360555283525855, "learning_rate": 1.3714285714285715e-06, "loss": 0.0, "num_tokens": 410424.0, "reward": 2.2917943000793457, "reward_std": 0.44559940695762634, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9733562469482422, "rewards/probe_shaping_dominance/std": 0.10957542806863785, "rewards/probe_terminal_raw/mean": 0.027566056698560715, "rewards/probe_terminal_raw/std": 0.10949952900409698, "rewards/rollout_reward_func/mean": -0.5341278314590454, "rewards/rollout_reward_func/std": 0.27136242389678955, "sampling/importance_sampling_ratio/max": 1.0618572235107422, "sampling/importance_sampling_ratio/mean": 0.9585317969322205, "sampling/importance_sampling_ratio/min": 0.2324376255273819, "sampling/sampling_logp_difference/max": 1.470571756362915, "sampling/sampling_logp_difference/mean": 0.02589060366153717, "step": 7, "step_time": 27.5007078120002 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.04836703218097682, "epoch": 0.00016, "grad_norm": 0.005415800027549267, "kl": 0.001694043724171479, "learning_rate": 1.6e-06, "loss": 0.0, "step": 8, "step_time": 12.223170772000117 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07847535189284827, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.010716816410422325, "kl": 0.004078912243130617, "learning_rate": 1.8285714285714284e-06, "loss": -0.0, "num_tokens": 511562.0, "reward": 2.3355042934417725, "reward_std": 0.43706634640693665, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9753304123878479, "rewards/probe_shaping_dominance/std": 0.0984005331993103, "rewards/probe_terminal_raw/mean": 0.026295732706785202, "rewards/probe_terminal_raw/std": 0.10541322082281113, "rewards/rollout_reward_func/mean": -0.553621768951416, "rewards/rollout_reward_func/std": 0.20992274582386017, "sampling/importance_sampling_ratio/max": 2.0806119441986084, "sampling/importance_sampling_ratio/mean": 1.0222396850585938, "sampling/importance_sampling_ratio/min": 0.5085986256599426, "sampling/sampling_logp_difference/max": 0.7373225688934326, "sampling/sampling_logp_difference/mean": 0.028744252398610115, "step": 9, "step_time": 26.567318749999913 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.07049791459576227, "epoch": 0.0002, "grad_norm": 0.004469083622097969, "kl": 0.026501665124972873, "learning_rate": 2.057142857142857e-06, "loss": -0.0, "step": 10, "step_time": 11.64468052799998 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05360435344118741, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 0.008809314109385014, "kl": 0.004907062985087585, "learning_rate": 2.2857142857142856e-06, "loss": -0.0, "num_tokens": 616201.0, "reward": 2.4397201538085938, "reward_std": 0.5087255239486694, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.9706827998161316, "rewards/probe_shaping_dominance/std": 0.12301044911146164, "rewards/probe_terminal_raw/mean": 0.025406504049897194, "rewards/probe_terminal_raw/std": 0.10275532305240631, "rewards/rollout_reward_func/mean": -0.5063689351081848, "rewards/rollout_reward_func/std": 0.27631497383117676, "sampling/importance_sampling_ratio/max": 1.1329089403152466, "sampling/importance_sampling_ratio/mean": 0.9933090806007385, "sampling/importance_sampling_ratio/min": 0.768523633480072, "sampling/sampling_logp_difference/max": 0.2632848620414734, "sampling/sampling_logp_difference/mean": 0.007937189191579819, "step": 11, "step_time": 27.777191968999887 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.056653952024134924, "epoch": 0.00024, "grad_norm": 0.005528156645596027, "kl": 0.0032436020156101364, "learning_rate": 2.5142857142857142e-06, "loss": -0.0, "step": 12, "step_time": 11.55833436900025 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.038703071273630485, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.007530162110924721, "kl": 0.09287417630221206, "learning_rate": 2.742857142857143e-06, "loss": -0.0, "num_tokens": 724364.0, "reward": 2.351245880126953, "reward_std": 0.4424680173397064, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9703531265258789, "rewards/probe_shaping_dominance/std": 0.11669508367776871, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.47535717487335205, "rewards/rollout_reward_func/std": 0.24601998925209045, "sampling/importance_sampling_ratio/max": 1.440869688987732, "sampling/importance_sampling_ratio/mean": 1.0093717575073242, "sampling/importance_sampling_ratio/min": 0.7920892238616943, "sampling/sampling_logp_difference/max": 0.3652459681034088, "sampling/sampling_logp_difference/mean": 0.008522224612534046, "step": 13, "step_time": 27.311093626999764 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.0453864433511626, "epoch": 0.00028, "grad_norm": 0.006435270421206951, "kl": 0.010504724175871893, "learning_rate": 2.9714285714285716e-06, "loss": -0.0, "step": 14, "step_time": 11.88281524700028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.037373697148723295, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 0.006931572686880827, "kl": 0.0013499163329698805, "learning_rate": 3.2e-06, "loss": -0.0, "num_tokens": 828160.0, "reward": 2.3457703590393066, "reward_std": 0.32655069231987, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9890751838684082, "rewards/probe_shaping_dominance/std": 0.06179998442530632, "rewards/probe_terminal_raw/mean": 0.01092479657381773, "rewards/probe_terminal_raw/std": 0.06179998070001602, "rewards/rollout_reward_func/mean": -0.5417294502258301, "rewards/rollout_reward_func/std": 0.19428227841854095, "sampling/importance_sampling_ratio/max": 1.5512616634368896, "sampling/importance_sampling_ratio/mean": 1.0071200132369995, "sampling/importance_sampling_ratio/min": 0.7788013219833374, "sampling/sampling_logp_difference/max": 0.43915224075317383, "sampling/sampling_logp_difference/mean": 0.008885648101568222, "step": 15, "step_time": 27.3166221219999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03605123230954632, "epoch": 0.00032, "grad_norm": 0.007162998430430889, "kl": 0.0005329122045578671, "learning_rate": 3.428571428571428e-06, "loss": -0.0, "step": 16, "step_time": 12.10146868100037 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04818721191259101, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020011591259390116, "kl": 0.000880227197208705, "learning_rate": 3.657142857142857e-06, "loss": 0.0, "num_tokens": 933852.0, "reward": 2.2396738529205322, "reward_std": 0.3769412934780121, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9811877012252808, "rewards/probe_shaping_dominance/std": 0.07417813688516617, "rewards/probe_terminal_raw/mean": 0.019435975700616837, "rewards/probe_terminal_raw/std": 0.07648143172264099, "rewards/rollout_reward_func/mean": -0.554699718952179, "rewards/rollout_reward_func/std": 0.14253978431224823, "sampling/importance_sampling_ratio/max": 1.3911144733428955, "sampling/importance_sampling_ratio/mean": 1.0014019012451172, "sampling/importance_sampling_ratio/min": 0.647373378276825, "sampling/sampling_logp_difference/max": 0.4348297119140625, "sampling/sampling_logp_difference/mean": 0.01693439856171608, "step": 17, "step_time": 27.466287271999818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.059825868549523875, "epoch": 0.00036, "grad_norm": 0.00400462094694376, "kl": 0.0010442571770683529, "learning_rate": 3.885714285714286e-06, "loss": 0.0, "step": 18, "step_time": 11.729434232999665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07400128486915492, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.004407655913382769, "kl": 0.0058712156430829054, "learning_rate": 4.114285714285714e-06, "loss": -0.0, "num_tokens": 1040669.0, "reward": 2.3979897499084473, "reward_std": 0.3378089666366577, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9721384644508362, "rewards/probe_shaping_dominance/std": 0.10964522510766983, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.4928986430168152, "rewards/rollout_reward_func/std": 0.280559241771698, "sampling/importance_sampling_ratio/max": 1.2489417791366577, "sampling/importance_sampling_ratio/mean": 0.9779143333435059, "sampling/importance_sampling_ratio/min": 0.5380392670631409, "sampling/sampling_logp_difference/max": 0.619827151298523, "sampling/sampling_logp_difference/mean": 0.017949596047401428, "step": 19, "step_time": 28.172511434999933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.07153313838352915, "epoch": 0.0004, "grad_norm": 0.010058136656880379, "kl": 0.01704683385832595, "learning_rate": 4.342857142857142e-06, "loss": -0.0, "step": 20, "step_time": 11.798744088000149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07958520320244133, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 0.015031801536679268, "kl": 0.02134023218428638, "learning_rate": 4.571428571428571e-06, "loss": 0.0, "num_tokens": 1146440.0, "reward": 2.2259719371795654, "reward_std": 0.4264923334121704, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9849475622177124, "rewards/probe_shaping_dominance/std": 0.08514932543039322, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.505850613117218, "rewards/rollout_reward_func/std": 0.22946372628211975, "sampling/importance_sampling_ratio/max": 1.8730424642562866, "sampling/importance_sampling_ratio/mean": 1.0450382232666016, "sampling/importance_sampling_ratio/min": 0.6261028051376343, "sampling/sampling_logp_difference/max": 0.6275629997253418, "sampling/sampling_logp_difference/mean": 0.033233314752578735, "step": 21, "step_time": 27.33938806800029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08325157128274441, "epoch": 0.00044, "grad_norm": 0.01334489043802023, "kl": 0.01684667149083907, "learning_rate": 4.8e-06, "loss": 0.0, "step": 22, "step_time": 11.840854454999999 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06853798200609162, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.01755128987133503, "kl": 0.003467819899402258, "learning_rate": 5.0285714285714285e-06, "loss": 0.0001, "num_tokens": 1248638.0, "reward": 2.270667552947998, "reward_std": 0.47502174973487854, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.978266716003418, "rewards/probe_shaping_dominance/std": 0.08902076631784439, "rewards/probe_terminal_raw/mean": 0.025406504049897194, "rewards/probe_terminal_raw/std": 0.10275533050298691, "rewards/rollout_reward_func/mean": -0.495505690574646, "rewards/rollout_reward_func/std": 0.24283160269260406, "sampling/importance_sampling_ratio/max": 2.039003610610962, "sampling/importance_sampling_ratio/mean": 1.0263185501098633, "sampling/importance_sampling_ratio/min": 0.6725395321846008, "sampling/sampling_logp_difference/max": 0.8136651515960693, "sampling/sampling_logp_difference/mean": 0.02945869043469429, "step": 23, "step_time": 27.97098964299971 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.0757814844546374, "epoch": 0.00048, "grad_norm": 0.02817094884812832, "kl": 0.009625433100154623, "learning_rate": 5.257142857142857e-06, "loss": 0.0001, "step": 24, "step_time": 11.866423993000353 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.046443949002423324, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 0.002482979791238904, "kl": 0.011937914369631542, "learning_rate": 5.485714285714286e-06, "loss": -0.0, "num_tokens": 1348967.0, "reward": 2.4115562438964844, "reward_std": 0.4029836654663086, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.987568199634552, "rewards/probe_shaping_dominance/std": 0.07032480090856552, "rewards/probe_terminal_raw/mean": 0.011941056698560715, "rewards/probe_terminal_raw/std": 0.06754881888628006, "rewards/rollout_reward_func/mean": -0.4754529595375061, "rewards/rollout_reward_func/std": 0.20119507610797882, "sampling/importance_sampling_ratio/max": 1.2200837135314941, "sampling/importance_sampling_ratio/mean": 0.9975783824920654, "sampling/importance_sampling_ratio/min": 0.8279879689216614, "sampling/sampling_logp_difference/max": 0.1989191770553589, "sampling/sampling_logp_difference/mean": 0.011062754318118095, "step": 25, "step_time": 26.57660025700011 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.048878253597649746, "epoch": 0.00052, "grad_norm": 0.009242719039320946, "kl": 0.008345632606265863, "learning_rate": 5.7142857142857145e-06, "loss": -0.0, "step": 26, "step_time": 11.446816336000438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03407000357401557, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032159958500415087, "kl": 0.0009551170151098631, "learning_rate": 5.942857142857143e-06, "loss": 0.0001, "num_tokens": 1454840.0, "reward": 2.308957099914551, "reward_std": 0.35809147357940674, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9919047951698303, "rewards/probe_shaping_dominance/std": 0.04579342529177666, "rewards/probe_terminal_raw/mean": 0.00825711339712143, "rewards/probe_terminal_raw/std": 0.04670928418636322, "rewards/rollout_reward_func/mean": -0.4849545955657959, "rewards/rollout_reward_func/std": 0.17723596096038818, "sampling/importance_sampling_ratio/max": 1.3277825117111206, "sampling/importance_sampling_ratio/mean": 1.03197181224823, "sampling/importance_sampling_ratio/min": 0.9784432053565979, "sampling/sampling_logp_difference/max": 0.2835111618041992, "sampling/sampling_logp_difference/mean": 0.010589659214019775, "step": 27, "step_time": 27.828797529999974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.03843506844714284, "epoch": 0.00056, "grad_norm": 0.001164909452199936, "kl": 0.0005121690442896343, "learning_rate": 6.171428571428571e-06, "loss": 0.0001, "step": 28, "step_time": 11.809285704000104 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04930314904777333, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 0.002748744795098901, "kl": 0.004907883932952495, "learning_rate": 6.4e-06, "loss": -0.0, "num_tokens": 1556979.0, "reward": 2.240399122238159, "reward_std": 0.4602973461151123, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9860905408859253, "rewards/probe_shaping_dominance/std": 0.0786839947104454, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5863164663314819, "rewards/rollout_reward_func/std": 0.2140309065580368, "sampling/importance_sampling_ratio/max": 1.2453359365463257, "sampling/importance_sampling_ratio/mean": 0.9654719233512878, "sampling/importance_sampling_ratio/min": 0.4166664183139801, "sampling/sampling_logp_difference/max": 0.8754727840423584, "sampling/sampling_logp_difference/mean": 0.023819994181394577, "step": 29, "step_time": 26.907328863000203 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.04787830199347809, "epoch": 0.0006, "grad_norm": 0.004575630649924278, "kl": 0.021033072499267114, "learning_rate": 6.628571428571428e-06, "loss": -0.0, "step": 30, "step_time": 12.03838489500049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06083334801951423, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.013260968960821629, "kl": 0.0185297402889546, "learning_rate": 6.857142857142856e-06, "loss": 0.0001, "num_tokens": 1662740.0, "reward": 2.1973555088043213, "reward_std": 0.43850135803222656, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.991719663143158, "rewards/probe_shaping_dominance/std": 0.046840641647577286, "rewards/probe_terminal_raw/mean": 0.008384146727621555, "rewards/probe_terminal_raw/std": 0.04742789641022682, "rewards/rollout_reward_func/mean": -0.5964983701705933, "rewards/rollout_reward_func/std": 0.296856164932251, "sampling/importance_sampling_ratio/max": 2.8883938789367676, "sampling/importance_sampling_ratio/mean": 1.041499376296997, "sampling/importance_sampling_ratio/min": 0.611585795879364, "sampling/sampling_logp_difference/max": 0.9767682552337646, "sampling/sampling_logp_difference/mean": 0.02332986891269684, "step": 31, "step_time": 27.415389096000126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03750000149011612, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000149011612, "entropy": 0.06045454426202923, "epoch": 0.00064, "grad_norm": 0.014426084235310555, "kl": 0.027800074360129656, "learning_rate": 7.085714285714285e-06, "loss": 0.0001, "step": 32, "step_time": 11.844893076999824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0422610079695005, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.005128172226250172, "kl": 0.009348716392499568, "learning_rate": 7.314285714285714e-06, "loss": 0.0, "num_tokens": 1765521.0, "reward": 2.3525331020355225, "reward_std": 0.3403870165348053, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9837720394134521, "rewards/probe_shaping_dominance/std": 0.06571495532989502, "rewards/probe_terminal_raw/mean": 0.01880081370472908, "rewards/probe_terminal_raw/std": 0.0745616927742958, "rewards/rollout_reward_func/mean": -0.5375398397445679, "rewards/rollout_reward_func/std": 0.22309184074401855, "sampling/importance_sampling_ratio/max": 1.275700569152832, "sampling/importance_sampling_ratio/mean": 0.994273841381073, "sampling/importance_sampling_ratio/min": 0.600629448890686, "sampling/sampling_logp_difference/max": 0.5097755193710327, "sampling/sampling_logp_difference/mean": 0.011872323229908943, "step": 33, "step_time": 27.277823512999475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.0630603444587905, "epoch": 0.00068, "grad_norm": 0.007451063022017479, "kl": 0.007260499390742581, "learning_rate": 7.542857142857142e-06, "loss": 0.0, "step": 34, "step_time": 12.15706381699988 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06165817377041094, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.01730046235024929, "kl": 0.007911830088153327, "learning_rate": 7.771428571428572e-06, "loss": 0.0, "num_tokens": 1868519.0, "reward": 2.275172233581543, "reward_std": 0.48706814646720886, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.42121174931526184, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9609175324440002, "rewards/probe_shaping_dominance/std": 0.12447085976600647, "rewards/probe_terminal_raw/mean": 0.042174797505140305, "rewards/probe_terminal_raw/std": 0.13503843545913696, "rewards/rollout_reward_func/mean": -0.552919864654541, "rewards/rollout_reward_func/std": 0.20079734921455383, "sampling/importance_sampling_ratio/max": 2.4695143699645996, "sampling/importance_sampling_ratio/mean": 1.0170851945877075, "sampling/importance_sampling_ratio/min": 0.5358201861381531, "sampling/sampling_logp_difference/max": 0.9040230512619019, "sampling/sampling_logp_difference/mean": 0.023447973653674126, "step": 35, "step_time": 26.740296546999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.058829510177019984, "epoch": 0.00072, "grad_norm": 0.0026921494863927364, "kl": 0.008077224918185522, "learning_rate": 8e-06, "loss": 0.0, "step": 36, "step_time": 11.526741372999822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0314667156167161, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024028760381042957, "kl": 0.00625098004627489, "learning_rate": 7.999999998518522e-06, "loss": -0.0, "num_tokens": 1970124.0, "reward": 2.264838933944702, "reward_std": 0.5270799994468689, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9424034953117371, "rewards/probe_shaping_dominance/std": 0.13680323958396912, "rewards/probe_terminal_raw/mean": 0.05856199190020561, "rewards/probe_terminal_raw/std": 0.1405627578496933, "rewards/rollout_reward_func/mean": -0.4673765003681183, "rewards/rollout_reward_func/std": 0.2097388207912445, "sampling/importance_sampling_ratio/max": 1.8680520057678223, "sampling/importance_sampling_ratio/mean": 1.0426936149597168, "sampling/importance_sampling_ratio/min": 0.9883837103843689, "sampling/sampling_logp_difference/max": 0.6248946189880371, "sampling/sampling_logp_difference/mean": 0.012692131102085114, "step": 37, "step_time": 26.3523716899997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.0313223133775864, "epoch": 0.00076, "grad_norm": 0.0023324843496084213, "kl": 0.0035868614445746516, "learning_rate": 7.99999999407409e-06, "loss": -0.0, "step": 38, "step_time": 12.628685679 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05816701124422252, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 0.007494654040783644, "kl": 0.03421914212867705, "learning_rate": 7.999999986666703e-06, "loss": -0.0, "num_tokens": 2076598.0, "reward": 2.311230182647705, "reward_std": 0.36618658900260925, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9932495951652527, "rewards/probe_shaping_dominance/std": 0.03818599134683609, "rewards/probe_terminal_raw/mean": 0.00889227632433176, "rewards/probe_terminal_raw/std": 0.05030231550335884, "rewards/rollout_reward_func/mean": -0.6096617579460144, "rewards/rollout_reward_func/std": 0.20722205936908722, "sampling/importance_sampling_ratio/max": 1.4155004024505615, "sampling/importance_sampling_ratio/mean": 0.9876462817192078, "sampling/importance_sampling_ratio/min": 0.7839126586914062, "sampling/sampling_logp_difference/max": 0.3471514582633972, "sampling/sampling_logp_difference/mean": 0.0168665312230587, "step": 39, "step_time": 26.542540336999764 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.057803097704891115, "epoch": 0.0008, "grad_norm": 0.004047502297908068, "kl": 0.02604524488651805, "learning_rate": 7.99999997629636e-06, "loss": -0.0, "step": 40, "step_time": 11.67055183600064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.030615816707722843, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 0.002531230915337801, "kl": 0.0002023791248291218, "learning_rate": 7.999999962963062e-06, "loss": 0.0, "num_tokens": 2182025.0, "reward": 2.3659095764160156, "reward_std": 0.3363305926322937, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.970511794090271, "rewards/probe_shaping_dominance/std": 0.11608950048685074, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.49210208654403687, "rewards/rollout_reward_func/std": 0.19491301476955414, "sampling/importance_sampling_ratio/max": 1.0795150995254517, "sampling/importance_sampling_ratio/mean": 1.0009956359863281, "sampling/importance_sampling_ratio/min": 0.9117990136146545, "sampling/sampling_logp_difference/max": 0.09234827756881714, "sampling/sampling_logp_difference/mean": 0.004786844830960035, "step": 41, "step_time": 26.691086626000242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.029701224237214774, "epoch": 0.00084, "grad_norm": 0.0024189443793147802, "kl": 0.0002964178702313802, "learning_rate": 7.999999946666809e-06, "loss": 0.0, "step": 42, "step_time": 12.699684607000108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05056236406426251, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 0.01209098007529974, "kl": 0.010471812368450628, "learning_rate": 7.999999927407602e-06, "loss": -0.0, "num_tokens": 2286142.0, "reward": 2.469311237335205, "reward_std": 0.4115804135799408, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.2540002465248108, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9852168560028076, "rewards/probe_shaping_dominance/std": 0.0836259201169014, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.48153066635131836, "rewards/rollout_reward_func/std": 0.24715669453144073, "sampling/importance_sampling_ratio/max": 2.0913164615631104, "sampling/importance_sampling_ratio/mean": 1.0417256355285645, "sampling/importance_sampling_ratio/min": 0.8711547255516052, "sampling/sampling_logp_difference/max": 0.7377924919128418, "sampling/sampling_logp_difference/mean": 0.016645925119519234, "step": 43, "step_time": 26.97034319699992 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.06137717463207082, "epoch": 0.00088, "grad_norm": 0.004214904736727476, "kl": 0.02022934940032428, "learning_rate": 7.99999990518544e-06, "loss": -0.0, "step": 44, "step_time": 11.70073124500027 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028125000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0973230431554839, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 0.008132712915539742, "kl": 0.012426901788174405, "learning_rate": 7.999999880000322e-06, "loss": 0.0, "num_tokens": 2390804.0, "reward": 2.2431583404541016, "reward_std": 0.5248546600341797, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9669345021247864, "rewards/probe_shaping_dominance/std": 0.10819793492555618, "rewards/probe_terminal_raw/mean": 0.038998983800411224, "rewards/probe_terminal_raw/std": 0.1286177635192871, "rewards/rollout_reward_func/mean": -0.4940252900123596, "rewards/rollout_reward_func/std": 0.255024790763855, "sampling/importance_sampling_ratio/max": 1.6163866519927979, "sampling/importance_sampling_ratio/mean": 0.9977768659591675, "sampling/importance_sampling_ratio/min": 0.3879617154598236, "sampling/sampling_logp_difference/max": 0.9467527270317078, "sampling/sampling_logp_difference/mean": 0.02932477556169033, "step": 45, "step_time": 26.472695325999894 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043750000186264515, "entropy": 0.09132259455509484, "epoch": 0.00092, "grad_norm": 0.004103749990463257, "kl": 0.02156046110090415, "learning_rate": 7.99999985185225e-06, "loss": 0.0, "step": 46, "step_time": 12.17020241299997 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08302483463194221, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 0.00962382648140192, "kl": 0.05296483388110573, "learning_rate": 7.999999820741223e-06, "loss": 0.0, "num_tokens": 2498950.0, "reward": 2.3484296798706055, "reward_std": 0.40232396125793457, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9905115365982056, "rewards/probe_shaping_dominance/std": 0.05367483198642731, "rewards/probe_terminal_raw/mean": 0.009019308723509312, "rewards/probe_terminal_raw/std": 0.05102091282606125, "rewards/rollout_reward_func/mean": -0.507351279258728, "rewards/rollout_reward_func/std": 0.22662682831287384, "sampling/importance_sampling_ratio/max": 1.3692384958267212, "sampling/importance_sampling_ratio/mean": 0.9901071786880493, "sampling/importance_sampling_ratio/min": 0.3076327443122864, "sampling/sampling_logp_difference/max": 1.179471731185913, "sampling/sampling_logp_difference/mean": 0.03242562711238861, "step": 47, "step_time": 26.895124169999463 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.07248554454417899, "epoch": 0.00096, "grad_norm": 0.01555224135518074, "kl": 0.039988372170228104, "learning_rate": 7.99999978666724e-06, "loss": -0.0, "step": 48, "step_time": 11.803917615999808 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06659889499132987, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 0.007047319784760475, "kl": 0.038143942947499454, "learning_rate": 7.999999749630303e-06, "loss": 0.0001, "num_tokens": 2605752.0, "reward": 2.304872512817383, "reward_std": 0.4004109501838684, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5201276540756226, "rewards/rollout_reward_func/std": 0.2584696114063263, "sampling/importance_sampling_ratio/max": 2.615042209625244, "sampling/importance_sampling_ratio/mean": 1.0269113779067993, "sampling/importance_sampling_ratio/min": 0.39808669686317444, "sampling/sampling_logp_difference/max": 0.9612793922424316, "sampling/sampling_logp_difference/mean": 0.03832431882619858, "step": 49, "step_time": 26.91781551100007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.0618492451030761, "epoch": 0.001, "grad_norm": 0.00791104231029749, "kl": 0.05557279207035515, "learning_rate": 7.999999709630412e-06, "loss": 0.0001, "step": 50, "step_time": 12.788009578999208 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05631835470558144, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038132003974169493, "kl": 0.029594353904632498, "learning_rate": 7.999999666667564e-06, "loss": 0.0, "num_tokens": 2707257.0, "reward": 2.346804618835449, "reward_std": 0.2936249077320099, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.571945309638977, "rewards/rollout_reward_func/std": 0.23333650827407837, "sampling/importance_sampling_ratio/max": 1.6730494499206543, "sampling/importance_sampling_ratio/mean": 0.9981693029403687, "sampling/importance_sampling_ratio/min": 0.40917959809303284, "sampling/sampling_logp_difference/max": 0.9063196182250977, "sampling/sampling_logp_difference/mean": 0.024803204461932182, "step": 51, "step_time": 26.73268852599972 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.05670045691658743, "epoch": 0.00104, "grad_norm": 0.003768681548535824, "kl": 0.030258090482694455, "learning_rate": 7.999999620741765e-06, "loss": 0.0, "step": 52, "step_time": 11.579914525999584 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1101932916790247, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030547163914889097, "kl": 0.01951221001081649, "learning_rate": 7.999999571853009e-06, "loss": 0.0, "num_tokens": 2811393.0, "reward": 2.1927480697631836, "reward_std": 0.406143456697464, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9884125590324402, "rewards/probe_shaping_dominance/std": 0.0655483826994896, "rewards/probe_terminal_raw/mean": 0.01143292710185051, "rewards/probe_terminal_raw/std": 0.06467439979314804, "rewards/rollout_reward_func/mean": -0.5695973038673401, "rewards/rollout_reward_func/std": 0.16589799523353577, "sampling/importance_sampling_ratio/max": 1.0527032613754272, "sampling/importance_sampling_ratio/mean": 0.9693626165390015, "sampling/importance_sampling_ratio/min": 0.5484977960586548, "sampling/sampling_logp_difference/max": 0.6245040893554688, "sampling/sampling_logp_difference/mean": 0.023702893406152725, "step": 53, "step_time": 27.233809398999938 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.11004186974605545, "epoch": 0.00108, "grad_norm": 0.006082055624574423, "kl": 0.04293493747854882, "learning_rate": 7.999999520001299e-06, "loss": 0.0, "step": 54, "step_time": 12.14583877500013 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0866635709971888, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 0.005786150228232145, "kl": 0.045153988463084715, "learning_rate": 7.999999465186634e-06, "loss": 0.0, "num_tokens": 2914367.0, "reward": 2.3385372161865234, "reward_std": 0.3273521363735199, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5177128314971924, "rewards/rollout_reward_func/std": 0.2579730451107025, "sampling/importance_sampling_ratio/max": 1.2267568111419678, "sampling/importance_sampling_ratio/mean": 0.9484584331512451, "sampling/importance_sampling_ratio/min": 0.5135900378227234, "sampling/sampling_logp_difference/max": 0.6663306355476379, "sampling/sampling_logp_difference/mean": 0.0320717915892601, "step": 55, "step_time": 26.36462075400027 }, { "clip_ratio/high_max": 0.0729166679084301, "clip_ratio/high_mean": 0.046875000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875000931322575, "entropy": 0.09378209740680177, "epoch": 0.00112, "grad_norm": 0.007270520552992821, "kl": 0.05788560025212064, "learning_rate": 7.999999407409014e-06, "loss": 0.0, "step": 56, "step_time": 11.583988187999921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08144025912042707, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 0.006977744400501251, "kl": 0.16513798182256778, "learning_rate": 7.99999934666844e-06, "loss": -0.0, "num_tokens": 3018848.0, "reward": 2.2243924140930176, "reward_std": 0.4345919191837311, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9774075150489807, "rewards/probe_shaping_dominance/std": 0.08980447798967361, "rewards/probe_terminal_raw/mean": 0.02489837259054184, "rewards/probe_terminal_raw/std": 0.1013173907995224, "rewards/rollout_reward_func/mean": -0.540413498878479, "rewards/rollout_reward_func/std": 0.20110559463500977, "sampling/importance_sampling_ratio/max": 2.1173288822174072, "sampling/importance_sampling_ratio/mean": 1.0253949165344238, "sampling/importance_sampling_ratio/min": 0.34861743450164795, "sampling/sampling_logp_difference/max": 1.0653817653656006, "sampling/sampling_logp_difference/mean": 0.03663061559200287, "step": 57, "step_time": 27.63207101699959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08666177466511726, "epoch": 0.00116, "grad_norm": 0.00648898771032691, "kl": 0.14551325980573893, "learning_rate": 7.999999282964912e-06, "loss": 0.0, "step": 58, "step_time": 12.149218646000236 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0776638601673767, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 0.006341388914734125, "kl": 0.1438233179026156, "learning_rate": 7.999999216298429e-06, "loss": 0.0, "num_tokens": 3118313.0, "reward": 2.337385654449463, "reward_std": 0.40537285804748535, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9689397215843201, "rewards/probe_shaping_dominance/std": 0.09820227324962616, "rewards/probe_terminal_raw/mean": 0.03201219439506531, "rewards/probe_terminal_raw/std": 0.10123317688703537, "rewards/rollout_reward_func/mean": -0.5198163986206055, "rewards/rollout_reward_func/std": 0.24933888018131256, "sampling/importance_sampling_ratio/max": 1.642152190208435, "sampling/importance_sampling_ratio/mean": 0.9745345115661621, "sampling/importance_sampling_ratio/min": 0.32652705907821655, "sampling/sampling_logp_difference/max": 1.1220024824142456, "sampling/sampling_logp_difference/mean": 0.04093600809574127, "step": 59, "step_time": 26.148361385999806 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.08309375832322985, "epoch": 0.0012, "grad_norm": 0.009624861180782318, "kl": 0.15202067893005733, "learning_rate": 7.999999146668991e-06, "loss": 0.0, "step": 60, "step_time": 11.512923075000117 }, { "clip_ratio/high_max": 0.07083333469927311, "clip_ratio/high_mean": 0.035416667349636555, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.056250001303851604, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10362166631966829, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 0.012691067531704903, "kl": 0.22026659833500162, "learning_rate": 7.999999074076601e-06, "loss": 0.0001, "num_tokens": 3227556.0, "reward": 2.3282229900360107, "reward_std": 0.4200522303581238, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.972678005695343, "rewards/probe_shaping_dominance/std": 0.10808944702148438, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.4694550037384033, "rewards/rollout_reward_func/std": 0.2165255844593048, "sampling/importance_sampling_ratio/max": 1.6590189933776855, "sampling/importance_sampling_ratio/mean": 0.9916884899139404, "sampling/importance_sampling_ratio/min": 0.47236600518226624, "sampling/sampling_logp_difference/max": 0.7500003576278687, "sampling/sampling_logp_difference/mean": 0.045740097761154175, "step": 61, "step_time": 28.188819883999713 }, { "clip_ratio/high_max": 0.07083333469927311, "clip_ratio/high_mean": 0.035416667349636555, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04583333432674408, "entropy": 0.10312735941261053, "epoch": 0.00124, "grad_norm": 0.019286708906292915, "kl": 0.11081840936094522, "learning_rate": 7.999998998521257e-06, "loss": 0.0001, "step": 62, "step_time": 11.837706676999915 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08664211053110193, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 0.015839533880352974, "kl": 0.10350155318124621, "learning_rate": 7.999998920002956e-06, "loss": -0.0, "num_tokens": 3332394.0, "reward": 2.405167579650879, "reward_std": 0.46130281686782837, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9712571501731873, "rewards/probe_shaping_dominance/std": 0.11318810284137726, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.48483964800834656, "rewards/rollout_reward_func/std": 0.24800339341163635, "sampling/importance_sampling_ratio/max": 1.9499810934066772, "sampling/importance_sampling_ratio/mean": 0.9958123564720154, "sampling/importance_sampling_ratio/min": 0.30673947930336, "sampling/sampling_logp_difference/max": 0.8753989338874817, "sampling/sampling_logp_difference/mean": 0.03312094882130623, "step": 63, "step_time": 26.684526995999477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.08623273627017625, "epoch": 0.00128, "grad_norm": 0.022980431094765663, "kl": 0.11929617358450173, "learning_rate": 7.999998838521705e-06, "loss": -0.0, "step": 64, "step_time": 12.258063536000009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07359768182504922, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 0.011483771726489067, "kl": 0.10457528214246281, "learning_rate": 7.999998754077496e-06, "loss": -0.0, "num_tokens": 3436726.0, "reward": 2.377361297607422, "reward_std": 0.5483381748199463, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.41638875007629395, "rewards/rollout_reward_func/std": 0.2915210723876953, "sampling/importance_sampling_ratio/max": 1.1877729892730713, "sampling/importance_sampling_ratio/mean": 0.9874942898750305, "sampling/importance_sampling_ratio/min": 0.26991596817970276, "sampling/sampling_logp_difference/max": 1.309645414352417, "sampling/sampling_logp_difference/mean": 0.027806004509329796, "step": 65, "step_time": 27.115505474999736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.06954771315213293, "epoch": 0.00132, "grad_norm": 0.011225158348679543, "kl": 0.4594924821127222, "learning_rate": 7.999998666670336e-06, "loss": -0.0, "step": 66, "step_time": 11.664916763999372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05904226377606392, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 0.012288263067603111, "kl": 0.0946728276903741, "learning_rate": 7.999998576300222e-06, "loss": -0.0, "num_tokens": 3541291.0, "reward": 2.2826719284057617, "reward_std": 0.36464667320251465, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9738304615020752, "rewards/probe_shaping_dominance/std": 0.0877876877784729, "rewards/probe_terminal_raw/mean": 0.03137703239917755, "rewards/probe_terminal_raw/std": 0.10557617992162704, "rewards/rollout_reward_func/mean": -0.6100356578826904, "rewards/rollout_reward_func/std": 0.23593732714653015, "sampling/importance_sampling_ratio/max": 1.271332859992981, "sampling/importance_sampling_ratio/mean": 0.9844968914985657, "sampling/importance_sampling_ratio/min": 0.3530118763446808, "sampling/sampling_logp_difference/max": 1.0369465351104736, "sampling/sampling_logp_difference/mean": 0.02148618921637535, "step": 67, "step_time": 26.421768857000643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06591829867102206, "epoch": 0.00136, "grad_norm": 0.01136076170951128, "kl": 0.09406092630524654, "learning_rate": 7.999998482967154e-06, "loss": -0.0, "step": 68, "step_time": 12.272947167999973 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09870199719443917, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 0.017969369888305664, "kl": 0.16196376640436938, "learning_rate": 7.999998386671134e-06, "loss": 0.0, "num_tokens": 3645068.0, "reward": 2.2971627712249756, "reward_std": 0.3776472806930542, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9890751838684082, "rewards/probe_shaping_dominance/std": 0.06179998442530632, "rewards/probe_terminal_raw/mean": 0.01092479657381773, "rewards/probe_terminal_raw/std": 0.06179998070001602, "rewards/rollout_reward_func/mean": -0.5590872764587402, "rewards/rollout_reward_func/std": 0.19611209630966187, "sampling/importance_sampling_ratio/max": 2.4048268795013428, "sampling/importance_sampling_ratio/mean": 0.9662601947784424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3840640783309937, "sampling/sampling_logp_difference/mean": 0.0624161995947361, "step": 69, "step_time": 26.791781901999457 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.1055372767150402, "epoch": 0.0014, "grad_norm": 0.006739933043718338, "kl": 0.17029937845654786, "learning_rate": 7.999998287412158e-06, "loss": 0.0, "step": 70, "step_time": 11.527228552999532 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0755673204548657, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038206640165299177, "kl": 0.27058742146891746, "learning_rate": 7.99999818519023e-06, "loss": -0.0, "num_tokens": 3745050.0, "reward": 2.4418420791625977, "reward_std": 0.3276258409023285, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9443497061729431, "rewards/probe_shaping_dominance/std": 0.17115506529808044, "rewards/probe_terminal_raw/mean": 0.05487804859876633, "rewards/probe_terminal_raw/std": 0.15910547971725464, "rewards/rollout_reward_func/mean": -0.4761357307434082, "rewards/rollout_reward_func/std": 0.27386248111724854, "sampling/importance_sampling_ratio/max": 1.2027363777160645, "sampling/importance_sampling_ratio/mean": 0.9526693224906921, "sampling/importance_sampling_ratio/min": 0.26859819889068604, "sampling/sampling_logp_difference/max": 1.314541220664978, "sampling/sampling_logp_difference/mean": 0.04236820340156555, "step": 71, "step_time": 25.810139078000248 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.07833350286819041, "epoch": 0.00144, "grad_norm": 0.006155087612569332, "kl": 0.15766439647995867, "learning_rate": 7.999998080005348e-06, "loss": -0.0, "step": 72, "step_time": 11.807300304999444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12527845823206007, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 0.012825227342545986, "kl": 0.4211071440950036, "learning_rate": 7.999997971857512e-06, "loss": 0.0001, "num_tokens": 3846778.0, "reward": 2.290764570236206, "reward_std": 0.5837900042533875, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.4908435642719269, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9892492890357971, "rewards/probe_shaping_dominance/std": 0.06081530824303627, "rewards/probe_terminal_raw/mean": 0.010797764174640179, "rewards/probe_terminal_raw/std": 0.06108137592673302, "rewards/rollout_reward_func/mean": -0.4405323565006256, "rewards/rollout_reward_func/std": 0.3242381811141968, "sampling/importance_sampling_ratio/max": 1.6338335275650024, "sampling/importance_sampling_ratio/mean": 0.9540376663208008, "sampling/importance_sampling_ratio/min": 0.19394879043102264, "sampling/sampling_logp_difference/max": 1.26481294631958, "sampling/sampling_logp_difference/mean": 0.07170334458351135, "step": 73, "step_time": 27.727274773000772 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.035416667349636555, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "entropy": 0.1369485834147781, "epoch": 0.00148, "grad_norm": 0.006000218912959099, "kl": 0.3834730681264773, "learning_rate": 7.999997860746726e-06, "loss": 0.0, "step": 74, "step_time": 11.550198297999486 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05670425167772919, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 0.004246127791702747, "kl": 0.26258886672280823, "learning_rate": 7.999997746672985e-06, "loss": 0.0001, "num_tokens": 3952684.0, "reward": 2.3076558113098145, "reward_std": 0.2708474397659302, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5798441171646118, "rewards/rollout_reward_func/std": 0.21061494946479797, "sampling/importance_sampling_ratio/max": 1.4762965440750122, "sampling/importance_sampling_ratio/mean": 0.9765973091125488, "sampling/importance_sampling_ratio/min": 0.1482001394033432, "sampling/sampling_logp_difference/max": 1.9091930389404297, "sampling/sampling_logp_difference/mean": 0.034642815589904785, "step": 75, "step_time": 27.424144634000186 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.06237753387540579, "epoch": 0.00152, "grad_norm": 0.005785573739558458, "kl": 0.34405436088127317, "learning_rate": 7.999997629636291e-06, "loss": 0.0001, "step": 76, "step_time": 12.303879873000824 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08415639377199113, "epoch": 0.00154, "frac_reward_zero_std": 0.0, "grad_norm": 0.005243807099759579, "kl": 0.17415540551155573, "learning_rate": 7.999997509636644e-06, "loss": 0.0, "num_tokens": 4058589.0, "reward": 2.46805739402771, "reward_std": 0.32934877276420593, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9650155901908875, "rewards/probe_shaping_dominance/std": 0.11404264718294144, "rewards/probe_terminal_raw/mean": 0.04090446978807449, "rewards/probe_terminal_raw/std": 0.13221491873264313, "rewards/rollout_reward_func/mean": -0.45661279559135437, "rewards/rollout_reward_func/std": 0.2438260018825531, "sampling/importance_sampling_ratio/max": 1.467045783996582, "sampling/importance_sampling_ratio/mean": 0.9993070363998413, "sampling/importance_sampling_ratio/min": 0.5919517874717712, "sampling/sampling_logp_difference/max": 0.5126774311065674, "sampling/sampling_logp_difference/mean": 0.021975167095661163, "step": 77, "step_time": 27.026433300999997 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.08070441009476781, "epoch": 0.00156, "grad_norm": 0.0065447925589978695, "kl": 0.1744868414461962, "learning_rate": 7.999997386674047e-06, "loss": 0.0, "step": 78, "step_time": 11.744910646999415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07671235466841608, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 0.007739327382296324, "kl": 0.10829602145804529, "learning_rate": 7.999997260748495e-06, "loss": 0.0, "num_tokens": 4163362.0, "reward": 2.291594982147217, "reward_std": 0.39855584502220154, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9897778034210205, "rewards/probe_shaping_dominance/std": 0.05782533064484596, "rewards/probe_terminal_raw/mean": 0.009908536449074745, "rewards/probe_terminal_raw/std": 0.05605114996433258, "rewards/rollout_reward_func/mean": -0.5330914855003357, "rewards/rollout_reward_func/std": 0.2664976716041565, "sampling/importance_sampling_ratio/max": 1.3343223333358765, "sampling/importance_sampling_ratio/mean": 0.9947078227996826, "sampling/importance_sampling_ratio/min": 0.4244631230831146, "sampling/sampling_logp_difference/max": 0.9074487686157227, "sampling/sampling_logp_difference/mean": 0.022345466539263725, "step": 79, "step_time": 27.107436816999325 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.08035149308852851, "epoch": 0.0016, "grad_norm": 0.00506787933409214, "kl": 0.1221858259250439, "learning_rate": 7.999997131859992e-06, "loss": 0.0, "step": 80, "step_time": 12.165714977000334 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1357073881663382, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 0.008707523345947266, "kl": 0.19407588429749012, "learning_rate": 7.999997000008536e-06, "loss": 0.0, "num_tokens": 4264863.0, "reward": 2.4384140968322754, "reward_std": 0.4922390580177307, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.963716983795166, "rewards/probe_shaping_dominance/std": 0.11659354716539383, "rewards/probe_terminal_raw/mean": 0.03658536449074745, "rewards/probe_terminal_raw/std": 0.11809173226356506, "rewards/rollout_reward_func/mean": -0.44938817620277405, "rewards/rollout_reward_func/std": 0.28418225049972534, "sampling/importance_sampling_ratio/max": 1.7522894144058228, "sampling/importance_sampling_ratio/mean": 0.9879751205444336, "sampling/importance_sampling_ratio/min": 0.4941127300262451, "sampling/sampling_logp_difference/max": 0.5609221458435059, "sampling/sampling_logp_difference/mean": 0.03759397938847542, "step": 81, "step_time": 26.34822328099972 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.14159703021869063, "epoch": 0.00164, "grad_norm": 0.009574824012815952, "kl": 0.1771204932992987, "learning_rate": 7.999996865194129e-06, "loss": 0.0, "step": 82, "step_time": 11.777719495999463 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11876969272270799, "epoch": 0.00166, "frac_reward_zero_std": 0.0, "grad_norm": 0.010034332983195782, "kl": 0.36267855847108876, "learning_rate": 7.99999672741677e-06, "loss": 0.0001, "num_tokens": 4371298.0, "reward": 2.316115379333496, "reward_std": 0.4054742753505707, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9912324547767639, "rewards/probe_shaping_dominance/std": 0.049596767872571945, "rewards/probe_terminal_raw/mean": 0.009273373521864414, "rewards/probe_terminal_raw/std": 0.05245811864733696, "rewards/rollout_reward_func/mean": -0.5093902349472046, "rewards/rollout_reward_func/std": 0.24608401954174042, "sampling/importance_sampling_ratio/max": 1.394594430923462, "sampling/importance_sampling_ratio/mean": 0.9233759045600891, "sampling/importance_sampling_ratio/min": 0.08404743671417236, "sampling/sampling_logp_difference/max": 2.4710586071014404, "sampling/sampling_logp_difference/mean": 0.07214178144931793, "step": 83, "step_time": 27.42874688900065 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.11967162042856216, "epoch": 0.00168, "grad_norm": 0.009677170775830746, "kl": 0.30461428755370434, "learning_rate": 7.999996586676458e-06, "loss": 0.0001, "step": 84, "step_time": 12.210796541999116 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08633493585512042, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 0.009309964254498482, "kl": 0.34726120328798515, "learning_rate": 7.999996442973193e-06, "loss": -0.0, "num_tokens": 4476938.0, "reward": 2.3256678581237793, "reward_std": 0.3970645070075989, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5930821895599365, "rewards/rollout_reward_func/std": 0.20994225144386292, "sampling/importance_sampling_ratio/max": 2.7198355197906494, "sampling/importance_sampling_ratio/mean": 0.965837836265564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.636561870574951, "sampling/sampling_logp_difference/mean": 0.07213791459798813, "step": 85, "step_time": 26.77135907899992 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.08549185702577233, "epoch": 0.00172, "grad_norm": 0.00986558198928833, "kl": 0.6476581503327452, "learning_rate": 7.99999629630698e-06, "loss": -0.0, "step": 86, "step_time": 11.659285754999019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08913910732371733, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 0.005745335482060909, "kl": 0.21945283197192111, "learning_rate": 7.999996146677813e-06, "loss": -0.0001, "num_tokens": 4579856.0, "reward": 2.2342212200164795, "reward_std": 0.5761978030204773, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.985537052154541, "rewards/probe_shaping_dominance/std": 0.08181492984294891, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4981907904148102, "rewards/rollout_reward_func/std": 0.2684464752674103, "sampling/importance_sampling_ratio/max": 1.1302220821380615, "sampling/importance_sampling_ratio/mean": 0.9439641833305359, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.921440839767456, "sampling/sampling_logp_difference/mean": 0.047181740403175354, "step": 87, "step_time": 27.09005630599995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08115326758706942, "epoch": 0.00176, "grad_norm": 0.003665071912109852, "kl": 0.22057799324602456, "learning_rate": 7.999995994085696e-06, "loss": -0.0001, "step": 88, "step_time": 12.136771756998769 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07007716363295913, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 0.007810859940946102, "kl": 0.6949258089686055, "learning_rate": 7.999995838530628e-06, "loss": -0.0, "num_tokens": 4685612.0, "reward": 2.3873391151428223, "reward_std": 0.4150564968585968, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5001606941223145, "rewards/rollout_reward_func/std": 0.2632400095462799, "sampling/importance_sampling_ratio/max": 1.329830527305603, "sampling/importance_sampling_ratio/mean": 0.9396188259124756, "sampling/importance_sampling_ratio/min": 0.09286217391490936, "sampling/sampling_logp_difference/max": 2.376638174057007, "sampling/sampling_logp_difference/mean": 0.05502761900424957, "step": 89, "step_time": 26.554008219000025 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.07465687586227432, "epoch": 0.0018, "grad_norm": 0.009502755478024483, "kl": 0.22063382680062205, "learning_rate": 7.99999568001261e-06, "loss": -0.0, "step": 90, "step_time": 12.219043876999876 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14270146866329014, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 0.008744009770452976, "kl": 0.11013963767254609, "learning_rate": 7.999995518531638e-06, "loss": -0.0001, "num_tokens": 4789951.0, "reward": 2.567716360092163, "reward_std": 0.9114633798599243, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.125, "rewards/probe_completion_length/std": 0.9069623351097107, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9740893840789795, "rewards/probe_shaping_dominance/std": 0.10259100794792175, "rewards/probe_terminal_raw/mean": 0.02515243925154209, "rewards/probe_terminal_raw/std": 0.10202876478433609, "rewards/rollout_reward_func/mean": -0.5065252184867859, "rewards/rollout_reward_func/std": 0.20758704841136932, "sampling/importance_sampling_ratio/max": 1.6487281322479248, "sampling/importance_sampling_ratio/mean": 0.9680857062339783, "sampling/importance_sampling_ratio/min": 0.3606947958469391, "sampling/sampling_logp_difference/max": 0.7544957399368286, "sampling/sampling_logp_difference/mean": 0.04080694913864136, "step": 91, "step_time": 26.54145688799963 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.04375000111758709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.054166668094694614, "entropy": 0.1536610189359635, "epoch": 0.00184, "grad_norm": 0.0049968562088906765, "kl": 0.21468755277851415, "learning_rate": 7.999995354087718e-06, "loss": -0.0001, "step": 92, "step_time": 12.239923568000904 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09390545927453786, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 0.00847246777266264, "kl": 0.4723499550793804, "learning_rate": 7.999995186680847e-06, "loss": -0.0, "num_tokens": 4891817.0, "reward": 2.240363121032715, "reward_std": 0.4286558926105499, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9915216565132141, "rewards/probe_shaping_dominance/std": 0.04796085134148598, "rewards/probe_terminal_raw/mean": 0.008511179126799107, "rewards/probe_terminal_raw/std": 0.04814650118350983, "rewards/rollout_reward_func/mean": -0.5221695899963379, "rewards/rollout_reward_func/std": 0.18585550785064697, "sampling/importance_sampling_ratio/max": 1.2803471088409424, "sampling/importance_sampling_ratio/mean": 0.9798120856285095, "sampling/importance_sampling_ratio/min": 0.28233107924461365, "sampling/sampling_logp_difference/max": 1.2646756172180176, "sampling/sampling_logp_difference/mean": 0.03255663067102432, "step": 93, "step_time": 26.499364807999882 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.09494142327457666, "epoch": 0.00188, "grad_norm": 0.005891559179872274, "kl": 0.4762792717665434, "learning_rate": 7.999995016311026e-06, "loss": -0.0, "step": 94, "step_time": 11.590511038999466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.78125, "completions/mean_terminated_length": 2.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0855805806349963, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 0.010784839279949665, "kl": 0.5285673206672072, "learning_rate": 7.999994842978255e-06, "loss": 0.0, "num_tokens": 4999030.0, "reward": 2.307888984680176, "reward_std": 0.558517575263977, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.78125, "rewards/probe_completion_length/std": 0.420013427734375, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4233608841896057, "rewards/rollout_reward_func/std": 0.2430049329996109, "sampling/importance_sampling_ratio/max": 2.3040266036987305, "sampling/importance_sampling_ratio/mean": 1.0930638313293457, "sampling/importance_sampling_ratio/min": 0.26607653498649597, "sampling/sampling_logp_difference/max": 1.3239718675613403, "sampling/sampling_logp_difference/mean": 0.0572347566485405, "step": 95, "step_time": 27.32456371700073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.08265005028806627, "epoch": 0.00192, "grad_norm": 0.009639889933168888, "kl": 0.5285577713511884, "learning_rate": 7.999994666682534e-06, "loss": 0.0, "step": 96, "step_time": 12.08934896799974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10442803846672177, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 0.007832064293324947, "kl": 1.2743625693256035, "learning_rate": 7.999994487423863e-06, "loss": 0.0002, "num_tokens": 5101617.0, "reward": 2.3278391361236572, "reward_std": 0.21062178909778595, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5909109115600586, "rewards/rollout_reward_func/std": 0.17344380915164948, "sampling/importance_sampling_ratio/max": 1.2738028764724731, "sampling/importance_sampling_ratio/mean": 0.8911948204040527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.880244493484497, "sampling/sampling_logp_difference/mean": 0.08490461856126785, "step": 97, "step_time": 26.761640363000424 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.09561855113133788, "epoch": 0.00196, "grad_norm": 0.0042576780542731285, "kl": 0.8573908178368583, "learning_rate": 7.999994305202242e-06, "loss": 0.0002, "step": 98, "step_time": 12.239888331999737 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12256050202995539, "epoch": 0.00198, "frac_reward_zero_std": 0.0, "grad_norm": 0.03982119634747505, "kl": 0.4613347239792347, "learning_rate": 7.999994120017672e-06, "loss": 0.0, "num_tokens": 5208185.0, "reward": 2.3622024059295654, "reward_std": 0.3201013505458832, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9496574401855469, "rewards/probe_shaping_dominance/std": 0.13700911402702332, "rewards/probe_terminal_raw/mean": 0.0570375993847847, "rewards/probe_terminal_raw/std": 0.15571396052837372, "rewards/rollout_reward_func/mean": -0.5007427334785461, "rewards/rollout_reward_func/std": 0.2577684223651886, "sampling/importance_sampling_ratio/max": 2.246042490005493, "sampling/importance_sampling_ratio/mean": 1.0854158401489258, "sampling/importance_sampling_ratio/min": 0.0747772604227066, "sampling/sampling_logp_difference/max": 2.5932421684265137, "sampling/sampling_logp_difference/mean": 0.07237481325864792, "step": 99, "step_time": 28.563245160000406 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04583333432674408, "entropy": 0.11652607470750809, "epoch": 0.002, "grad_norm": 0.013196082785725594, "kl": 1.1047777848725673, "learning_rate": 7.999993931870152e-06, "loss": -0.0, "step": 100, "step_time": 11.832685018998745 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11155627248808742, "epoch": 0.00202, "frac_reward_zero_std": 0.0, "grad_norm": 0.011043570004403591, "kl": 0.8486065305769444, "learning_rate": 7.999993740759685e-06, "loss": 0.0, "num_tokens": 5312092.0, "reward": 2.469048261642456, "reward_std": 0.296406090259552, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.9914976358413696, "rewards/probe_shaping_dominance/std": 0.04809650778770447, "rewards/probe_terminal_raw/mean": 0.00889227632433176, "rewards/probe_terminal_raw/std": 0.05030231550335884, "rewards/rollout_reward_func/mean": -0.5125917196273804, "rewards/rollout_reward_func/std": 0.1837811917066574, "sampling/importance_sampling_ratio/max": 1.2519433498382568, "sampling/importance_sampling_ratio/mean": 0.8515626192092896, "sampling/importance_sampling_ratio/min": 0.08545338362455368, "sampling/sampling_logp_difference/max": 2.4583053588867188, "sampling/sampling_logp_difference/mean": 0.1055741012096405, "step": 101, "step_time": 28.246981163999408 }, { "clip_ratio/high_max": 0.0833333358168602, "clip_ratio/high_mean": 0.0416666679084301, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07291666883975267, "entropy": 0.10925065912306309, "epoch": 0.00204, "grad_norm": 0.008332287892699242, "kl": 0.7459432929754257, "learning_rate": 7.999993546686268e-06, "loss": 0.0, "step": 102, "step_time": 12.24685298599934 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09578724391758442, "epoch": 0.00206, "frac_reward_zero_std": 0.0, "grad_norm": 0.005429553799331188, "kl": 0.3181111275916919, "learning_rate": 7.999993349649902e-06, "loss": 0.0001, "num_tokens": 5417356.0, "reward": 2.296133279800415, "reward_std": 0.48034343123435974, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.983701229095459, "rewards/probe_shaping_dominance/std": 0.0652911514043808, "rewards/probe_terminal_raw/mean": 0.021214431151747704, "rewards/probe_terminal_raw/std": 0.08383625000715256, "rewards/rollout_reward_func/mean": -0.5025323629379272, "rewards/rollout_reward_func/std": 0.23934274911880493, "sampling/importance_sampling_ratio/max": 1.7521827220916748, "sampling/importance_sampling_ratio/mean": 1.0161978006362915, "sampling/importance_sampling_ratio/min": 0.559285044670105, "sampling/sampling_logp_difference/max": 0.5810226202011108, "sampling/sampling_logp_difference/mean": 0.03578226640820503, "step": 103, "step_time": 28.179791414999727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.09543535858392715, "epoch": 0.00208, "grad_norm": 0.005383977200835943, "kl": 0.31692405231297016, "learning_rate": 7.999993149650587e-06, "loss": 0.0, "step": 104, "step_time": 11.594287923999673 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10437362408265471, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 0.006486265454441309, "kl": 0.4273503478616476, "learning_rate": 7.999992946688324e-06, "loss": -0.0, "num_tokens": 5522766.0, "reward": 2.39151668548584, "reward_std": 0.39364051818847656, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5272334814071655, "rewards/rollout_reward_func/std": 0.2972264289855957, "sampling/importance_sampling_ratio/max": 1.9010006189346313, "sampling/importance_sampling_ratio/mean": 1.0246827602386475, "sampling/importance_sampling_ratio/min": 0.3678455054759979, "sampling/sampling_logp_difference/max": 1.0000989437103271, "sampling/sampling_logp_difference/mean": 0.03773331269621849, "step": 105, "step_time": 26.660096251999676 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.09778932714834809, "epoch": 0.00212, "grad_norm": 0.005733635742217302, "kl": 0.36536745447665453, "learning_rate": 7.999992740763114e-06, "loss": -0.0, "step": 106, "step_time": 12.020263065000563 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09189990477170795, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 0.006950075738132, "kl": 0.37158518051728606, "learning_rate": 7.999992531874955e-06, "loss": 0.0, "num_tokens": 5624278.0, "reward": 2.3239517211914062, "reward_std": 0.4278637170791626, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9810667037963867, "rewards/probe_shaping_dominance/std": 0.0745052620768547, "rewards/probe_terminal_raw/mean": 0.021214431151747704, "rewards/probe_terminal_raw/std": 0.08405215293169022, "rewards/rollout_reward_func/mean": -0.472079336643219, "rewards/rollout_reward_func/std": 0.24182648956775665, "sampling/importance_sampling_ratio/max": 1.8587580919265747, "sampling/importance_sampling_ratio/mean": 0.9948133230209351, "sampling/importance_sampling_ratio/min": 0.488203763961792, "sampling/sampling_logp_difference/max": 0.6990102529525757, "sampling/sampling_logp_difference/mean": 0.03366800397634506, "step": 107, "step_time": 27.280253950999395 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "entropy": 0.07755104900570586, "epoch": 0.00216, "grad_norm": 0.0029529579915106297, "kl": 0.3871547483528275, "learning_rate": 7.99999232002385e-06, "loss": 0.0, "step": 108, "step_time": 11.582099404000473 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04583333432674408, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08607161836698651, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 0.004912302363663912, "kl": 0.3110020191234071, "learning_rate": 7.999992105209796e-06, "loss": 0.0, "num_tokens": 5730240.0, "reward": 2.3713436126708984, "reward_std": 0.34508299827575684, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9868378639221191, "rewards/probe_shaping_dominance/std": 0.07445620000362396, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5186194181442261, "rewards/rollout_reward_func/std": 0.22763106226921082, "sampling/importance_sampling_ratio/max": 2.4666221141815186, "sampling/importance_sampling_ratio/mean": 0.9437046051025391, "sampling/importance_sampling_ratio/min": 0.16313567757606506, "sampling/sampling_logp_difference/max": 1.8131763935089111, "sampling/sampling_logp_difference/mean": 0.07055296003818512, "step": 109, "step_time": 27.85804966900014 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.08376848418265581, "epoch": 0.0022, "grad_norm": 0.021030370146036148, "kl": 0.3346872879192233, "learning_rate": 7.999991887432795e-06, "loss": 0.0, "step": 110, "step_time": 12.221424097000181 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03645833395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09564799422514625, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 0.010623163543641567, "kl": 1.25646445970051, "learning_rate": 7.999991666692848e-06, "loss": 0.0001, "num_tokens": 5834866.0, "reward": 2.371830463409424, "reward_std": 0.455732524394989, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9698338508605957, "rewards/probe_shaping_dominance/std": 0.118809275329113, "rewards/probe_terminal_raw/mean": 0.02909044735133648, "rewards/probe_terminal_raw/std": 0.11480555683374405, "rewards/rollout_reward_func/mean": -0.45209401845932007, "rewards/rollout_reward_func/std": 0.2390637993812561, "sampling/importance_sampling_ratio/max": 2.435302972793579, "sampling/importance_sampling_ratio/mean": 0.9616929292678833, "sampling/importance_sampling_ratio/min": 0.18086190521717072, "sampling/sampling_logp_difference/max": 1.7100262641906738, "sampling/sampling_logp_difference/mean": 0.06157621741294861, "step": 111, "step_time": 27.536669213000096 }, { "clip_ratio/high_max": 0.05625000037252903, "clip_ratio/high_mean": 0.028125000186264515, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05937500111758709, "entropy": 0.09540150425164029, "epoch": 0.00224, "grad_norm": 0.005310059990733862, "kl": 0.7572433853056282, "learning_rate": 7.999991442989953e-06, "loss": 0.0001, "step": 112, "step_time": 11.58020766800064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05377835238323314, "epoch": 0.00226, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054777092300355434, "kl": 0.2139036045409739, "learning_rate": 7.999991216324112e-06, "loss": 0.0, "num_tokens": 5941971.0, "reward": 2.3715004920959473, "reward_std": 0.3570369482040405, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9854661822319031, "rewards/probe_shaping_dominance/std": 0.08221564441919327, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5483407378196716, "rewards/rollout_reward_func/std": 0.21500766277313232, "sampling/importance_sampling_ratio/max": 1.468092441558838, "sampling/importance_sampling_ratio/mean": 1.0448389053344727, "sampling/importance_sampling_ratio/min": 0.9520513415336609, "sampling/sampling_logp_difference/max": 0.38396334648132324, "sampling/sampling_logp_difference/mean": 0.014699834398925304, "step": 113, "step_time": 26.95208743199919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05231661406651256, "epoch": 0.00228, "grad_norm": 0.005958650726824999, "kl": 0.20708634098750167, "learning_rate": 7.999990986695325e-06, "loss": 0.0, "step": 114, "step_time": 12.898005667000234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1277365549467504, "epoch": 0.0023, "frac_reward_zero_std": 0.0, "grad_norm": 0.010303696617484093, "kl": 0.5558968242257833, "learning_rate": 7.999990754103591e-06, "loss": -0.0, "num_tokens": 6048989.0, "reward": 2.3545703887939453, "reward_std": 0.32267555594444275, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5329294204711914, "rewards/rollout_reward_func/std": 0.1960861086845398, "sampling/importance_sampling_ratio/max": 2.528221368789673, "sampling/importance_sampling_ratio/mean": 0.9982080459594727, "sampling/importance_sampling_ratio/min": 0.042695675045251846, "sampling/sampling_logp_difference/max": 3.153654098510742, "sampling/sampling_logp_difference/mean": 0.08483341336250305, "step": 115, "step_time": 28.715120017999652 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.1117813317105174, "epoch": 0.00232, "grad_norm": 0.006610220763832331, "kl": 0.6069826502352953, "learning_rate": 7.99999051854891e-06, "loss": -0.0, "step": 116, "step_time": 12.037885646000177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.035416667349636555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08630842622369528, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 0.023052336648106575, "kl": 4.202049997946233, "learning_rate": 7.999990280031285e-06, "loss": -0.0, "num_tokens": 6156241.0, "reward": 2.3509585857391357, "reward_std": 0.3719061613082886, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5052914619445801, "rewards/rollout_reward_func/std": 0.26930469274520874, "sampling/importance_sampling_ratio/max": 1.4201393127441406, "sampling/importance_sampling_ratio/mean": 0.9191266298294067, "sampling/importance_sampling_ratio/min": 0.04002097621560097, "sampling/sampling_logp_difference/max": 3.218353271484375, "sampling/sampling_logp_difference/mean": 0.08381534367799759, "step": 117, "step_time": 27.4478307280001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.08493484603241086, "epoch": 0.00236, "grad_norm": 0.005157412961125374, "kl": 0.8633453572015242, "learning_rate": 7.999990038550715e-06, "loss": -0.0001, "step": 118, "step_time": 12.410220233000018 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1009751778037753, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 0.007704886142164469, "kl": 1.133708338191262, "learning_rate": 7.9999897941072e-06, "loss": -0.0, "num_tokens": 6261608.0, "reward": 2.272282600402832, "reward_std": 0.4321046769618988, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5527174472808838, "rewards/rollout_reward_func/std": 0.2261652648448944, "sampling/importance_sampling_ratio/max": 1.9247888326644897, "sampling/importance_sampling_ratio/mean": 0.9601424932479858, "sampling/importance_sampling_ratio/min": 0.10850485414266586, "sampling/sampling_logp_difference/max": 2.221635580062866, "sampling/sampling_logp_difference/mean": 0.06387770175933838, "step": 119, "step_time": 27.243004307998945 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.10485226087621413, "epoch": 0.0024, "grad_norm": 0.005486879497766495, "kl": 0.7662449008450487, "learning_rate": 7.999989546700739e-06, "loss": -0.0, "step": 120, "step_time": 11.642901553001138 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05734692560508847, "epoch": 0.00242, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022395749110728502, "kl": 0.4620458657536801, "learning_rate": 7.999989296331334e-06, "loss": 0.0, "num_tokens": 6364884.0, "reward": 2.300528049468994, "reward_std": 0.3925109803676605, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9786701202392578, "rewards/probe_shaping_dominance/std": 0.08399670571088791, "rewards/probe_terminal_raw/mean": 0.020452234894037247, "rewards/probe_terminal_raw/std": 0.08055972307920456, "rewards/rollout_reward_func/mean": -0.5235942602157593, "rewards/rollout_reward_func/std": 0.19283899664878845, "sampling/importance_sampling_ratio/max": 1.684720754623413, "sampling/importance_sampling_ratio/mean": 0.9979562163352966, "sampling/importance_sampling_ratio/min": 0.3297406733036041, "sampling/sampling_logp_difference/max": 1.109449863433838, "sampling/sampling_logp_difference/mean": 0.03222563862800598, "step": 121, "step_time": 27.102160742999786 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.06036481284536421, "epoch": 0.00244, "grad_norm": 0.0021346518769860268, "kl": 0.460031573350534, "learning_rate": 7.999989042998983e-06, "loss": 0.0, "step": 122, "step_time": 12.627941945999737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08197857672348619, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 0.005835927091538906, "kl": 0.3058228840382071, "learning_rate": 7.99998878670369e-06, "loss": -0.0, "num_tokens": 6470259.0, "reward": 2.4272561073303223, "reward_std": 0.2215338796377182, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9598830342292786, "rewards/probe_shaping_dominance/std": 0.13210204243659973, "rewards/probe_terminal_raw/mean": 0.04026930779218674, "rewards/probe_terminal_raw/std": 0.13092826306819916, "rewards/rollout_reward_func/mean": -0.5228960514068604, "rewards/rollout_reward_func/std": 0.22377446293830872, "sampling/importance_sampling_ratio/max": 1.2321637868881226, "sampling/importance_sampling_ratio/mean": 0.9182083606719971, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2927324771881104, "sampling/sampling_logp_difference/mean": 0.04780565947294235, "step": 123, "step_time": 27.481588907000514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.0761918865609914, "epoch": 0.00248, "grad_norm": 0.005192534998059273, "kl": 0.32337066042236984, "learning_rate": 7.999988527445453e-06, "loss": -0.0, "step": 124, "step_time": 11.74153527999988 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.061301857323996956, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 0.004524994175881147, "kl": 0.20127144705232547, "learning_rate": 7.99998826522427e-06, "loss": -0.0, "num_tokens": 6573122.0, "reward": 2.5412168502807617, "reward_std": 0.4934008717536926, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.3535533845424652, "rewards/probe_shaping_dominance/mean": 0.9729976058006287, "rewards/probe_shaping_dominance/std": 0.10630916804075241, "rewards/probe_terminal_raw/mean": 0.028963414952158928, "rewards/probe_terminal_raw/std": 0.11434794962406158, "rewards/rollout_reward_func/mean": -0.44199419021606445, "rewards/rollout_reward_func/std": 0.23288173973560333, "sampling/importance_sampling_ratio/max": 2.8899707794189453, "sampling/importance_sampling_ratio/mean": 1.0233311653137207, "sampling/importance_sampling_ratio/min": 0.5645219683647156, "sampling/sampling_logp_difference/max": 1.0612452030181885, "sampling/sampling_logp_difference/mean": 0.02934853918850422, "step": 125, "step_time": 26.56314809100013 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.0587867568974616, "epoch": 0.00252, "grad_norm": 0.003286329098045826, "kl": 0.23132333873703226, "learning_rate": 7.999988000040144e-06, "loss": -0.0, "step": 126, "step_time": 12.704706686999543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06930449209176004, "epoch": 0.00254, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038534458726644516, "kl": 0.8923099512467161, "learning_rate": 7.999987731893076e-06, "loss": -0.0001, "num_tokens": 6674759.0, "reward": 2.476976156234741, "reward_std": 0.5018807053565979, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.09375, "rewards/probe_invalid_count/std": 0.39015090465545654, "rewards/probe_shaping_dominance/mean": 0.9897805452346802, "rewards/probe_shaping_dominance/std": 0.057810164988040924, "rewards/probe_terminal_raw/mean": 0.010797764174640179, "rewards/probe_terminal_raw/std": 0.06108137592673302, "rewards/rollout_reward_func/mean": -0.5048520565032959, "rewards/rollout_reward_func/std": 0.23183932900428772, "sampling/importance_sampling_ratio/max": 2.6555376052856445, "sampling/importance_sampling_ratio/mean": 1.037369728088379, "sampling/importance_sampling_ratio/min": 0.18285271525382996, "sampling/sampling_logp_difference/max": 1.6990761756896973, "sampling/sampling_logp_difference/mean": 0.04799798130989075, "step": 127, "step_time": 26.519593818999965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.07739454251714051, "epoch": 0.00256, "grad_norm": 0.0046963742934167385, "kl": 0.8950551702291705, "learning_rate": 7.999987460783066e-06, "loss": -0.0001, "step": 128, "step_time": 11.701040565999392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04215445008594543, "epoch": 0.00258, "frac_reward_zero_std": 0.0, "grad_norm": 0.004038817714899778, "kl": 0.483372636698145, "learning_rate": 7.999987186710111e-06, "loss": -0.0001, "num_tokens": 6778164.0, "reward": 2.3669238090515137, "reward_std": 0.33272045850753784, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9665718078613281, "rewards/probe_shaping_dominance/std": 0.11594124883413315, "rewards/probe_terminal_raw/mean": 0.033663615584373474, "rewards/probe_terminal_raw/std": 0.11093832552433014, "rewards/rollout_reward_func/mean": -0.5208115577697754, "rewards/rollout_reward_func/std": 0.22583386301994324, "sampling/importance_sampling_ratio/max": 1.324372410774231, "sampling/importance_sampling_ratio/mean": 0.9827702045440674, "sampling/importance_sampling_ratio/min": 0.15934889018535614, "sampling/sampling_logp_difference/max": 1.8366597890853882, "sampling/sampling_logp_difference/mean": 0.03050372563302517, "step": 129, "step_time": 29.272002608000093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.040716532384976745, "epoch": 0.0026, "grad_norm": 0.004598686005920172, "kl": 0.48791675676284285, "learning_rate": 7.999986909674215e-06, "loss": -0.0001, "step": 130, "step_time": 11.615075072000309 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07680852155863249, "epoch": 0.00262, "frac_reward_zero_std": 0.0, "grad_norm": 0.004599301610141993, "kl": 0.5561261102557182, "learning_rate": 7.999986629675377e-06, "loss": 0.0001, "num_tokens": 6881343.0, "reward": 2.428385019302368, "reward_std": 0.35835328698158264, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.988267183303833, "rewards/probe_shaping_dominance/std": 0.06637061387300491, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4630073308944702, "rewards/rollout_reward_func/std": 0.23799148201942444, "sampling/importance_sampling_ratio/max": 2.105088472366333, "sampling/importance_sampling_ratio/mean": 1.0250680446624756, "sampling/importance_sampling_ratio/min": 0.24339471757411957, "sampling/sampling_logp_difference/max": 1.413072109222412, "sampling/sampling_logp_difference/mean": 0.05859563127160072, "step": 131, "step_time": 27.499229768000532 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.07459181371518753, "epoch": 0.00264, "grad_norm": 0.0046109952963888645, "kl": 0.4819548297673464, "learning_rate": 7.999986346713597e-06, "loss": 0.0001, "step": 132, "step_time": 11.681140706999486 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06963892979547381, "epoch": 0.00266, "frac_reward_zero_std": 0.0, "grad_norm": 0.004053663462400436, "kl": 0.29985905811190605, "learning_rate": 7.999986060788874e-06, "loss": -0.0001, "num_tokens": 6984936.0, "reward": 2.398922920227051, "reward_std": 0.3926793932914734, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9907037019729614, "rewards/probe_shaping_dominance/std": 0.052587706595659256, "rewards/probe_terminal_raw/mean": 0.007876016199588776, "rewards/probe_terminal_raw/std": 0.0445534773170948, "rewards/rollout_reward_func/mean": -0.45590683817863464, "rewards/rollout_reward_func/std": 0.20304201543331146, "sampling/importance_sampling_ratio/max": 1.1057724952697754, "sampling/importance_sampling_ratio/mean": 0.917495846748352, "sampling/importance_sampling_ratio/min": 0.2753896415233612, "sampling/sampling_logp_difference/max": 1.2891517877578735, "sampling/sampling_logp_difference/mean": 0.049349602311849594, "step": 133, "step_time": 28.668226430000686 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.07001902349293232, "epoch": 0.00268, "grad_norm": 0.0046079279854893684, "kl": 0.30660303554032, "learning_rate": 7.999985771901212e-06, "loss": -0.0001, "step": 134, "step_time": 11.78814972499913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0837576383491978, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 0.004887988790869713, "kl": 0.48908784112427384, "learning_rate": 7.999985480050609e-06, "loss": 0.0, "num_tokens": 7089375.0, "reward": 2.383143901824951, "reward_std": 0.2860008180141449, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9715699553489685, "rewards/probe_shaping_dominance/std": 0.11188202351331711, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.5384261608123779, "rewards/rollout_reward_func/std": 0.24632836878299713, "sampling/importance_sampling_ratio/max": 2.175699234008789, "sampling/importance_sampling_ratio/mean": 0.9764343500137329, "sampling/importance_sampling_ratio/min": 0.37150871753692627, "sampling/sampling_logp_difference/max": 1.0082650184631348, "sampling/sampling_logp_difference/mean": 0.04385855793952942, "step": 135, "step_time": 27.26713926100001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07945482805371284, "epoch": 0.00272, "grad_norm": 0.005393319763243198, "kl": 0.4894396271556616, "learning_rate": 7.999985185237063e-06, "loss": 0.0, "step": 136, "step_time": 11.740167015000225 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07358018541708589, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 0.04956609383225441, "kl": 7.594387605204247, "learning_rate": 7.999984887460579e-06, "loss": 0.0, "num_tokens": 7195651.0, "reward": 2.523413896560669, "reward_std": 1.283755898475647, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 1.1639753580093384, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9845632314682007, "rewards/probe_shaping_dominance/std": 0.08732341974973679, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4580242931842804, "rewards/rollout_reward_func/std": 0.29842740297317505, "sampling/importance_sampling_ratio/max": 1.5995361804962158, "sampling/importance_sampling_ratio/mean": 0.9101204872131348, "sampling/importance_sampling_ratio/min": 0.2878796458244324, "sampling/sampling_logp_difference/max": 1.2452144622802734, "sampling/sampling_logp_difference/mean": 0.08170486986637115, "step": 137, "step_time": 35.5617492829997 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04583333432674408, "entropy": 0.0833338184747845, "epoch": 0.00276, "grad_norm": 0.004238603170961142, "kl": 0.8713670628203545, "learning_rate": 7.999984586721153e-06, "loss": -0.0001, "step": 138, "step_time": 13.092057540999122 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.077066877449397, "epoch": 0.00278, "frac_reward_zero_std": 0.0, "grad_norm": 0.006206016521900892, "kl": 0.2502201258515315, "learning_rate": 7.999984283018788e-06, "loss": -0.0001, "num_tokens": 7298420.0, "reward": 2.434345006942749, "reward_std": 0.33564823865890503, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4531550407409668, "rewards/rollout_reward_func/std": 0.23981256783008575, "sampling/importance_sampling_ratio/max": 1.5575754642486572, "sampling/importance_sampling_ratio/mean": 0.9945090413093567, "sampling/importance_sampling_ratio/min": 0.39499369263648987, "sampling/sampling_logp_difference/max": 0.9288842678070068, "sampling/sampling_logp_difference/mean": 0.0369817316532135, "step": 139, "step_time": 26.636275078999915 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.08048825367586687, "epoch": 0.0028, "grad_norm": 0.004995269235223532, "kl": 0.1949386877240613, "learning_rate": 7.999983976353484e-06, "loss": -0.0001, "step": 140, "step_time": 11.886442712999724 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09608687367290258, "epoch": 0.00282, "frac_reward_zero_std": 0.0, "grad_norm": 0.010193965397775173, "kl": 1.043814627239044, "learning_rate": 7.99998366672524e-06, "loss": 0.0001, "num_tokens": 7400213.0, "reward": 2.357463836669922, "reward_std": 0.45996955037117004, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9802613258361816, "rewards/probe_shaping_dominance/std": 0.0787685438990593, "rewards/probe_terminal_raw/mean": 0.017403453588485718, "rewards/probe_terminal_raw/std": 0.06857709586620331, "rewards/rollout_reward_func/mean": -0.46520087122917175, "rewards/rollout_reward_func/std": 0.23765753209590912, "sampling/importance_sampling_ratio/max": 2.0903208255767822, "sampling/importance_sampling_ratio/mean": 1.064300775527954, "sampling/importance_sampling_ratio/min": 0.2817336320877075, "sampling/sampling_logp_difference/max": 1.266794204711914, "sampling/sampling_logp_difference/mean": 0.04518420994281769, "step": 141, "step_time": 27.64493636099951 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.10309219686314464, "epoch": 0.00284, "grad_norm": 0.01219659112393856, "kl": 0.6812123054987751, "learning_rate": 7.999983354134058e-06, "loss": 0.0, "step": 142, "step_time": 11.569478897000408 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07752494711894542, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 0.004447088576853275, "kl": 0.28799188635699124, "learning_rate": 7.999983038579937e-06, "loss": -0.0002, "num_tokens": 7502202.0, "reward": 2.4029557704925537, "reward_std": 0.41433292627334595, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4220443069934845, "rewards/rollout_reward_func/std": 0.2347659468650818, "sampling/importance_sampling_ratio/max": 2.925204277038574, "sampling/importance_sampling_ratio/mean": 1.0200954675674438, "sampling/importance_sampling_ratio/min": 0.2386324405670166, "sampling/sampling_logp_difference/max": 1.4322543144226074, "sampling/sampling_logp_difference/mean": 0.04332014173269272, "step": 143, "step_time": 27.17438340000035 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07426338468212634, "epoch": 0.00288, "grad_norm": 0.004469662439078093, "kl": 0.2410876297701634, "learning_rate": 7.999982720062878e-06, "loss": -0.0002, "step": 144, "step_time": 12.213636597999539 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08634203940164298, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 0.002921090926975012, "kl": 0.230285348889538, "learning_rate": 7.99998239858288e-06, "loss": 0.0, "num_tokens": 7607649.0, "reward": 2.3042469024658203, "reward_std": 0.4113651216030121, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5207530856132507, "rewards/rollout_reward_func/std": 0.2033592164516449, "sampling/importance_sampling_ratio/max": 1.081487774848938, "sampling/importance_sampling_ratio/mean": 0.961658239364624, "sampling/importance_sampling_ratio/min": 0.3403857946395874, "sampling/sampling_logp_difference/max": 0.7405810356140137, "sampling/sampling_logp_difference/mean": 0.02413717657327652, "step": 145, "step_time": 28.17627675400081 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.08849173790076748, "epoch": 0.00292, "grad_norm": 0.0025327985640615225, "kl": 0.24220079024462393, "learning_rate": 7.999982074139944e-06, "loss": 0.0, "step": 146, "step_time": 11.552079900000535 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11541430978104472, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037195596378296614, "kl": 0.24169684358639643, "learning_rate": 7.999981746734073e-06, "loss": -0.0001, "num_tokens": 7714926.0, "reward": 2.362529754638672, "reward_std": 0.3588845729827881, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9859415292739868, "rewards/probe_shaping_dominance/std": 0.07952678948640823, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5265366435050964, "rewards/rollout_reward_func/std": 0.2366112768650055, "sampling/importance_sampling_ratio/max": 1.8165228366851807, "sampling/importance_sampling_ratio/mean": 1.0579065084457397, "sampling/importance_sampling_ratio/min": 0.4353120028972626, "sampling/sampling_logp_difference/max": 0.826627790927887, "sampling/sampling_logp_difference/mean": 0.04029189795255661, "step": 147, "step_time": 27.175546237000162 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.1161738510709256, "epoch": 0.00296, "grad_norm": 0.0037887210492044687, "kl": 0.23712664423510432, "learning_rate": 7.999981416365263e-06, "loss": -0.0, "step": 148, "step_time": 12.20823843899916 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.105490946007194, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 0.005545547232031822, "kl": 0.10429394743793807, "learning_rate": 7.999981083033518e-06, "loss": -0.0, "num_tokens": 7820271.0, "reward": 2.2831099033355713, "reward_std": 0.39255067706108093, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5418901443481445, "rewards/rollout_reward_func/std": 0.2250201553106308, "sampling/importance_sampling_ratio/max": 1.449048399925232, "sampling/importance_sampling_ratio/mean": 0.9792050719261169, "sampling/importance_sampling_ratio/min": 0.2817993760108948, "sampling/sampling_logp_difference/max": 1.2665607929229736, "sampling/sampling_logp_difference/mean": 0.03002801164984703, "step": 149, "step_time": 27.53580150099924 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.10439014286384918, "epoch": 0.003, "grad_norm": 0.00822756253182888, "kl": 0.11194274778247859, "learning_rate": 7.999980746738835e-06, "loss": -0.0, "step": 150, "step_time": 11.669001740000112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1254521356895566, "epoch": 0.00302, "frac_reward_zero_std": 0.0, "grad_norm": 0.008205846883356571, "kl": 0.2568075335584581, "learning_rate": 7.999980407481217e-06, "loss": -0.0, "num_tokens": 7922328.0, "reward": 2.4083704948425293, "reward_std": 0.3905543088912964, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9785879850387573, "rewards/probe_shaping_dominance/std": 0.08963118493556976, "rewards/probe_terminal_raw/mean": 0.0260416679084301, "rewards/probe_terminal_raw/std": 0.1046360433101654, "rewards/rollout_reward_func/mean": -0.45250916481018066, "rewards/rollout_reward_func/std": 0.25463223457336426, "sampling/importance_sampling_ratio/max": 1.165947437286377, "sampling/importance_sampling_ratio/mean": 0.9090801477432251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9794785976409912, "sampling/sampling_logp_difference/mean": 0.06048261374235153, "step": 151, "step_time": 25.965173581000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.11868520639836788, "epoch": 0.00304, "grad_norm": 0.008953132666647434, "kl": 0.6233456870540977, "learning_rate": 7.999980065260663e-06, "loss": -0.0001, "step": 152, "step_time": 12.843935258000784 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11048904561903328, "epoch": 0.00306, "frac_reward_zero_std": 0.0, "grad_norm": 0.00968129187822342, "kl": 0.14061896470107627, "learning_rate": 7.999979720077173e-06, "loss": -0.0, "num_tokens": 8026423.0, "reward": 2.419642925262451, "reward_std": 0.30986252427101135, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9758727550506592, "rewards/probe_shaping_dominance/std": 0.10938115417957306, "rewards/probe_terminal_raw/mean": 0.0209603663533926, "rewards/probe_terminal_raw/std": 0.09247327595949173, "rewards/rollout_reward_func/mean": -0.49594029784202576, "rewards/rollout_reward_func/std": 0.2378591150045395, "sampling/importance_sampling_ratio/max": 1.1600902080535889, "sampling/importance_sampling_ratio/mean": 0.9520583152770996, "sampling/importance_sampling_ratio/min": 0.5003088712692261, "sampling/sampling_logp_difference/max": 0.6657150983810425, "sampling/sampling_logp_difference/mean": 0.025925474241375923, "step": 153, "step_time": 26.941947170000276 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.1122089575510472, "epoch": 0.00308, "grad_norm": 0.00867843721061945, "kl": 0.15484224071647645, "learning_rate": 7.99997937193075e-06, "loss": -0.0, "step": 154, "step_time": 11.658896313999776 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0629729179199785, "epoch": 0.0031, "frac_reward_zero_std": 0.0, "grad_norm": 0.003953923936933279, "kl": 0.03362982640601331, "learning_rate": 7.99997902082139e-06, "loss": 0.0, "num_tokens": 8134364.0, "reward": 2.304103374481201, "reward_std": 0.3902580142021179, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9852296113967896, "rewards/probe_shaping_dominance/std": 0.08355414122343063, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5217512845993042, "rewards/rollout_reward_func/std": 0.20511233806610107, "sampling/importance_sampling_ratio/max": 1.2205973863601685, "sampling/importance_sampling_ratio/mean": 0.9658781290054321, "sampling/importance_sampling_ratio/min": 0.46778079867362976, "sampling/sampling_logp_difference/max": 0.7597565650939941, "sampling/sampling_logp_difference/mean": 0.021998237818479538, "step": 155, "step_time": 27.223922481999125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.06562891032081097, "epoch": 0.00312, "grad_norm": 0.004405137151479721, "kl": 0.038039611198541934, "learning_rate": 7.999978666749097e-06, "loss": 0.0, "step": 156, "step_time": 12.512135376999595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05827112344559282, "epoch": 0.00314, "frac_reward_zero_std": 0.0, "grad_norm": 0.004053921438753605, "kl": 0.22048271807530284, "learning_rate": 7.99997830971387e-06, "loss": -0.0, "num_tokens": 8238748.0, "reward": 2.4397072792053223, "reward_std": 0.3176124691963196, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9892078638076782, "rewards/probe_shaping_dominance/std": 0.061049580574035645, "rewards/probe_terminal_raw/mean": 0.010670731775462627, "rewards/probe_terminal_raw/std": 0.06036277487874031, "rewards/rollout_reward_func/mean": -0.5101712346076965, "rewards/rollout_reward_func/std": 0.20784814655780792, "sampling/importance_sampling_ratio/max": 1.6952624320983887, "sampling/importance_sampling_ratio/mean": 0.9711546301841736, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1546943187713623, "sampling/sampling_logp_difference/mean": 0.03182876855134964, "step": 157, "step_time": 27.540745071999936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.031250000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.058620097348466516, "epoch": 0.00316, "grad_norm": 0.0032319524325430393, "kl": 0.2064171105599364, "learning_rate": 7.999977949715709e-06, "loss": -0.0, "step": 158, "step_time": 11.632630814000095 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08823958231369033, "epoch": 0.00318, "frac_reward_zero_std": 0.0, "grad_norm": 0.005462405737489462, "kl": 0.09290702206544665, "learning_rate": 7.999977586754615e-06, "loss": 0.0001, "num_tokens": 8341164.0, "reward": 2.443883180618286, "reward_std": 0.2663474678993225, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9600480794906616, "rewards/probe_shaping_dominance/std": 0.12650074064731598, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.48178985714912415, "rewards/rollout_reward_func/std": 0.22425328195095062, "sampling/importance_sampling_ratio/max": 1.382658839225769, "sampling/importance_sampling_ratio/mean": 1.018369197845459, "sampling/importance_sampling_ratio/min": 0.8050516247749329, "sampling/sampling_logp_difference/max": 0.3240091800689697, "sampling/sampling_logp_difference/mean": 0.023685907945036888, "step": 159, "step_time": 27.411928095999883 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.08342378272209316, "epoch": 0.0032, "grad_norm": 0.0198823194950819, "kl": 0.08883899757620384, "learning_rate": 7.999977220830588e-06, "loss": 0.0001, "step": 160, "step_time": 12.353684361999967 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06228045103489421, "epoch": 0.00322, "frac_reward_zero_std": 0.0, "grad_norm": 0.002511651022359729, "kl": 0.1462944263475947, "learning_rate": 7.999976851943628e-06, "loss": -0.0, "num_tokens": 8445224.0, "reward": 2.391735076904297, "reward_std": 0.3887004256248474, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4645148813724518, "rewards/rollout_reward_func/std": 0.24512337148189545, "sampling/importance_sampling_ratio/max": 1.2499885559082031, "sampling/importance_sampling_ratio/mean": 0.964512288570404, "sampling/importance_sampling_ratio/min": 0.2849932909011841, "sampling/sampling_logp_difference/max": 1.2552961111068726, "sampling/sampling_logp_difference/mean": 0.02673853561282158, "step": 161, "step_time": 26.90330324300021 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07126606599922525, "epoch": 0.00324, "grad_norm": 0.00517527898773551, "kl": 0.13863739833080524, "learning_rate": 7.999976480093737e-06, "loss": -0.0, "step": 162, "step_time": 11.688447676000578 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07072257142863236, "epoch": 0.00326, "frac_reward_zero_std": 0.0, "grad_norm": 0.004865987226366997, "kl": 0.1391429503753443, "learning_rate": 7.999976105280914e-06, "loss": -0.0, "num_tokens": 8551746.0, "reward": 2.3334262371063232, "reward_std": 0.42871803045272827, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9307848215103149, "rewards/probe_shaping_dominance/std": 0.1679239422082901, "rewards/probe_terminal_raw/mean": 0.07113821059465408, "rewards/probe_terminal_raw/std": 0.1717527210712433, "rewards/rollout_reward_func/mean": -0.5247467756271362, "rewards/rollout_reward_func/std": 0.24572212994098663, "sampling/importance_sampling_ratio/max": 1.3134804964065552, "sampling/importance_sampling_ratio/mean": 1.0010151863098145, "sampling/importance_sampling_ratio/min": 0.42815467715263367, "sampling/sampling_logp_difference/max": 0.8482714891433716, "sampling/sampling_logp_difference/mean": 0.01988227292895317, "step": 163, "step_time": 28.07267034399956 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07096506169182248, "epoch": 0.00328, "grad_norm": 0.004104274325072765, "kl": 0.13155441358685493, "learning_rate": 7.99997572750516e-06, "loss": -0.0, "step": 164, "step_time": 11.647160391999023 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.078909770467817, "epoch": 0.0033, "frac_reward_zero_std": 0.0, "grad_norm": 0.004251962527632713, "kl": 0.09027766038946083, "learning_rate": 7.999975346766472e-06, "loss": -0.0, "num_tokens": 8658732.0, "reward": 2.414771795272827, "reward_std": 0.3757838010787964, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9784373044967651, "rewards/probe_shaping_dominance/std": 0.08622659742832184, "rewards/probe_terminal_raw/mean": 0.024517275393009186, "rewards/probe_terminal_raw/std": 0.10027948766946793, "rewards/rollout_reward_func/mean": -0.47568273544311523, "rewards/rollout_reward_func/std": 0.19167323410511017, "sampling/importance_sampling_ratio/max": 1.1542701721191406, "sampling/importance_sampling_ratio/mean": 0.9669894576072693, "sampling/importance_sampling_ratio/min": 0.6857547163963318, "sampling/sampling_logp_difference/max": 0.37537309527397156, "sampling/sampling_logp_difference/mean": 0.017938656732439995, "step": 165, "step_time": 27.2606650000007 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.07565992117088172, "epoch": 0.00332, "grad_norm": 0.006961170118302107, "kl": 0.08890455095081506, "learning_rate": 7.999974963064855e-06, "loss": -0.0, "step": 166, "step_time": 11.698157390000233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07434030482545495, "epoch": 0.00334, "frac_reward_zero_std": 0.0, "grad_norm": 0.004832255654036999, "kl": 0.15626501338783783, "learning_rate": 7.999974576400308e-06, "loss": -0.0, "num_tokens": 8765380.0, "reward": 2.2938361167907715, "reward_std": 0.4383181631565094, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.960378885269165, "rewards/probe_shaping_dominance/std": 0.12596461176872253, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.5071678757667542, "rewards/rollout_reward_func/std": 0.2304636836051941, "sampling/importance_sampling_ratio/max": 1.6727243661880493, "sampling/importance_sampling_ratio/mean": 1.0108327865600586, "sampling/importance_sampling_ratio/min": 0.4802703857421875, "sampling/sampling_logp_difference/max": 0.737343966960907, "sampling/sampling_logp_difference/mean": 0.023180868476629257, "step": 167, "step_time": 28.38192438599981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.0768249062821269, "epoch": 0.00336, "grad_norm": 0.0052077267318964005, "kl": 0.15163502033101395, "learning_rate": 7.999974186772832e-06, "loss": -0.0, "step": 168, "step_time": 11.745391591000953 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11408041534014046, "epoch": 0.00338, "frac_reward_zero_std": 0.0, "grad_norm": 0.005346886347979307, "kl": 0.05663721589365878, "learning_rate": 7.999973794182426e-06, "loss": 0.0, "num_tokens": 8871458.0, "reward": 2.347496271133423, "reward_std": 0.37117481231689453, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.984038233757019, "rewards/probe_shaping_dominance/std": 0.09029316157102585, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.477167010307312, "rewards/rollout_reward_func/std": 0.2263534814119339, "sampling/importance_sampling_ratio/max": 1.2048081159591675, "sampling/importance_sampling_ratio/mean": 0.967424750328064, "sampling/importance_sampling_ratio/min": 0.7366955280303955, "sampling/sampling_logp_difference/max": 0.3062773644924164, "sampling/sampling_logp_difference/mean": 0.022138062864542007, "step": 169, "step_time": 26.940671711000505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11010659678140655, "epoch": 0.0034, "grad_norm": 0.006358719430863857, "kl": 0.05905036644250572, "learning_rate": 7.99997339862909e-06, "loss": 0.0, "step": 170, "step_time": 12.187279679998937 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.035416667349636555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06875000149011612, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10378921253141016, "epoch": 0.00342, "frac_reward_zero_std": 0.0, "grad_norm": 0.004866220988333225, "kl": 0.3513250324758701, "learning_rate": 7.999973000112826e-06, "loss": -0.0, "num_tokens": 8977121.0, "reward": 2.3662233352661133, "reward_std": 0.36591798067092896, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.490026593208313, "rewards/rollout_reward_func/std": 0.1713269054889679, "sampling/importance_sampling_ratio/max": 2.4813146591186523, "sampling/importance_sampling_ratio/mean": 1.0544798374176025, "sampling/importance_sampling_ratio/min": 0.5539883375167847, "sampling/sampling_logp_difference/max": 0.9087880849838257, "sampling/sampling_logp_difference/mean": 0.04017889127135277, "step": 171, "step_time": 27.655318435999106 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "entropy": 0.10798206774052233, "epoch": 0.00344, "grad_norm": 0.012118767946958542, "kl": 0.39312139721005224, "learning_rate": 7.999972598633632e-06, "loss": -0.0, "step": 172, "step_time": 11.631308623997938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06574001582339406, "epoch": 0.00346, "frac_reward_zero_std": 0.0, "grad_norm": 0.004147569183260202, "kl": 0.01835462471728988, "learning_rate": 7.999972194191514e-06, "loss": 0.0001, "num_tokens": 9080753.0, "reward": 2.3741204738616943, "reward_std": 0.33386632800102234, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9828835725784302, "rewards/probe_shaping_dominance/std": 0.06857176870107651, "rewards/probe_terminal_raw/mean": 0.016895325854420662, "rewards/probe_terminal_raw/std": 0.067360520362854, "rewards/rollout_reward_func/mean": -0.48190829157829285, "rewards/rollout_reward_func/std": 0.23477764427661896, "sampling/importance_sampling_ratio/max": 2.0903360843658447, "sampling/importance_sampling_ratio/mean": 1.0450650453567505, "sampling/importance_sampling_ratio/min": 0.8843300342559814, "sampling/sampling_logp_difference/max": 0.7373225688934326, "sampling/sampling_logp_difference/mean": 0.01723039150238037, "step": 173, "step_time": 26.502221221999207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.06844324560370296, "epoch": 0.00348, "grad_norm": 0.0040916260331869125, "kl": 0.022212313354311064, "learning_rate": 7.999971786786465e-06, "loss": 0.0001, "step": 174, "step_time": 11.897189610000169 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07524242554791272, "epoch": 0.0035, "frac_reward_zero_std": 0.0, "grad_norm": 0.005887735169380903, "kl": 0.22349138231948018, "learning_rate": 7.99997137641849e-06, "loss": -0.0, "num_tokens": 9185715.0, "reward": 2.4274468421936035, "reward_std": 0.30020296573638916, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9728903770446777, "rewards/probe_shaping_dominance/std": 0.10675826668739319, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.46419334411621094, "rewards/rollout_reward_func/std": 0.211602121591568, "sampling/importance_sampling_ratio/max": 1.1768231391906738, "sampling/importance_sampling_ratio/mean": 0.9632406830787659, "sampling/importance_sampling_ratio/min": 0.32605040073394775, "sampling/sampling_logp_difference/max": 1.1148320436477661, "sampling/sampling_logp_difference/mean": 0.02662883885204792, "step": 175, "step_time": 27.599337874999037 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.0704325451515615, "epoch": 0.00352, "grad_norm": 0.004202236421406269, "kl": 0.2313449110952206, "learning_rate": 7.999970963087587e-06, "loss": -0.0, "step": 176, "step_time": 11.622392715999013 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13769991835579276, "epoch": 0.00354, "frac_reward_zero_std": 0.0, "grad_norm": 0.008693602867424488, "kl": 0.1878440118744038, "learning_rate": 7.99997054679376e-06, "loss": -0.0001, "num_tokens": 9289277.0, "reward": 2.358966588973999, "reward_std": 0.3925982713699341, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9915453791618347, "rewards/probe_shaping_dominance/std": 0.04782645031809807, "rewards/probe_terminal_raw/mean": 0.006986788474023342, "rewards/probe_terminal_raw/std": 0.03952324390411377, "rewards/rollout_reward_func/mean": -0.4958154261112213, "rewards/rollout_reward_func/std": 0.18107342720031738, "sampling/importance_sampling_ratio/max": 1.5426419973373413, "sampling/importance_sampling_ratio/mean": 0.9988285303115845, "sampling/importance_sampling_ratio/min": 0.43416687846183777, "sampling/sampling_logp_difference/max": 0.5040676593780518, "sampling/sampling_logp_difference/mean": 0.04200742021203041, "step": 177, "step_time": 27.01749301200016 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04375000111758709, "entropy": 0.13739392068237066, "epoch": 0.00356, "grad_norm": 0.00472621712833643, "kl": 0.1952200917294249, "learning_rate": 7.999970127537005e-06, "loss": -0.0001, "step": 178, "step_time": 12.335309556999164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07692393008619547, "epoch": 0.00358, "frac_reward_zero_std": 0.0, "grad_norm": 0.028508227318525314, "kl": 5.385302404543381, "learning_rate": 7.999969705317325e-06, "loss": 0.0001, "num_tokens": 9389166.0, "reward": 2.4562783241271973, "reward_std": 0.2598528265953064, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4624716639518738, "rewards/rollout_reward_func/std": 0.18222831189632416, "sampling/importance_sampling_ratio/max": 1.1737666130065918, "sampling/importance_sampling_ratio/mean": 0.9517749547958374, "sampling/importance_sampling_ratio/min": 0.2871549129486084, "sampling/sampling_logp_difference/max": 1.2477340698242188, "sampling/sampling_logp_difference/mean": 0.038661930710077286, "step": 179, "step_time": 26.827888970999993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.08128447085618973, "epoch": 0.0036, "grad_norm": 0.00963876023888588, "kl": 2.0179060684172327, "learning_rate": 7.99996928013472e-06, "loss": 0.0001, "step": 180, "step_time": 11.37347329900058 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10767973656766117, "epoch": 0.00362, "frac_reward_zero_std": 0.0, "grad_norm": 0.010538225993514061, "kl": 1.1300342498579994, "learning_rate": 7.999968851989192e-06, "loss": 0.0, "num_tokens": 9494689.0, "reward": 2.297545909881592, "reward_std": 0.3879827558994293, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.98197340965271, "rewards/probe_shaping_dominance/std": 0.10197389870882034, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5250524282455444, "rewards/rollout_reward_func/std": 0.19950900971889496, "sampling/importance_sampling_ratio/max": 1.4360560178756714, "sampling/importance_sampling_ratio/mean": 0.9875404834747314, "sampling/importance_sampling_ratio/min": 0.18539370596408844, "sampling/sampling_logp_difference/max": 1.6852741241455078, "sampling/sampling_logp_difference/mean": 0.049665287137031555, "step": 181, "step_time": 26.69406858900038 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.11389242531731725, "epoch": 0.00364, "grad_norm": 0.003970544785261154, "kl": 0.6293696188367903, "learning_rate": 7.999968420880736e-06, "loss": 0.0, "step": 182, "step_time": 12.197639549000996 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09248453052714467, "epoch": 0.00366, "frac_reward_zero_std": 0.0, "grad_norm": 0.0045459093526005745, "kl": 0.13789485239249188, "learning_rate": 7.99996798680936e-06, "loss": -0.0001, "num_tokens": 9599380.0, "reward": 2.4226768016815186, "reward_std": 0.3249405324459076, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9769073724746704, "rewards/probe_shaping_dominance/std": 0.09389247745275497, "rewards/probe_terminal_raw/mean": 0.024263210594654083, "rewards/probe_terminal_raw/std": 0.09960746020078659, "rewards/rollout_reward_func/mean": -0.4659937620162964, "rewards/rollout_reward_func/std": 0.19758032262325287, "sampling/importance_sampling_ratio/max": 1.1653671264648438, "sampling/importance_sampling_ratio/mean": 0.9370558261871338, "sampling/importance_sampling_ratio/min": 0.46233388781547546, "sampling/sampling_logp_difference/max": 0.7714686393737793, "sampling/sampling_logp_difference/mean": 0.038370583206415176, "step": 183, "step_time": 26.904572651000308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0933451559394598, "epoch": 0.00368, "grad_norm": 0.004598891828209162, "kl": 0.12106670817593113, "learning_rate": 7.999967549775057e-06, "loss": -0.0001, "step": 184, "step_time": 11.607436572001461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.15387224033474922, "epoch": 0.0037, "frac_reward_zero_std": 0.0, "grad_norm": 0.010245956480503082, "kl": 0.5030446688178927, "learning_rate": 7.999967109777834e-06, "loss": -0.0, "num_tokens": 9707382.0, "reward": 2.4315314292907715, "reward_std": 0.47317853569984436, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.9855233430862427, "rewards/probe_shaping_dominance/std": 0.0818924754858017, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4883667528629303, "rewards/rollout_reward_func/std": 0.20319455862045288, "sampling/importance_sampling_ratio/max": 1.2542879581451416, "sampling/importance_sampling_ratio/mean": 0.9586943984031677, "sampling/importance_sampling_ratio/min": 0.3715563118457794, "sampling/sampling_logp_difference/max": 0.9900554418563843, "sampling/sampling_logp_difference/mean": 0.04447564482688904, "step": 185, "step_time": 27.28605421100019 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.056250001303851604, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06875000149011612, "entropy": 0.15163114294409752, "epoch": 0.00372, "grad_norm": 0.0044283876195549965, "kl": 0.7128359689377248, "learning_rate": 7.999966666817687e-06, "loss": -0.0, "step": 186, "step_time": 12.221499876998678 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1375539805740118, "epoch": 0.00374, "frac_reward_zero_std": 0.0, "grad_norm": 0.006928480230271816, "kl": 0.14416655764216557, "learning_rate": 7.999966220894617e-06, "loss": -0.0, "num_tokens": 9814422.0, "reward": 2.40926456451416, "reward_std": 0.47349250316619873, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9848357439041138, "rewards/probe_shaping_dominance/std": 0.08578190207481384, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.3849461078643799, "rewards/rollout_reward_func/std": 0.28888392448425293, "sampling/importance_sampling_ratio/max": 1.243560791015625, "sampling/importance_sampling_ratio/mean": 0.9681116342544556, "sampling/importance_sampling_ratio/min": 0.665830671787262, "sampling/sampling_logp_difference/max": 0.37914347648620605, "sampling/sampling_logp_difference/mean": 0.03084658458828926, "step": 187, "step_time": 28.931869071998335 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.13996880408376455, "epoch": 0.00376, "grad_norm": 0.009369016624987125, "kl": 0.15229893615469337, "learning_rate": 7.999965772008627e-06, "loss": -0.0, "step": 188, "step_time": 11.766830096999001 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10358174092834815, "epoch": 0.00378, "frac_reward_zero_std": 0.0, "grad_norm": 0.03559152036905289, "kl": 0.39252137734001735, "learning_rate": 7.999965320159715e-06, "loss": 0.0, "num_tokens": 9914246.0, "reward": 2.483328342437744, "reward_std": 0.3890749216079712, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.950668215751648, "rewards/probe_shaping_dominance/std": 0.1378525346517563, "rewards/probe_terminal_raw/mean": 0.056783534586429596, "rewards/probe_terminal_raw/std": 0.15526829659938812, "rewards/rollout_reward_func/mean": -0.44287341833114624, "rewards/rollout_reward_func/std": 0.26299041509628296, "sampling/importance_sampling_ratio/max": 1.2944039106369019, "sampling/importance_sampling_ratio/mean": 0.9779493808746338, "sampling/importance_sampling_ratio/min": 0.5075531005859375, "sampling/sampling_logp_difference/max": 0.6781981587409973, "sampling/sampling_logp_difference/mean": 0.026978708803653717, "step": 189, "step_time": 27.036868832000437 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.10481282410910353, "epoch": 0.0038, "grad_norm": 0.0055263713002204895, "kl": 0.39047255569312256, "learning_rate": 7.999964865347883e-06, "loss": 0.0001, "step": 190, "step_time": 11.940458628999295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1198381851427257, "epoch": 0.00382, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030459309928119183, "kl": 0.34787876208429225, "learning_rate": 7.999964407573131e-06, "loss": 0.0, "num_tokens": 10017338.0, "reward": 2.2820868492126465, "reward_std": 0.4749685525894165, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.966022253036499, "rewards/probe_shaping_dominance/std": 0.13379566371440887, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.47768545150756836, "rewards/rollout_reward_func/std": 0.2837761640548706, "sampling/importance_sampling_ratio/max": 1.7857273817062378, "sampling/importance_sampling_ratio/mean": 1.0156748294830322, "sampling/importance_sampling_ratio/min": 0.514444887638092, "sampling/sampling_logp_difference/max": 0.6646687984466553, "sampling/sampling_logp_difference/mean": 0.03443087264895439, "step": 191, "step_time": 27.4936487089999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.12180041195824742, "epoch": 0.00384, "grad_norm": 0.009600832127034664, "kl": 0.3496675969581702, "learning_rate": 7.999963946835458e-06, "loss": 0.0, "step": 192, "step_time": 11.71437842100022 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07277914439328015, "epoch": 0.00386, "frac_reward_zero_std": 0.0, "grad_norm": 0.005412722937762737, "kl": 0.6871760921980012, "learning_rate": 7.999963483134866e-06, "loss": 0.0001, "num_tokens": 10123551.0, "reward": 2.4312024116516113, "reward_std": 0.31741824746131897, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.954460859298706, "rewards/probe_shaping_dominance/std": 0.1439775824546814, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.48888325691223145, "rewards/rollout_reward_func/std": 0.2712078392505646, "sampling/importance_sampling_ratio/max": 1.8100159168243408, "sampling/importance_sampling_ratio/mean": 1.0015695095062256, "sampling/importance_sampling_ratio/min": 0.4417291283607483, "sampling/sampling_logp_difference/max": 0.817058801651001, "sampling/sampling_logp_difference/mean": 0.03453746810555458, "step": 193, "step_time": 26.960456193001846 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "entropy": 0.07823239883873612, "epoch": 0.00388, "grad_norm": 0.01935429498553276, "kl": 0.6307496229807157, "learning_rate": 7.999963016471355e-06, "loss": 0.0001, "step": 194, "step_time": 12.808481609999944 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09379608882591128, "epoch": 0.0039, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030668650288134813, "kl": 0.2814688477665186, "learning_rate": 7.999962546844924e-06, "loss": 0.0001, "num_tokens": 10225590.0, "reward": 2.361347198486328, "reward_std": 0.32310429215431213, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9668688178062439, "rewards/probe_shaping_dominance/std": 0.13289061188697815, "rewards/probe_terminal_raw/mean": 0.028328251093626022, "rewards/probe_terminal_raw/std": 0.11210102587938309, "rewards/rollout_reward_func/mean": -0.49010002613067627, "rewards/rollout_reward_func/std": 0.24613085389137268, "sampling/importance_sampling_ratio/max": 1.3004266023635864, "sampling/importance_sampling_ratio/mean": 0.9684375524520874, "sampling/importance_sampling_ratio/min": 0.5094537734985352, "sampling/sampling_logp_difference/max": 0.6744171380996704, "sampling/sampling_logp_difference/mean": 0.028151309117674828, "step": 195, "step_time": 25.599357043000964 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.09113423456437886, "epoch": 0.00392, "grad_norm": 0.003771732561290264, "kl": 0.27635849734906515, "learning_rate": 7.999962074255578e-06, "loss": 0.0001, "step": 196, "step_time": 11.204666337999697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04920864764972066, "epoch": 0.00394, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027844668366014957, "kl": 0.3839081407932099, "learning_rate": 7.999961598703312e-06, "loss": -0.0, "num_tokens": 10330063.0, "reward": 2.415410041809082, "reward_std": 0.4154632091522217, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9844269156455994, "rewards/probe_shaping_dominance/std": 0.08809469640254974, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.44089192152023315, "rewards/rollout_reward_func/std": 0.2551630139350891, "sampling/importance_sampling_ratio/max": 1.1653680801391602, "sampling/importance_sampling_ratio/mean": 0.9744973182678223, "sampling/importance_sampling_ratio/min": 0.20111165940761566, "sampling/sampling_logp_difference/max": 1.6039009094238281, "sampling/sampling_logp_difference/mean": 0.030936850234866142, "step": 197, "step_time": 26.98998093100181 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.0472417699656944, "epoch": 0.00396, "grad_norm": 0.0009492259123362601, "kl": 0.3996036083844956, "learning_rate": 7.99996112018813e-06, "loss": -0.0, "step": 198, "step_time": 12.02342930299983 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06072818394750357, "epoch": 0.00398, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025375511031597853, "kl": 0.2914491758947406, "learning_rate": 7.999960638710032e-06, "loss": 0.0, "num_tokens": 10431419.0, "reward": 2.499394178390503, "reward_std": 0.29632288217544556, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9628201127052307, "rewards/probe_shaping_dominance/std": 0.12069481611251831, "rewards/probe_terminal_raw/mean": 0.04026930779218674, "rewards/probe_terminal_raw/std": 0.13092826306819916, "rewards/rollout_reward_func/mean": -0.42244523763656616, "rewards/rollout_reward_func/std": 0.24739933013916016, "sampling/importance_sampling_ratio/max": 1.3507100343704224, "sampling/importance_sampling_ratio/mean": 1.0147151947021484, "sampling/importance_sampling_ratio/min": 0.9091832637786865, "sampling/sampling_logp_difference/max": 0.338870108127594, "sampling/sampling_logp_difference/mean": 0.010294873267412186, "step": 199, "step_time": 27.086743224999736 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.058446566108614206, "epoch": 0.004, "grad_norm": 0.0024834321811795235, "kl": 0.2936624846115592, "learning_rate": 7.999960154269017e-06, "loss": 0.0, "step": 200, "step_time": 11.463394613998389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09987628925591707, "epoch": 0.00402, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032795185688883066, "kl": 0.33637799334246665, "learning_rate": 7.999959666865086e-06, "loss": -0.0, "num_tokens": 10533498.0, "reward": 2.4651217460632324, "reward_std": 0.32078394293785095, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9926146268844604, "rewards/probe_shaping_dominance/std": 0.04177792742848396, "rewards/probe_terminal_raw/mean": 0.008003048598766327, "rewards/probe_terminal_raw/std": 0.04527207836508751, "rewards/rollout_reward_func/mean": -0.4229958653450012, "rewards/rollout_reward_func/std": 0.19672146439552307, "sampling/importance_sampling_ratio/max": 1.195106863975525, "sampling/importance_sampling_ratio/mean": 0.9418940544128418, "sampling/importance_sampling_ratio/min": 0.318993479013443, "sampling/sampling_logp_difference/max": 0.9258831739425659, "sampling/sampling_logp_difference/mean": 0.038004204630851746, "step": 201, "step_time": 26.624555751001026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.10252866102382541, "epoch": 0.00404, "grad_norm": 0.0035051219165325165, "kl": 0.3395325805176981, "learning_rate": 7.99995917649824e-06, "loss": -0.0, "step": 202, "step_time": 12.736442242999146 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10787439718842506, "epoch": 0.00406, "frac_reward_zero_std": 0.0, "grad_norm": 0.00344108697026968, "kl": 0.40611333276319783, "learning_rate": 7.999958683168479e-06, "loss": 0.0, "num_tokens": 10637062.0, "reward": 2.5038881301879883, "reward_std": 0.22744759917259216, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4148617684841156, "rewards/rollout_reward_func/std": 0.18833006918430328, "sampling/importance_sampling_ratio/max": 1.1548116207122803, "sampling/importance_sampling_ratio/mean": 0.9753589630126953, "sampling/importance_sampling_ratio/min": 0.7033773064613342, "sampling/sampling_logp_difference/max": 0.35186219215393066, "sampling/sampling_logp_difference/mean": 0.019522543996572495, "step": 203, "step_time": 26.715982574999543 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.1049330742098391, "epoch": 0.00408, "grad_norm": 0.0019796311389654875, "kl": 0.4593061124905944, "learning_rate": 7.999958186875805e-06, "loss": -0.0, "step": 204, "step_time": 11.646448757999678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.055373367242282256, "epoch": 0.0041, "frac_reward_zero_std": 0.0, "grad_norm": 0.006926523055881262, "kl": 0.05150494979155518, "learning_rate": 7.999957687620215e-06, "loss": -0.0, "num_tokens": 10738428.0, "reward": 2.550138473510742, "reward_std": 0.22538912296295166, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9910282492637634, "rewards/probe_shaping_dominance/std": 0.05075191706418991, "rewards/probe_terminal_raw/mean": 0.00940040685236454, "rewards/probe_terminal_raw/std": 0.05317673459649086, "rewards/rollout_reward_func/mean": -0.4002901315689087, "rewards/rollout_reward_func/std": 0.22546610236167908, "sampling/importance_sampling_ratio/max": 1.2517437934875488, "sampling/importance_sampling_ratio/mean": 0.9799097180366516, "sampling/importance_sampling_ratio/min": 0.5997620224952698, "sampling/sampling_logp_difference/max": 0.5112212896347046, "sampling/sampling_logp_difference/mean": 0.01782449334859848, "step": 205, "step_time": 26.197072295999533 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.06316551179043017, "epoch": 0.00412, "grad_norm": 0.0017257543513551354, "kl": 0.053950335964449536, "learning_rate": 7.999957185401714e-06, "loss": -0.0, "step": 206, "step_time": 12.549622151999756 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11197321023792028, "epoch": 0.00414, "frac_reward_zero_std": 0.0, "grad_norm": 0.031097499653697014, "kl": 3.89400917571038, "learning_rate": 7.9999566802203e-06, "loss": 0.0001, "num_tokens": 10840689.0, "reward": 2.345735549926758, "reward_std": 0.5137441754341125, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.9725180268287659, "rewards/probe_shaping_dominance/std": 0.10821773111820221, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.5142826437950134, "rewards/rollout_reward_func/std": 0.19152681529521942, "sampling/importance_sampling_ratio/max": 1.9924818277359009, "sampling/importance_sampling_ratio/mean": 0.9943416118621826, "sampling/importance_sampling_ratio/min": 0.39203470945358276, "sampling/sampling_logp_difference/max": 0.9361467361450195, "sampling/sampling_logp_difference/mean": 0.053312450647354126, "step": 207, "step_time": 26.643122880999726 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.035416667349636555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.058333334513008595, "entropy": 0.11182145914062858, "epoch": 0.00416, "grad_norm": 0.007240073289722204, "kl": 1.6443076208233833, "learning_rate": 7.999956172075974e-06, "loss": 0.0, "step": 208, "step_time": 11.64378536300228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12927352613769472, "epoch": 0.00418, "frac_reward_zero_std": 0.0, "grad_norm": 0.004229975864291191, "kl": 0.6016647743063004, "learning_rate": 7.999955660968735e-06, "loss": -0.0, "num_tokens": 10944113.0, "reward": 2.364624261856079, "reward_std": 0.36824679374694824, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.49162572622299194, "rewards/rollout_reward_func/std": 0.21871674060821533, "sampling/importance_sampling_ratio/max": 1.3223011493682861, "sampling/importance_sampling_ratio/mean": 0.9632259607315063, "sampling/importance_sampling_ratio/min": 0.3602616786956787, "sampling/sampling_logp_difference/max": 0.6850378513336182, "sampling/sampling_logp_difference/mean": 0.04301746189594269, "step": 209, "step_time": 26.264724693000062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.12698473082855344, "epoch": 0.0042, "grad_norm": 0.004611098673194647, "kl": 0.6409582832593514, "learning_rate": 7.999955146898586e-06, "loss": -0.0001, "step": 210, "step_time": 12.728916892999223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04340869339648634, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008402821840718389, "kl": 0.035792879805057964, "learning_rate": 7.999954629865525e-06, "loss": -0.0, "num_tokens": 11047946.0, "reward": 2.3281283378601074, "reward_std": 0.43589621782302856, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9857661724090576, "rewards/probe_shaping_dominance/std": 0.08051877468824387, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4982629418373108, "rewards/rollout_reward_func/std": 0.20851053297519684, "sampling/importance_sampling_ratio/max": 1.0012203454971313, "sampling/importance_sampling_ratio/mean": 0.9677799940109253, "sampling/importance_sampling_ratio/min": 0.4670157730579376, "sampling/sampling_logp_difference/max": 0.7613925933837891, "sampling/sampling_logp_difference/mean": 0.014532409608364105, "step": 211, "step_time": 26.491312149000805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04452452051918954, "epoch": 0.00424, "grad_norm": 0.0009245733381249011, "kl": 0.039327465879523515, "learning_rate": 7.999954109869554e-06, "loss": -0.0, "step": 212, "step_time": 11.690953868999713 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10158436209894717, "epoch": 0.00426, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037761996500194073, "kl": 0.43266808055341244, "learning_rate": 7.999953586910674e-06, "loss": -0.0, "num_tokens": 11155145.0, "reward": 2.33209490776062, "reward_std": 0.3974522352218628, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9518005847930908, "rewards/probe_shaping_dominance/std": 0.15248610079288483, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.4603305459022522, "rewards/rollout_reward_func/std": 0.2795467674732208, "sampling/importance_sampling_ratio/max": 1.5568536520004272, "sampling/importance_sampling_ratio/mean": 1.0121254920959473, "sampling/importance_sampling_ratio/min": 0.6084503531455994, "sampling/sampling_logp_difference/max": 0.49602431058883667, "sampling/sampling_logp_difference/mean": 0.017653338611125946, "step": 213, "step_time": 26.773649626000406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10326709412038326, "epoch": 0.00428, "grad_norm": 0.004299989901483059, "kl": 0.4246340822428465, "learning_rate": 7.999953060988884e-06, "loss": 0.0, "step": 214, "step_time": 12.393828191000466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10672931908629835, "epoch": 0.0043, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029812948778271675, "kl": 0.5036190063692629, "learning_rate": 7.999952532104185e-06, "loss": 0.0, "num_tokens": 11256499.0, "reward": 2.3668174743652344, "reward_std": 0.4220028221607208, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.984410285949707, "rewards/probe_shaping_dominance/std": 0.08818867057561874, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.458217978477478, "rewards/rollout_reward_func/std": 0.1993042230606079, "sampling/importance_sampling_ratio/max": 1.2048288583755493, "sampling/importance_sampling_ratio/mean": 0.9700103998184204, "sampling/importance_sampling_ratio/min": 0.2804865837097168, "sampling/sampling_logp_difference/max": 1.2170777320861816, "sampling/sampling_logp_difference/mean": 0.027440235018730164, "step": 215, "step_time": 26.241349470000387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.10663612652570009, "epoch": 0.00432, "grad_norm": 0.0025962339714169502, "kl": 0.514960631611757, "learning_rate": 7.99995200025658e-06, "loss": 0.0, "step": 216, "step_time": 11.455195212000945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.13120519556105137, "epoch": 0.00434, "frac_reward_zero_std": 0.0, "grad_norm": 0.00685320096090436, "kl": 0.5306107758951839, "learning_rate": 7.999951465446065e-06, "loss": 0.0, "num_tokens": 11358760.0, "reward": 2.4137301445007324, "reward_std": 0.38182157278060913, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9719860553741455, "rewards/probe_shaping_dominance/std": 0.1105431467294693, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.4770059883594513, "rewards/rollout_reward_func/std": 0.27089011669158936, "sampling/importance_sampling_ratio/max": 1.8946123123168945, "sampling/importance_sampling_ratio/mean": 1.0106232166290283, "sampling/importance_sampling_ratio/min": 0.6873172521591187, "sampling/sampling_logp_difference/max": 0.6602880954742432, "sampling/sampling_logp_difference/mean": 0.026765936985611916, "step": 217, "step_time": 28.19653884499894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.1328302058391273, "epoch": 0.00436, "grad_norm": 0.006467514205724001, "kl": 0.5236879177391529, "learning_rate": 7.999950927672645e-06, "loss": 0.0, "step": 218, "step_time": 11.548230411000986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0558876832947135, "epoch": 0.00438, "frac_reward_zero_std": 0.0, "grad_norm": 0.002873801626265049, "kl": 0.43705418131622764, "learning_rate": 7.999950386936317e-06, "loss": 0.0001, "num_tokens": 11459134.0, "reward": 2.4926953315734863, "reward_std": 0.2576614320278168, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9532788395881653, "rewards/probe_shaping_dominance/std": 0.1259964108467102, "rewards/probe_terminal_raw/mean": 0.049288615584373474, "rewards/probe_terminal_raw/std": 0.13439743220806122, "rewards/rollout_reward_func/mean": -0.4286222755908966, "rewards/rollout_reward_func/std": 0.13808076083660126, "sampling/importance_sampling_ratio/max": 2.167020320892334, "sampling/importance_sampling_ratio/mean": 1.0488494634628296, "sampling/importance_sampling_ratio/min": 0.5981054306030273, "sampling/sampling_logp_difference/max": 0.773352861404419, "sampling/sampling_logp_difference/mean": 0.021742573007941246, "step": 219, "step_time": 26.59079552000003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05215576570481062, "epoch": 0.0044, "grad_norm": 0.013386573642492294, "kl": 0.4328960892962641, "learning_rate": 7.999949843237083e-06, "loss": 0.0001, "step": 220, "step_time": 11.575578054999824 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10125815495848656, "epoch": 0.00442, "frac_reward_zero_std": 0.0, "grad_norm": 0.003916088026016951, "kl": 0.22720737754934817, "learning_rate": 7.999949296574944e-06, "loss": 0.0, "num_tokens": 11564110.0, "reward": 2.5024495124816895, "reward_std": 0.21472422778606415, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 2.0, "rewards/probe_completion_length/std": 0.0, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9687739014625549, "rewards/probe_shaping_dominance/std": 0.12289554625749588, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.4475744962692261, "rewards/rollout_reward_func/std": 0.21471136808395386, "sampling/importance_sampling_ratio/max": 1.2565096616744995, "sampling/importance_sampling_ratio/mean": 0.9851142168045044, "sampling/importance_sampling_ratio/min": 0.7785980701446533, "sampling/sampling_logp_difference/max": 0.25026071071624756, "sampling/sampling_logp_difference/mean": 0.014336168766021729, "step": 221, "step_time": 28.308517722000943 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.10386610007844865, "epoch": 0.00444, "grad_norm": 0.0038715400733053684, "kl": 0.2309217918664217, "learning_rate": 7.9999487469499e-06, "loss": 0.0, "step": 222, "step_time": 11.59164219199829 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08839935716241598, "epoch": 0.00446, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031392991077154875, "kl": 0.3969584498627228, "learning_rate": 7.999948194361951e-06, "loss": 0.0, "num_tokens": 11670791.0, "reward": 2.504007339477539, "reward_std": 0.40813401341438293, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.4459925591945648, "rewards/rollout_reward_func/std": 0.22923637926578522, "sampling/importance_sampling_ratio/max": 1.2424126863479614, "sampling/importance_sampling_ratio/mean": 1.0054875612258911, "sampling/importance_sampling_ratio/min": 0.8022926449775696, "sampling/sampling_logp_difference/max": 0.2571254372596741, "sampling/sampling_logp_difference/mean": 0.01522812806069851, "step": 223, "step_time": 27.01184939599989 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.08901654137298465, "epoch": 0.00448, "grad_norm": 0.0026675413828343153, "kl": 0.3970091380215308, "learning_rate": 7.999947638811098e-06, "loss": 0.0, "step": 224, "step_time": 12.880684480999662 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06860345043241978, "epoch": 0.0045, "frac_reward_zero_std": 0.0, "grad_norm": 0.005898882634937763, "kl": 0.2994147054851055, "learning_rate": 7.999947080297344e-06, "loss": 0.0001, "num_tokens": 11778059.0, "reward": 2.442521095275879, "reward_std": 0.44092267751693726, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.3535533845424652, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5387288331985474, "rewards/rollout_reward_func/std": 0.17624567449092865, "sampling/importance_sampling_ratio/max": 1.9132263660430908, "sampling/importance_sampling_ratio/mean": 1.0267926454544067, "sampling/importance_sampling_ratio/min": 0.2760489583015442, "sampling/sampling_logp_difference/max": 1.2855275869369507, "sampling/sampling_logp_difference/mean": 0.03292452543973923, "step": 225, "step_time": 26.894577987999583 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.0715375836007297, "epoch": 0.00452, "grad_norm": 0.004127421882003546, "kl": 0.2991956745972857, "learning_rate": 7.999946518820686e-06, "loss": 0.0001, "step": 226, "step_time": 11.7451522450001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07084862189367414, "epoch": 0.00454, "frac_reward_zero_std": 0.0, "grad_norm": 0.007534612435847521, "kl": 0.3083134523330955, "learning_rate": 7.999945954381125e-06, "loss": -0.0, "num_tokens": 11885416.0, "reward": 2.2896175384521484, "reward_std": 0.4199885129928589, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9857305884361267, "rewards/probe_shaping_dominance/std": 0.080719955265522, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.5367380380630493, "rewards/rollout_reward_func/std": 0.2644577920436859, "sampling/importance_sampling_ratio/max": 1.2167645692825317, "sampling/importance_sampling_ratio/mean": 0.9729256629943848, "sampling/importance_sampling_ratio/min": 0.5702285766601562, "sampling/sampling_logp_difference/max": 0.556563138961792, "sampling/sampling_logp_difference/mean": 0.01854308322072029, "step": 227, "step_time": 26.478597906999312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07101618498563766, "epoch": 0.00456, "grad_norm": 0.005244475323706865, "kl": 0.275350460462505, "learning_rate": 7.999945386978663e-06, "loss": -0.0, "step": 228, "step_time": 12.815234450999014 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.10753743472741917, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 0.002793548395857215, "kl": 0.3363812413687519, "learning_rate": 7.999944816613299e-06, "loss": 0.0, "num_tokens": 11990346.0, "reward": 2.4647884368896484, "reward_std": 0.3218696117401123, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9882341623306274, "rewards/probe_shaping_dominance/std": 0.06655776500701904, "rewards/probe_terminal_raw/mean": 0.011559959501028061, "rewards/probe_terminal_raw/std": 0.06539300829172134, "rewards/rollout_reward_func/mean": -0.45375561714172363, "rewards/rollout_reward_func/std": 0.26721474528312683, "sampling/importance_sampling_ratio/max": 1.7522544860839844, "sampling/importance_sampling_ratio/mean": 1.0056817531585693, "sampling/importance_sampling_ratio/min": 0.39151322841644287, "sampling/sampling_logp_difference/max": 0.9377517700195312, "sampling/sampling_logp_difference/mean": 0.030310627073049545, "step": 229, "step_time": 26.652824122999846 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.10178712871856987, "epoch": 0.0046, "grad_norm": 0.0023058054503053427, "kl": 0.3472972925131521, "learning_rate": 7.999944243285035e-06, "loss": 0.0, "step": 230, "step_time": 11.641791465999631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11396907176822424, "epoch": 0.00462, "frac_reward_zero_std": 0.0, "grad_norm": 0.0053448486141860485, "kl": 0.23751085135154426, "learning_rate": 7.999943666993872e-06, "loss": -0.0, "num_tokens": 12094123.0, "reward": 2.3231983184814453, "reward_std": 0.4537913501262665, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9484962224960327, "rewards/probe_shaping_dominance/std": 0.14161793887615204, "rewards/probe_terminal_raw/mean": 0.05538617819547653, "rewards/probe_terminal_raw/std": 0.15303537249565125, "rewards/rollout_reward_func/mean": -0.4744342267513275, "rewards/rollout_reward_func/std": 0.27888038754463196, "sampling/importance_sampling_ratio/max": 1.2306643724441528, "sampling/importance_sampling_ratio/mean": 0.9789013862609863, "sampling/importance_sampling_ratio/min": 0.5588669180870056, "sampling/sampling_logp_difference/max": 0.5087692737579346, "sampling/sampling_logp_difference/mean": 0.027260489761829376, "step": 231, "step_time": 27.108853302998796 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.11494649667292833, "epoch": 0.00464, "grad_norm": 0.0034225336275994778, "kl": 0.2446515706833452, "learning_rate": 7.999943087739808e-06, "loss": -0.0, "step": 232, "step_time": 12.437156906999007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09780422016046941, "epoch": 0.00466, "frac_reward_zero_std": 0.0, "grad_norm": 0.00331493909470737, "kl": 0.29221273493021727, "learning_rate": 7.999942505522845e-06, "loss": 0.0, "num_tokens": 12202392.0, "reward": 2.31793212890625, "reward_std": 0.4711916446685791, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9865642786026001, "rewards/probe_shaping_dominance/std": 0.07600414007902145, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.44675713777542114, "rewards/rollout_reward_func/std": 0.27934086322784424, "sampling/importance_sampling_ratio/max": 1.2045822143554688, "sampling/importance_sampling_ratio/mean": 0.9702666997909546, "sampling/importance_sampling_ratio/min": 0.5390675067901611, "sampling/sampling_logp_difference/max": 0.6179147958755493, "sampling/sampling_logp_difference/mean": 0.02464653179049492, "step": 233, "step_time": 27.07101158400019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.10010742908343673, "epoch": 0.00468, "grad_norm": 0.00394394900649786, "kl": 0.28515962581150234, "learning_rate": 7.999941920342986e-06, "loss": 0.0, "step": 234, "step_time": 11.908877233997373 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09357268398161978, "epoch": 0.0047, "frac_reward_zero_std": 0.0, "grad_norm": 0.003001472679898143, "kl": 0.4120303535989933, "learning_rate": 7.999941332200228e-06, "loss": 0.0, "num_tokens": 12307473.0, "reward": 2.356600761413574, "reward_std": 0.39092886447906494, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9900838732719421, "rewards/probe_shaping_dominance/std": 0.05609414726495743, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.47410792112350464, "rewards/rollout_reward_func/std": 0.2651825547218323, "sampling/importance_sampling_ratio/max": 1.2125083208084106, "sampling/importance_sampling_ratio/mean": 0.9483182430267334, "sampling/importance_sampling_ratio/min": 0.5642846822738647, "sampling/sampling_logp_difference/max": 0.5796399116516113, "sampling/sampling_logp_difference/mean": 0.029487669467926025, "step": 235, "step_time": 27.473293748998913 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.08885149616980925, "epoch": 0.00472, "grad_norm": 0.004106747917830944, "kl": 0.39987785345859805, "learning_rate": 7.999940741094573e-06, "loss": 0.0, "step": 236, "step_time": 11.607714889999443 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0416666679084301, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0790116679854691, "epoch": 0.00474, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017510356847196817, "kl": 0.49183082331728656, "learning_rate": 7.999940147026021e-06, "loss": 0.0, "num_tokens": 12410261.0, "reward": 2.362030029296875, "reward_std": 0.48628348112106323, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9550326466560364, "rewards/probe_shaping_dominance/std": 0.1423776000738144, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.464877724647522, "rewards/rollout_reward_func/std": 0.2927810847759247, "sampling/importance_sampling_ratio/max": 1.2767555713653564, "sampling/importance_sampling_ratio/mean": 1.0007102489471436, "sampling/importance_sampling_ratio/min": 0.5674677491188049, "sampling/sampling_logp_difference/max": 0.564541220664978, "sampling/sampling_logp_difference/mean": 0.017719101160764694, "step": 237, "step_time": 26.277223889999732 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 0.08091688924469054, "epoch": 0.00476, "grad_norm": 0.0037676175124943256, "kl": 0.4987390860915184, "learning_rate": 7.999939549994574e-06, "loss": 0.0, "step": 238, "step_time": 11.42589379400033 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08572797977831215, "epoch": 0.00478, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028349068015813828, "kl": 0.29074460588162765, "learning_rate": 7.99993895000023e-06, "loss": -0.0001, "num_tokens": 12515046.0, "reward": 2.3852663040161133, "reward_std": 0.48509836196899414, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9448926448822021, "rewards/probe_shaping_dominance/std": 0.15274296700954437, "rewards/probe_terminal_raw/mean": 0.05525914579629898, "rewards/probe_terminal_raw/std": 0.15285103023052216, "rewards/rollout_reward_func/mean": -0.43988555669784546, "rewards/rollout_reward_func/std": 0.28072717785835266, "sampling/importance_sampling_ratio/max": 1.2809064388275146, "sampling/importance_sampling_ratio/mean": 0.9681559801101685, "sampling/importance_sampling_ratio/min": 0.417494535446167, "sampling/sampling_logp_difference/max": 0.8734843134880066, "sampling/sampling_logp_difference/mean": 0.02679057978093624, "step": 239, "step_time": 27.850705083998037 }, { "clip_ratio/high_max": 0.05208333395421505, "clip_ratio/high_mean": 0.026041666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 0.09145444841124117, "epoch": 0.0048, "grad_norm": 0.003533316310495138, "kl": 0.276357589289546, "learning_rate": 7.999938347042993e-06, "loss": -0.0001, "step": 240, "step_time": 11.650785684000766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05321495997486636, "epoch": 0.00482, "frac_reward_zero_std": 0.0, "grad_norm": 0.002123113488778472, "kl": 0.1996255109550784, "learning_rate": 7.999937741122862e-06, "loss": 0.0, "num_tokens": 12618608.0, "reward": 2.31355619430542, "reward_std": 0.3297788202762604, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5426939129829407, "rewards/rollout_reward_func/std": 0.22457517683506012, "sampling/importance_sampling_ratio/max": 1.1050293445587158, "sampling/importance_sampling_ratio/mean": 1.0058460235595703, "sampling/importance_sampling_ratio/min": 0.9022819995880127, "sampling/sampling_logp_difference/max": 0.10648787021636963, "sampling/sampling_logp_difference/mean": 0.005735831335186958, "step": 241, "step_time": 26.73763404300007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.05486724083311856, "epoch": 0.00484, "grad_norm": 0.003093272214755416, "kl": 0.1941228064047955, "learning_rate": 7.999937132239836e-06, "loss": 0.0, "step": 242, "step_time": 11.670754389999274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07577884336933494, "epoch": 0.00486, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036162908654659986, "kl": 0.4399729967590247, "learning_rate": 7.999936520393918e-06, "loss": 0.0, "num_tokens": 12726447.0, "reward": 2.3645379543304443, "reward_std": 0.41120022535324097, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9753913879394531, "rewards/probe_shaping_dominance/std": 0.09771986305713654, "rewards/probe_terminal_raw/mean": 0.0260416679084301, "rewards/probe_terminal_raw/std": 0.1046360433101654, "rewards/rollout_reward_func/mean": -0.4618951678276062, "rewards/rollout_reward_func/std": 0.1977241188287735, "sampling/importance_sampling_ratio/max": 1.1149406433105469, "sampling/importance_sampling_ratio/mean": 0.9780128002166748, "sampling/importance_sampling_ratio/min": 0.7354345321655273, "sampling/sampling_logp_difference/max": 0.18633489310741425, "sampling/sampling_logp_difference/mean": 0.013524588197469711, "step": 243, "step_time": 27.977090622001015 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.07133703003637493, "epoch": 0.00488, "grad_norm": 0.002898427424952388, "kl": 0.44227540418796707, "learning_rate": 7.999935905585108e-06, "loss": 0.0, "step": 244, "step_time": 11.75723793999805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0315001527142158, "epoch": 0.0049, "frac_reward_zero_std": 0.0, "grad_norm": 0.001392417005263269, "kl": 0.23886053822934628, "learning_rate": 7.999935287813407e-06, "loss": -0.0, "num_tokens": 12827575.0, "reward": 2.4073498249053955, "reward_std": 0.42101356387138367, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9554626941680908, "rewards/probe_shaping_dominance/std": 0.14310474693775177, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.4199880063533783, "rewards/rollout_reward_func/std": 0.2148957997560501, "sampling/importance_sampling_ratio/max": 1.0394365787506104, "sampling/importance_sampling_ratio/mean": 0.995591402053833, "sampling/importance_sampling_ratio/min": 0.8603565096855164, "sampling/sampling_logp_difference/max": 0.1303640604019165, "sampling/sampling_logp_difference/mean": 0.004159946460276842, "step": 245, "step_time": 26.077412141000423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.032327667491699685, "epoch": 0.00492, "grad_norm": 0.0010727684712037444, "kl": 0.23855953469561797, "learning_rate": 7.999934667078813e-06, "loss": -0.0, "step": 246, "step_time": 11.513740063000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0816163292620331, "epoch": 0.00494, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027896249666810036, "kl": 0.4679242782876827, "learning_rate": 7.999934043381328e-06, "loss": 0.0, "num_tokens": 12935730.0, "reward": 2.46283221244812, "reward_std": 0.36876291036605835, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9373108148574829, "rewards/probe_shaping_dominance/std": 0.1728522628545761, "rewards/probe_terminal_raw/mean": 0.0625, "rewards/probe_terminal_raw/std": 0.16800537705421448, "rewards/rollout_reward_func/mean": -0.3932287096977234, "rewards/rollout_reward_func/std": 0.24200834333896637, "sampling/importance_sampling_ratio/max": 1.2427064180374146, "sampling/importance_sampling_ratio/mean": 1.0063412189483643, "sampling/importance_sampling_ratio/min": 0.8085158467292786, "sampling/sampling_logp_difference/max": 0.21965795755386353, "sampling/sampling_logp_difference/mean": 0.01280665211379528, "step": 247, "step_time": 28.041720137000084 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.08206989825703204, "epoch": 0.00496, "grad_norm": 0.00293480372056365, "kl": 0.46830739825963974, "learning_rate": 7.999933416720957e-06, "loss": 0.0, "step": 248, "step_time": 11.713867525000751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06533269377541728, "epoch": 0.00498, "frac_reward_zero_std": 0.0, "grad_norm": 0.003347411984577775, "kl": 0.36843465792230745, "learning_rate": 7.999932787097692e-06, "loss": 0.0001, "num_tokens": 13041381.0, "reward": 2.382171630859375, "reward_std": 0.4231238067150116, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9549021124839783, "rewards/probe_shaping_dominance/std": 0.14646287262439728, "rewards/probe_terminal_raw/mean": 0.04255589470267296, "rewards/probe_terminal_raw/std": 0.13594815135002136, "rewards/rollout_reward_func/mean": -0.50278639793396, "rewards/rollout_reward_func/std": 0.27676716446876526, "sampling/importance_sampling_ratio/max": 1.3422638177871704, "sampling/importance_sampling_ratio/mean": 0.9941832423210144, "sampling/importance_sampling_ratio/min": 0.6115661263465881, "sampling/sampling_logp_difference/max": 0.4917324185371399, "sampling/sampling_logp_difference/mean": 0.018511097878217697, "step": 249, "step_time": 26.64282015000026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07043309864820912, "epoch": 0.005, "grad_norm": 0.0035562312696129084, "kl": 0.359963540629451, "learning_rate": 7.999932154511542e-06, "loss": 0.0, "step": 250, "step_time": 11.727345789000537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08174855704419315, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 0.003543607424944639, "kl": 0.5413316028789268, "learning_rate": 7.999931518962502e-06, "loss": 0.0, "num_tokens": 13146021.0, "reward": 2.4559497833251953, "reward_std": 0.3885264992713928, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9621438384056091, "rewards/probe_shaping_dominance/std": 0.1238301619887352, "rewards/probe_terminal_raw/mean": 0.03963414579629898, "rewards/probe_terminal_raw/std": 0.12972840666770935, "rewards/rollout_reward_func/mean": -0.40207818150520325, "rewards/rollout_reward_func/std": 0.2555524408817291, "sampling/importance_sampling_ratio/max": 1.1064826250076294, "sampling/importance_sampling_ratio/mean": 0.954660177230835, "sampling/importance_sampling_ratio/min": 0.41962218284606934, "sampling/sampling_logp_difference/max": 0.7979011535644531, "sampling/sampling_logp_difference/mean": 0.023729108273983, "step": 251, "step_time": 27.992850227999952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07990033202804625, "epoch": 0.00504, "grad_norm": 0.003231135895475745, "kl": 0.524783481414488, "learning_rate": 7.999930880450575e-06, "loss": 0.0, "step": 252, "step_time": 11.643585757999972 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.07902092937729321, "epoch": 0.00506, "frac_reward_zero_std": 0.0, "grad_norm": 0.006585233379155397, "kl": 0.37969694038247326, "learning_rate": 7.99993023897576e-06, "loss": 0.0, "num_tokens": 13246298.0, "reward": 2.4005722999572754, "reward_std": 0.3679780662059784, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.974242091178894, "rewards/probe_shaping_dominance/std": 0.10219167917966843, "rewards/probe_terminal_raw/mean": 0.026549797505140305, "rewards/probe_terminal_raw/std": 0.10620416700839996, "rewards/rollout_reward_func/mean": -0.42521971464157104, "rewards/rollout_reward_func/std": 0.21645236015319824, "sampling/importance_sampling_ratio/max": 1.969668984413147, "sampling/importance_sampling_ratio/mean": 1.0500105619430542, "sampling/importance_sampling_ratio/min": 0.7689392566680908, "sampling/sampling_logp_difference/max": 0.6780328750610352, "sampling/sampling_logp_difference/mean": 0.02139047347009182, "step": 253, "step_time": 26.232431414999155 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.07854951097397134, "epoch": 0.00508, "grad_norm": 0.005968212615698576, "kl": 0.3778405386647137, "learning_rate": 7.99992959453806e-06, "loss": 0.0, "step": 254, "step_time": 12.017544923999594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04191483659815276, "epoch": 0.0051, "frac_reward_zero_std": 0.0, "grad_norm": 0.004378916695713997, "kl": 0.3174490866222186, "learning_rate": 7.999928947137475e-06, "loss": -0.0, "num_tokens": 13351235.0, "reward": 2.3821582794189453, "reward_std": 0.4624309539794922, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9869383573532104, "rewards/probe_shaping_dominance/std": 0.07388784736394882, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.41415512561798096, "rewards/rollout_reward_func/std": 0.23873184621334076, "sampling/importance_sampling_ratio/max": 1.257253885269165, "sampling/importance_sampling_ratio/mean": 1.011238932609558, "sampling/importance_sampling_ratio/min": 0.9685202836990356, "sampling/sampling_logp_difference/max": 0.2289290428161621, "sampling/sampling_logp_difference/mean": 0.005532183218747377, "step": 255, "step_time": 28.14665811800114 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.044428632616472896, "epoch": 0.00512, "grad_norm": 0.001523565617389977, "kl": 0.3174588828405831, "learning_rate": 7.999928296774006e-06, "loss": -0.0, "step": 256, "step_time": 11.396023698001045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11287707928568125, "epoch": 0.00514, "frac_reward_zero_std": 0.0, "grad_norm": 0.0049528395757079124, "kl": 0.3751811153779272, "learning_rate": 7.999927643447652e-06, "loss": -0.0001, "num_tokens": 13453732.0, "reward": 2.2990427017211914, "reward_std": 0.4729869067668915, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.8994619250297546, "rewards/probe_shaping_dominance/std": 0.23433801531791687, "rewards/probe_terminal_raw/mean": 0.08892276883125305, "rewards/probe_terminal_raw/std": 0.1897670477628708, "rewards/rollout_reward_func/mean": -0.451841801404953, "rewards/rollout_reward_func/std": 0.3020572066307068, "sampling/importance_sampling_ratio/max": 1.7735323905944824, "sampling/importance_sampling_ratio/mean": 1.0311025381088257, "sampling/importance_sampling_ratio/min": 0.48170769214630127, "sampling/sampling_logp_difference/max": 0.5872056484222412, "sampling/sampling_logp_difference/mean": 0.03187928348779678, "step": 257, "step_time": 27.428863920001277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.11026706825941801, "epoch": 0.00516, "grad_norm": 0.0036789914593100548, "kl": 0.37549637774645817, "learning_rate": 7.999926987158413e-06, "loss": -0.0001, "step": 258, "step_time": 12.307902244997422 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09494227101095021, "epoch": 0.00518, "frac_reward_zero_std": 0.0, "grad_norm": 0.004995207767933607, "kl": 0.5894506504137098, "learning_rate": 7.999926327906292e-06, "loss": 0.0, "num_tokens": 13559320.0, "reward": 2.3814258575439453, "reward_std": 0.36968865990638733, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9921875, "rewards/probe_shaping_dominance/std": 0.04419417306780815, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.4826367497444153, "rewards/rollout_reward_func/std": 0.231715127825737, "sampling/importance_sampling_ratio/max": 1.2988759279251099, "sampling/importance_sampling_ratio/mean": 0.989588737487793, "sampling/importance_sampling_ratio/min": 0.3728586435317993, "sampling/sampling_logp_difference/max": 0.9864900708198547, "sampling/sampling_logp_difference/mean": 0.030208630487322807, "step": 259, "step_time": 28.526762178002173 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.09542630659416318, "epoch": 0.0052, "grad_norm": 0.009572784416377544, "kl": 0.5865388629335939, "learning_rate": 7.999925665691289e-06, "loss": 0.0, "step": 260, "step_time": 11.52395996999985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.042740301505546086, "epoch": 0.00522, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034757580142468214, "kl": 0.16234587341508444, "learning_rate": 7.999925000513405e-06, "loss": 0.0001, "num_tokens": 13662277.0, "reward": 2.3550405502319336, "reward_std": 0.3789060413837433, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9763131737709045, "rewards/probe_shaping_dominance/std": 0.09356633573770523, "rewards/probe_terminal_raw/mean": 0.023119919002056122, "rewards/probe_terminal_raw/std": 0.0910695344209671, "rewards/rollout_reward_func/mean": -0.4693926274776459, "rewards/rollout_reward_func/std": 0.27393800020217896, "sampling/importance_sampling_ratio/max": 1.9132373332977295, "sampling/importance_sampling_ratio/mean": 1.0334219932556152, "sampling/importance_sampling_ratio/min": 0.8748363256454468, "sampling/sampling_logp_difference/max": 0.648794412612915, "sampling/sampling_logp_difference/mean": 0.015361637808382511, "step": 261, "step_time": 27.68740953500128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.043997991022479255, "epoch": 0.00524, "grad_norm": 0.0034889201633632183, "kl": 0.1585660980490502, "learning_rate": 7.999924332372639e-06, "loss": 0.0, "step": 262, "step_time": 12.369422526000562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.06730000481184106, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019632691983133554, "kl": 0.2906430190632818, "learning_rate": 7.999923661268994e-06, "loss": -0.0, "num_tokens": 13768535.0, "reward": 2.461604356765747, "reward_std": 0.28569555282592773, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9790951013565063, "rewards/probe_shaping_dominance/std": 0.08612176775932312, "rewards/probe_terminal_raw/mean": 0.023373983800411224, "rewards/probe_terminal_raw/std": 0.09738598018884659, "rewards/rollout_reward_func/mean": -0.42836469411849976, "rewards/rollout_reward_func/std": 0.21179892122745514, "sampling/importance_sampling_ratio/max": 1.027362585067749, "sampling/importance_sampling_ratio/mean": 0.911888837814331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2283318042755127, "sampling/sampling_logp_difference/mean": 0.04068940505385399, "step": 263, "step_time": 28.125288621000436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.0668873688664462, "epoch": 0.00528, "grad_norm": 0.0020422539673745632, "kl": 0.30596065653662663, "learning_rate": 7.999922987202466e-06, "loss": -0.0, "step": 264, "step_time": 11.507015873000455 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05058241146616638, "epoch": 0.0053, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018712878227233887, "kl": 0.39055716490838677, "learning_rate": 7.999922310173063e-06, "loss": -0.0, "num_tokens": 13871840.0, "reward": 2.4825406074523926, "reward_std": 0.31064870953559875, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9752524495124817, "rewards/probe_shaping_dominance/std": 0.09777678549289703, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.41146183013916016, "rewards/rollout_reward_func/std": 0.21425116062164307, "sampling/importance_sampling_ratio/max": 1.5599281787872314, "sampling/importance_sampling_ratio/mean": 1.0341243743896484, "sampling/importance_sampling_ratio/min": 0.8953186869621277, "sampling/sampling_logp_difference/max": 0.4449194669723511, "sampling/sampling_logp_difference/mean": 0.013410702347755432, "step": 265, "step_time": 27.96838706700055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04936367901973426, "epoch": 0.00532, "grad_norm": 0.006141372956335545, "kl": 0.3867563092110231, "learning_rate": 7.99992163018078e-06, "loss": -0.0, "step": 266, "step_time": 12.308435358998395 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05740413888270268, "epoch": 0.00534, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028442663606256247, "kl": 0.3010439347126521, "learning_rate": 7.99992094722562e-06, "loss": -0.0, "num_tokens": 13974703.0, "reward": 2.375330924987793, "reward_std": 0.3971181809902191, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9330692291259766, "rewards/probe_shaping_dominance/std": 0.15932095050811768, "rewards/probe_terminal_raw/mean": 0.06885162740945816, "rewards/probe_terminal_raw/std": 0.1653386801481247, "rewards/rollout_reward_func/mean": -0.42034000158309937, "rewards/rollout_reward_func/std": 0.19739177823066711, "sampling/importance_sampling_ratio/max": 1.2114074230194092, "sampling/importance_sampling_ratio/mean": 0.9802918434143066, "sampling/importance_sampling_ratio/min": 0.3451912999153137, "sampling/sampling_logp_difference/max": 1.0613338947296143, "sampling/sampling_logp_difference/mean": 0.018370507284998894, "step": 267, "step_time": 27.86067632400045 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.055183965210744645, "epoch": 0.00536, "grad_norm": 0.0022630670573562384, "kl": 0.344313826324651, "learning_rate": 7.999920261307583e-06, "loss": -0.0, "step": 268, "step_time": 11.746586444000059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.08427908451994881, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037011466920375824, "kl": 0.412635525688529, "learning_rate": 7.999919572426668e-06, "loss": -0.0, "num_tokens": 14078089.0, "reward": 2.4167308807373047, "reward_std": 0.32326242327690125, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9451819658279419, "rewards/probe_shaping_dominance/std": 0.147642120718956, "rewards/probe_terminal_raw/mean": 0.05843495950102806, "rewards/probe_terminal_raw/std": 0.15837596356868744, "rewards/rollout_reward_func/mean": -0.44313597679138184, "rewards/rollout_reward_func/std": 0.24654169380664825, "sampling/importance_sampling_ratio/max": 1.858984112739563, "sampling/importance_sampling_ratio/mean": 0.9879124164581299, "sampling/importance_sampling_ratio/min": 0.6056866645812988, "sampling/sampling_logp_difference/max": 0.6200296878814697, "sampling/sampling_logp_difference/mean": 0.027817152440547943, "step": 269, "step_time": 26.451382616000046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.07716414582682773, "epoch": 0.0054, "grad_norm": 0.0030677285976707935, "kl": 0.4153696422581561, "learning_rate": 7.999918880582879e-06, "loss": -0.0, "step": 270, "step_time": 12.785874016998605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04053633386229194, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 0.001796143944375217, "kl": 0.5015344847925007, "learning_rate": 7.999918185776215e-06, "loss": 0.0, "num_tokens": 14181503.0, "reward": 2.4646096229553223, "reward_std": 0.2045918107032776, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9679263234138489, "rewards/probe_shaping_dominance/std": 0.1049569845199585, "rewards/probe_terminal_raw/mean": 0.0364583358168602, "rewards/probe_terminal_raw/std": 0.11773227155208588, "rewards/rollout_reward_func/mean": -0.4585248529911041, "rewards/rollout_reward_func/std": 0.16162419319152832, "sampling/importance_sampling_ratio/max": 1.4571605920791626, "sampling/importance_sampling_ratio/mean": 1.0197436809539795, "sampling/importance_sampling_ratio/min": 0.8846800923347473, "sampling/sampling_logp_difference/max": 0.3764890432357788, "sampling/sampling_logp_difference/mean": 0.012306122109293938, "step": 271, "step_time": 26.693239825001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.038741875116102165, "epoch": 0.00544, "grad_norm": 0.0020677302964031696, "kl": 0.5029990994371474, "learning_rate": 7.999917488006676e-06, "loss": 0.0, "step": 272, "step_time": 11.444299719997616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04818115712259896, "epoch": 0.00546, "frac_reward_zero_std": 0.0, "grad_norm": 0.008343451656401157, "kl": 0.7089566249400381, "learning_rate": 7.999916787274264e-06, "loss": 0.0001, "num_tokens": 14287480.0, "reward": 2.4599452018737793, "reward_std": 0.38899266719818115, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9238950610160828, "rewards/probe_shaping_dominance/std": 0.16443566977977753, "rewards/probe_terminal_raw/mean": 0.08130080997943878, "rewards/probe_terminal_raw/std": 0.17714287340641022, "rewards/rollout_reward_func/mean": -0.3702506721019745, "rewards/rollout_reward_func/std": 0.21257071197032928, "sampling/importance_sampling_ratio/max": 2.423100471496582, "sampling/importance_sampling_ratio/mean": 1.0725514888763428, "sampling/importance_sampling_ratio/min": 0.8080363273620605, "sampling/sampling_logp_difference/max": 0.8850466012954712, "sampling/sampling_logp_difference/mean": 0.024975256994366646, "step": 273, "step_time": 28.09797250900101 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.04680645616099355, "epoch": 0.00548, "grad_norm": 0.003927062265574932, "kl": 0.742738697305322, "learning_rate": 7.99991608357898e-06, "loss": 0.0001, "step": 274, "step_time": 11.650237371000003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04380835813935846, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025579470675438643, "kl": 0.21995878049926887, "learning_rate": 7.999915376920822e-06, "loss": -0.0, "num_tokens": 14387389.0, "reward": 2.2633914947509766, "reward_std": 0.42217421531677246, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9556913375854492, "rewards/probe_shaping_dominance/std": 0.1223374605178833, "rewards/probe_terminal_raw/mean": 0.0518292672932148, "rewards/probe_terminal_raw/std": 0.14265993237495422, "rewards/rollout_reward_func/mean": -0.5378788709640503, "rewards/rollout_reward_func/std": 0.23384462296962738, "sampling/importance_sampling_ratio/max": 1.084592580795288, "sampling/importance_sampling_ratio/mean": 0.9922658205032349, "sampling/importance_sampling_ratio/min": 0.7613502740859985, "sampling/sampling_logp_difference/max": 0.2726619839668274, "sampling/sampling_logp_difference/mean": 0.009103155694901943, "step": 275, "step_time": 26.459266137000668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.03994250175310299, "epoch": 0.00552, "grad_norm": 0.0021381748374551535, "kl": 0.2157795349397702, "learning_rate": 7.999914667299794e-06, "loss": -0.0, "step": 276, "step_time": 11.672075437000785 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037500000558793545, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09868528880178928, "epoch": 0.00554, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031571455765515566, "kl": 0.4792258571833372, "learning_rate": 7.999913954715895e-06, "loss": 0.0, "num_tokens": 14492025.0, "reward": 2.2542710304260254, "reward_std": 0.38688531517982483, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.5082289576530457, "rewards/rollout_reward_func/std": 0.17395071685314178, "sampling/importance_sampling_ratio/max": 1.9612770080566406, "sampling/importance_sampling_ratio/mean": 1.0468454360961914, "sampling/importance_sampling_ratio/min": 0.5976178646087646, "sampling/sampling_logp_difference/max": 0.7003155946731567, "sampling/sampling_logp_difference/mean": 0.032625701278448105, "step": 277, "step_time": 27.236174976000257 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.02500000037252903, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037500000558793545, "entropy": 0.09623363520950079, "epoch": 0.00556, "grad_norm": 0.0032991948537528515, "kl": 0.4749853519606404, "learning_rate": 7.999913239169126e-06, "loss": 0.0, "step": 278, "step_time": 12.07052038799975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.8125, "completions/mean_terminated_length": 2.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04121039004530758, "epoch": 0.00558, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032093473710119724, "kl": 0.6897661700841127, "learning_rate": 7.999912520659488e-06, "loss": 0.0, "num_tokens": 14593223.0, "reward": 2.3469300270080566, "reward_std": 0.5208548307418823, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.8125, "rewards/probe_completion_length/std": 0.3965577781200409, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.95980304479599, "rewards/probe_shaping_dominance/std": 0.12795211374759674, "rewards/probe_terminal_raw/mean": 0.042174797505140305, "rewards/probe_terminal_raw/std": 0.13503843545913696, "rewards/rollout_reward_func/mean": -0.44879791140556335, "rewards/rollout_reward_func/std": 0.2045743763446808, "sampling/importance_sampling_ratio/max": 1.9838464260101318, "sampling/importance_sampling_ratio/mean": 1.0156028270721436, "sampling/importance_sampling_ratio/min": 0.1315358281135559, "sampling/sampling_logp_difference/max": 2.028473377227783, "sampling/sampling_logp_difference/mean": 0.03758270666003227, "step": 279, "step_time": 26.44161211500159 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033333334140479565, "entropy": 0.047012478462420404, "epoch": 0.0056, "grad_norm": 0.0013261314015835524, "kl": 0.7127395562856691, "learning_rate": 7.99991179918698e-06, "loss": -0.0, "step": 280, "step_time": 11.634762280001269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.012499196142016444, "epoch": 0.00562, "frac_reward_zero_std": 0.0, "grad_norm": 0.0003787693567574024, "kl": 0.669078703969717, "learning_rate": 7.999911074751606e-06, "loss": -0.0, "num_tokens": 14693012.0, "reward": 2.4939217567443848, "reward_std": 0.381552517414093, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9186484813690186, "rewards/probe_shaping_dominance/std": 0.195449098944664, "rewards/probe_terminal_raw/mean": 0.078125, "rewards/probe_terminal_raw/std": 0.18445101380348206, "rewards/rollout_reward_func/mean": -0.3903515338897705, "rewards/rollout_reward_func/std": 0.2618943452835083, "sampling/importance_sampling_ratio/max": 1.0298659801483154, "sampling/importance_sampling_ratio/mean": 0.9976564645767212, "sampling/importance_sampling_ratio/min": 0.9420029520988464, "sampling/sampling_logp_difference/max": 0.05974767729640007, "sampling/sampling_logp_difference/mean": 0.0016555668553337455, "step": 281, "step_time": 26.723270941998635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.012372259192488855, "epoch": 0.00564, "grad_norm": 0.0003435203689150512, "kl": 0.6690934834768996, "learning_rate": 7.999910347353363e-06, "loss": -0.0, "step": 282, "step_time": 11.794659334002063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.84375, "completions/mean_terminated_length": 2.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.045614961185492575, "epoch": 0.00566, "frac_reward_zero_std": 0.0, "grad_norm": 0.003150342497974634, "kl": 0.48013901670856285, "learning_rate": 7.999909616992255e-06, "loss": -0.0, "num_tokens": 14799672.0, "reward": 2.3399429321289062, "reward_std": 0.4422038793563843, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.84375, "rewards/probe_completion_length/std": 0.3689020276069641, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 1.0, "rewards/probe_shaping_dominance/std": 0.0, "rewards/probe_terminal_raw/mean": 0.0, "rewards/probe_terminal_raw/std": 0.0, "rewards/rollout_reward_func/mean": -0.45380693674087524, "rewards/rollout_reward_func/std": 0.1835639625787735, "sampling/importance_sampling_ratio/max": 1.2092225551605225, "sampling/importance_sampling_ratio/mean": 0.9782531261444092, "sampling/importance_sampling_ratio/min": 0.3157159686088562, "sampling/sampling_logp_difference/max": 1.1528494358062744, "sampling/sampling_logp_difference/mean": 0.019979460164904594, "step": 283, "step_time": 27.036351400000058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.03985181718599051, "epoch": 0.00568, "grad_norm": 0.0033008423633873463, "kl": 0.49970418894372415, "learning_rate": 7.99990888366828e-06, "loss": -0.0, "step": 284, "step_time": 11.668078124000203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03972258236899506, "epoch": 0.0057, "frac_reward_zero_std": 0.0, "grad_norm": 0.002630846342071891, "kl": 0.3517824411392212, "learning_rate": 7.99990814738144e-06, "loss": -0.0, "num_tokens": 14902831.0, "reward": 2.4359757900238037, "reward_std": 0.2911105751991272, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9871374368667603, "rewards/probe_shaping_dominance/std": 0.07276186347007751, "rewards/probe_terminal_raw/mean": 0.015625, "rewards/probe_terminal_raw/std": 0.0883883461356163, "rewards/rollout_reward_func/mean": -0.48553669452667236, "rewards/rollout_reward_func/std": 0.2099909633398056, "sampling/importance_sampling_ratio/max": 1.558259129524231, "sampling/importance_sampling_ratio/mean": 1.021366834640503, "sampling/importance_sampling_ratio/min": 0.757884681224823, "sampling/sampling_logp_difference/max": 0.4435689449310303, "sampling/sampling_logp_difference/mean": 0.011840267106890678, "step": 285, "step_time": 27.424052481000217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.040461032156599686, "epoch": 0.00572, "grad_norm": 0.002737229922786355, "kl": 0.3537818659096956, "learning_rate": 7.999907408131737e-06, "loss": -0.0, "step": 286, "step_time": 12.126654321001297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.020485240605921717, "epoch": 0.00574, "frac_reward_zero_std": 0.0, "grad_norm": 0.000876868492923677, "kl": 0.23688423214722576, "learning_rate": 7.999906665919169e-06, "loss": -0.0, "num_tokens": 15005261.0, "reward": 2.5098652839660645, "reward_std": 0.30707597732543945, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.96875, "rewards/probe_completion_length/std": 0.1767766922712326, "rewards/probe_invalid_count/mean": 0.03125, "rewards/probe_invalid_count/std": 0.1767766922712326, "rewards/probe_shaping_dominance/mean": 0.9733736515045166, "rewards/probe_shaping_dominance/std": 0.10744811594486237, "rewards/probe_terminal_raw/mean": 0.0260416679084301, "rewards/probe_terminal_raw/std": 0.1046360433101654, "rewards/rollout_reward_func/mean": -0.4395501911640167, "rewards/rollout_reward_func/std": 0.18828870356082916, "sampling/importance_sampling_ratio/max": 1.0840176343917847, "sampling/importance_sampling_ratio/mean": 1.0012118816375732, "sampling/importance_sampling_ratio/min": 0.9655031561851501, "sampling/sampling_logp_difference/max": 0.08256775140762329, "sampling/sampling_logp_difference/mean": 0.00235398905351758, "step": 287, "step_time": 27.075085327001034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.022996263058303157, "epoch": 0.00576, "grad_norm": 0.0009354232461191714, "kl": 0.23660576696175895, "learning_rate": 7.99990592074374e-06, "loss": -0.0, "step": 288, "step_time": 11.657212093999078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.05316271091851377, "epoch": 0.00578, "frac_reward_zero_std": 0.0, "grad_norm": 0.006305683869868517, "kl": 0.2035164695232652, "learning_rate": 7.999905172605446e-06, "loss": -0.0001, "num_tokens": 15107252.0, "reward": 2.422664165496826, "reward_std": 0.37807923555374146, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9658713340759277, "rewards/probe_shaping_dominance/std": 0.10942408442497253, "rewards/probe_terminal_raw/mean": 0.03315548598766327, "rewards/probe_terminal_raw/std": 0.1095743477344513, "rewards/rollout_reward_func/mean": -0.40136268734931946, "rewards/rollout_reward_func/std": 0.2093636691570282, "sampling/importance_sampling_ratio/max": 1.5805177688598633, "sampling/importance_sampling_ratio/mean": 1.0220205783843994, "sampling/importance_sampling_ratio/min": 0.7326148748397827, "sampling/sampling_logp_difference/max": 0.4577510356903076, "sampling/sampling_logp_difference/mean": 0.019495096057653427, "step": 289, "step_time": 26.987616914999307 }, { "clip_ratio/high_max": 0.04583333432674408, "clip_ratio/high_mean": 0.02291666716337204, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 0.05469944020660478, "epoch": 0.0058, "grad_norm": 0.0032733359839767218, "kl": 0.18666235760611016, "learning_rate": 7.999904421504293e-06, "loss": -0.0001, "step": 290, "step_time": 11.951281235001261 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035416667349636555, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.0591709428122158, "epoch": 0.00582, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026493356563150883, "kl": 0.8575776647776365, "learning_rate": 7.999903667440278e-06, "loss": 0.0, "num_tokens": 15208793.0, "reward": 2.402831792831421, "reward_std": 0.3910689353942871, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9471915364265442, "rewards/probe_shaping_dominance/std": 0.1431892067193985, "rewards/probe_terminal_raw/mean": 0.0520833358168602, "rewards/probe_terminal_raw/std": 0.1433027982711792, "rewards/rollout_reward_func/mean": -0.42144304513931274, "rewards/rollout_reward_func/std": 0.21596133708953857, "sampling/importance_sampling_ratio/max": 1.0310035943984985, "sampling/importance_sampling_ratio/mean": 0.9701290130615234, "sampling/importance_sampling_ratio/min": 0.5706773400306702, "sampling/sampling_logp_difference/max": 0.5609317421913147, "sampling/sampling_logp_difference/mean": 0.014490557834506035, "step": 291, "step_time": 27.075510762000704 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 0.05919087287338698, "epoch": 0.00584, "grad_norm": 0.0026768911629915237, "kl": 0.8454538804168692, "learning_rate": 7.999902910413404e-06, "loss": 0.0, "step": 292, "step_time": 12.032383580999522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04045550918681329, "epoch": 0.00586, "frac_reward_zero_std": 0.0, "grad_norm": 0.00724328076466918, "kl": 0.80053227301687, "learning_rate": 7.999902150423671e-06, "loss": -0.0001, "num_tokens": 15311233.0, "reward": 2.4362893104553223, "reward_std": 0.426661878824234, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9734768271446228, "rewards/probe_shaping_dominance/std": 0.10647083818912506, "rewards/probe_terminal_raw/mean": 0.026295732706785202, "rewards/probe_terminal_raw/std": 0.10541322082281113, "rewards/rollout_reward_func/mean": -0.38848331570625305, "rewards/rollout_reward_func/std": 0.2122591733932495, "sampling/importance_sampling_ratio/max": 1.8292688131332397, "sampling/importance_sampling_ratio/mean": 1.001596212387085, "sampling/importance_sampling_ratio/min": 0.44141146540641785, "sampling/sampling_logp_difference/max": 0.8177778720855713, "sampling/sampling_logp_difference/mean": 0.025196455419063568, "step": 293, "step_time": 27.23413759199957 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.04230553897491518, "epoch": 0.00588, "grad_norm": 0.005148016382008791, "kl": 0.6622665030881763, "learning_rate": 7.999901387471079e-06, "loss": -0.0001, "step": 294, "step_time": 11.526401772997815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.03366142028335162, "epoch": 0.0059, "frac_reward_zero_std": 0.0, "grad_norm": 0.005694986321032047, "kl": 0.39196249035501296, "learning_rate": 7.99990062155563e-06, "loss": 0.0, "num_tokens": 15421347.0, "reward": 2.391371726989746, "reward_std": 0.43072906136512756, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.875, "rewards/probe_completion_length/std": 0.33601075410842896, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9607213735580444, "rewards/probe_shaping_dominance/std": 0.12690994143486023, "rewards/probe_terminal_raw/mean": 0.046875, "rewards/probe_terminal_raw/std": 0.1480722874403, "rewards/rollout_reward_func/mean": -0.4412246346473694, "rewards/rollout_reward_func/std": 0.21457210183143616, "sampling/importance_sampling_ratio/max": 1.2205545902252197, "sampling/importance_sampling_ratio/mean": 0.9986574053764343, "sampling/importance_sampling_ratio/min": 0.7592641115188599, "sampling/sampling_logp_difference/max": 0.2809281349182129, "sampling/sampling_logp_difference/mean": 0.008172519505023956, "step": 295, "step_time": 26.643844086999707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.030476719188300194, "epoch": 0.00592, "grad_norm": 0.005326179787516594, "kl": 0.39566947892306814, "learning_rate": 7.999899852677322e-06, "loss": 0.0, "step": 296, "step_time": 12.454534126997714 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.90625, "completions/mean_terminated_length": 2.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.04483710537169827, "epoch": 0.00594, "frac_reward_zero_std": 0.0, "grad_norm": 0.0021372437477111816, "kl": 0.4166623194081088, "learning_rate": 7.99989908083616e-06, "loss": 0.0, "num_tokens": 15523076.0, "reward": 2.4664759635925293, "reward_std": 0.4568862318992615, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.90625, "rewards/probe_completion_length/std": 0.2961445748806, "rewards/probe_invalid_count/mean": 0.0625, "rewards/probe_invalid_count/std": 0.24593468010425568, "rewards/probe_shaping_dominance/mean": 0.9591568112373352, "rewards/probe_shaping_dominance/std": 0.11802849918603897, "rewards/probe_terminal_raw/mean": 0.04509654641151428, "rewards/probe_terminal_raw/std": 0.1317683309316635, "rewards/rollout_reward_func/mean": -0.4565274119377136, "rewards/rollout_reward_func/std": 0.26263633370399475, "sampling/importance_sampling_ratio/max": 1.3225888013839722, "sampling/importance_sampling_ratio/mean": 1.0174564123153687, "sampling/importance_sampling_ratio/min": 0.8623110055923462, "sampling/sampling_logp_difference/max": 0.27959030866622925, "sampling/sampling_logp_difference/mean": 0.008479975163936615, "step": 297, "step_time": 26.703501694998522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.04334457405639114, "epoch": 0.00596, "grad_norm": 0.004324762150645256, "kl": 0.41364979138597846, "learning_rate": 7.999898306032144e-06, "loss": 0.0, "step": 298, "step_time": 11.624797897999088 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 2.9375, "completions/mean_terminated_length": 2.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.039461553949308836, "epoch": 0.00598, "frac_reward_zero_std": 0.0, "grad_norm": 0.0021312020253390074, "kl": 0.4295559982638224, "learning_rate": 7.999897528265272e-06, "loss": 0.0, "num_tokens": 15625505.0, "reward": 2.4885663986206055, "reward_std": 0.32209959626197815, "rewards/format_guard/mean": -0.05000000074505806, "rewards/format_guard/std": 0.0, "rewards/probe_completion_length/mean": 1.9375, "rewards/probe_completion_length/std": 0.24593468010425568, "rewards/probe_invalid_count/mean": 0.0, "rewards/probe_invalid_count/std": 0.0, "rewards/probe_shaping_dominance/mean": 0.9723982810974121, "rewards/probe_shaping_dominance/std": 0.10864228010177612, "rewards/probe_terminal_raw/mean": 0.03125, "rewards/probe_terminal_raw/std": 0.12296734005212784, "rewards/rollout_reward_func/mean": -0.40258198976516724, "rewards/rollout_reward_func/std": 0.1721213161945343, "sampling/importance_sampling_ratio/max": 1.015625, "sampling/importance_sampling_ratio/mean": 0.9557619690895081, "sampling/importance_sampling_ratio/min": 0.3387709856033325, "sampling/sampling_logp_difference/max": 1.0839133262634277, "sampling/sampling_logp_difference/mean": 0.021332627162337303, "step": 299, "step_time": 26.165180010000768 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.04107913846030442, "epoch": 0.006, "grad_norm": 0.0022343825548887253, "kl": 0.42880946584045887, "learning_rate": 7.999896747535546e-06, "loss": 0.0, "step": 300, "step_time": 12.217135184999279 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 15625505, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }