test-env-gin-rummy / trainer_state.json
bimabk's picture
Upload task output 1
015345c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.006,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06598061177646741,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003923534415662289,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 102665.0,
"reward": 2.355022430419922,
"reward_std": 0.3552054464817047,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.501227617263794,
"rewards/rollout_reward_func/std": 0.18640437722206116,
"sampling/importance_sampling_ratio/max": 1.0961512327194214,
"sampling/importance_sampling_ratio/mean": 0.9703092575073242,
"sampling/importance_sampling_ratio/min": 0.5060414671897888,
"sampling/sampling_logp_difference/max": 0.6756159067153931,
"sampling/sampling_logp_difference/mean": 0.0183907151222229,
"step": 1,
"step_time": 29.075429260999726
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.06598061177646741,
"epoch": 4e-05,
"grad_norm": 0.003917683847248554,
"kl": 0.0,
"learning_rate": 2.2857142857142855e-07,
"loss": -0.0,
"step": 2,
"step_time": 11.468670933999988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05838002988048174,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004170146305114031,
"kl": 0.0014184596652553338,
"learning_rate": 4.571428571428571e-07,
"loss": 0.0,
"num_tokens": 205842.0,
"reward": 2.2323365211486816,
"reward_std": 0.41563019156455994,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.970565140247345,
"rewards/probe_shaping_dominance/std": 0.11582481861114502,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.5944784879684448,
"rewards/rollout_reward_func/std": 0.19796565175056458,
"sampling/importance_sampling_ratio/max": 1.4160258769989014,
"sampling/importance_sampling_ratio/mean": 1.0286931991577148,
"sampling/importance_sampling_ratio/min": 0.8523033857345581,
"sampling/sampling_logp_difference/max": 0.34715062379837036,
"sampling/sampling_logp_difference/mean": 0.01565416157245636,
"step": 3,
"step_time": 26.976024578999954
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.06273448248884961,
"epoch": 8e-05,
"grad_norm": 0.0025308942422270775,
"kl": 0.004324701569430545,
"learning_rate": 6.857142857142857e-07,
"loss": 0.0,
"step": 4,
"step_time": 12.765090235000116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06146455561975017,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010321667417883873,
"kl": 0.005618094519680539,
"learning_rate": 9.142857142857142e-07,
"loss": 0.0,
"num_tokens": 303571.0,
"reward": 2.236471176147461,
"reward_std": 0.5468828678131104,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9439389705657959,
"rewards/probe_shaping_dominance/std": 0.15084654092788696,
"rewards/probe_terminal_raw/mean": 0.0625,
"rewards/probe_terminal_raw/std": 0.16800537705421448,
"rewards/rollout_reward_func/mean": -0.5324676036834717,
"rewards/rollout_reward_func/std": 0.24024422466754913,
"sampling/importance_sampling_ratio/max": 1.3134887218475342,
"sampling/importance_sampling_ratio/mean": 0.9676171541213989,
"sampling/importance_sampling_ratio/min": 0.41273218393325806,
"sampling/sampling_logp_difference/max": 0.8849565982818604,
"sampling/sampling_logp_difference/mean": 0.026659058406949043,
"step": 5,
"step_time": 26.660728665999955
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.06810211515403353,
"epoch": 0.00012,
"grad_norm": 0.007714552339166403,
"kl": 0.0028154569756466685,
"learning_rate": 1.1428571428571428e-06,
"loss": 0.0,
"step": 6,
"step_time": 11.44768754599977
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04115386162811774,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003323962911963463,
"kl": 0.001360555283525855,
"learning_rate": 1.3714285714285715e-06,
"loss": 0.0,
"num_tokens": 410424.0,
"reward": 2.2917943000793457,
"reward_std": 0.44559940695762634,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9733562469482422,
"rewards/probe_shaping_dominance/std": 0.10957542806863785,
"rewards/probe_terminal_raw/mean": 0.027566056698560715,
"rewards/probe_terminal_raw/std": 0.10949952900409698,
"rewards/rollout_reward_func/mean": -0.5341278314590454,
"rewards/rollout_reward_func/std": 0.27136242389678955,
"sampling/importance_sampling_ratio/max": 1.0618572235107422,
"sampling/importance_sampling_ratio/mean": 0.9585317969322205,
"sampling/importance_sampling_ratio/min": 0.2324376255273819,
"sampling/sampling_logp_difference/max": 1.470571756362915,
"sampling/sampling_logp_difference/mean": 0.02589060366153717,
"step": 7,
"step_time": 27.5007078120002
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.04836703218097682,
"epoch": 0.00016,
"grad_norm": 0.005415800027549267,
"kl": 0.001694043724171479,
"learning_rate": 1.6e-06,
"loss": 0.0,
"step": 8,
"step_time": 12.223170772000117
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07847535189284827,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010716816410422325,
"kl": 0.004078912243130617,
"learning_rate": 1.8285714285714284e-06,
"loss": -0.0,
"num_tokens": 511562.0,
"reward": 2.3355042934417725,
"reward_std": 0.43706634640693665,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9753304123878479,
"rewards/probe_shaping_dominance/std": 0.0984005331993103,
"rewards/probe_terminal_raw/mean": 0.026295732706785202,
"rewards/probe_terminal_raw/std": 0.10541322082281113,
"rewards/rollout_reward_func/mean": -0.553621768951416,
"rewards/rollout_reward_func/std": 0.20992274582386017,
"sampling/importance_sampling_ratio/max": 2.0806119441986084,
"sampling/importance_sampling_ratio/mean": 1.0222396850585938,
"sampling/importance_sampling_ratio/min": 0.5085986256599426,
"sampling/sampling_logp_difference/max": 0.7373225688934326,
"sampling/sampling_logp_difference/mean": 0.028744252398610115,
"step": 9,
"step_time": 26.567318749999913
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.07049791459576227,
"epoch": 0.0002,
"grad_norm": 0.004469083622097969,
"kl": 0.026501665124972873,
"learning_rate": 2.057142857142857e-06,
"loss": -0.0,
"step": 10,
"step_time": 11.64468052799998
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05360435344118741,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008809314109385014,
"kl": 0.004907062985087585,
"learning_rate": 2.2857142857142856e-06,
"loss": -0.0,
"num_tokens": 616201.0,
"reward": 2.4397201538085938,
"reward_std": 0.5087255239486694,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 0.9706827998161316,
"rewards/probe_shaping_dominance/std": 0.12301044911146164,
"rewards/probe_terminal_raw/mean": 0.025406504049897194,
"rewards/probe_terminal_raw/std": 0.10275532305240631,
"rewards/rollout_reward_func/mean": -0.5063689351081848,
"rewards/rollout_reward_func/std": 0.27631497383117676,
"sampling/importance_sampling_ratio/max": 1.1329089403152466,
"sampling/importance_sampling_ratio/mean": 0.9933090806007385,
"sampling/importance_sampling_ratio/min": 0.768523633480072,
"sampling/sampling_logp_difference/max": 0.2632848620414734,
"sampling/sampling_logp_difference/mean": 0.007937189191579819,
"step": 11,
"step_time": 27.777191968999887
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.056653952024134924,
"epoch": 0.00024,
"grad_norm": 0.005528156645596027,
"kl": 0.0032436020156101364,
"learning_rate": 2.5142857142857142e-06,
"loss": -0.0,
"step": 12,
"step_time": 11.55833436900025
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.038703071273630485,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007530162110924721,
"kl": 0.09287417630221206,
"learning_rate": 2.742857142857143e-06,
"loss": -0.0,
"num_tokens": 724364.0,
"reward": 2.351245880126953,
"reward_std": 0.4424680173397064,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9703531265258789,
"rewards/probe_shaping_dominance/std": 0.11669508367776871,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.47535717487335205,
"rewards/rollout_reward_func/std": 0.24601998925209045,
"sampling/importance_sampling_ratio/max": 1.440869688987732,
"sampling/importance_sampling_ratio/mean": 1.0093717575073242,
"sampling/importance_sampling_ratio/min": 0.7920892238616943,
"sampling/sampling_logp_difference/max": 0.3652459681034088,
"sampling/sampling_logp_difference/mean": 0.008522224612534046,
"step": 13,
"step_time": 27.311093626999764
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.0453864433511626,
"epoch": 0.00028,
"grad_norm": 0.006435270421206951,
"kl": 0.010504724175871893,
"learning_rate": 2.9714285714285716e-06,
"loss": -0.0,
"step": 14,
"step_time": 11.88281524700028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.037373697148723295,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006931572686880827,
"kl": 0.0013499163329698805,
"learning_rate": 3.2e-06,
"loss": -0.0,
"num_tokens": 828160.0,
"reward": 2.3457703590393066,
"reward_std": 0.32655069231987,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9890751838684082,
"rewards/probe_shaping_dominance/std": 0.06179998442530632,
"rewards/probe_terminal_raw/mean": 0.01092479657381773,
"rewards/probe_terminal_raw/std": 0.06179998070001602,
"rewards/rollout_reward_func/mean": -0.5417294502258301,
"rewards/rollout_reward_func/std": 0.19428227841854095,
"sampling/importance_sampling_ratio/max": 1.5512616634368896,
"sampling/importance_sampling_ratio/mean": 1.0071200132369995,
"sampling/importance_sampling_ratio/min": 0.7788013219833374,
"sampling/sampling_logp_difference/max": 0.43915224075317383,
"sampling/sampling_logp_difference/mean": 0.008885648101568222,
"step": 15,
"step_time": 27.3166221219999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.03605123230954632,
"epoch": 0.00032,
"grad_norm": 0.007162998430430889,
"kl": 0.0005329122045578671,
"learning_rate": 3.428571428571428e-06,
"loss": -0.0,
"step": 16,
"step_time": 12.10146868100037
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04818721191259101,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0020011591259390116,
"kl": 0.000880227197208705,
"learning_rate": 3.657142857142857e-06,
"loss": 0.0,
"num_tokens": 933852.0,
"reward": 2.2396738529205322,
"reward_std": 0.3769412934780121,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9811877012252808,
"rewards/probe_shaping_dominance/std": 0.07417813688516617,
"rewards/probe_terminal_raw/mean": 0.019435975700616837,
"rewards/probe_terminal_raw/std": 0.07648143172264099,
"rewards/rollout_reward_func/mean": -0.554699718952179,
"rewards/rollout_reward_func/std": 0.14253978431224823,
"sampling/importance_sampling_ratio/max": 1.3911144733428955,
"sampling/importance_sampling_ratio/mean": 1.0014019012451172,
"sampling/importance_sampling_ratio/min": 0.647373378276825,
"sampling/sampling_logp_difference/max": 0.4348297119140625,
"sampling/sampling_logp_difference/mean": 0.01693439856171608,
"step": 17,
"step_time": 27.466287271999818
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.059825868549523875,
"epoch": 0.00036,
"grad_norm": 0.00400462094694376,
"kl": 0.0010442571770683529,
"learning_rate": 3.885714285714286e-06,
"loss": 0.0,
"step": 18,
"step_time": 11.729434232999665
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07400128486915492,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004407655913382769,
"kl": 0.0058712156430829054,
"learning_rate": 4.114285714285714e-06,
"loss": -0.0,
"num_tokens": 1040669.0,
"reward": 2.3979897499084473,
"reward_std": 0.3378089666366577,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9721384644508362,
"rewards/probe_shaping_dominance/std": 0.10964522510766983,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.4928986430168152,
"rewards/rollout_reward_func/std": 0.280559241771698,
"sampling/importance_sampling_ratio/max": 1.2489417791366577,
"sampling/importance_sampling_ratio/mean": 0.9779143333435059,
"sampling/importance_sampling_ratio/min": 0.5380392670631409,
"sampling/sampling_logp_difference/max": 0.619827151298523,
"sampling/sampling_logp_difference/mean": 0.017949596047401428,
"step": 19,
"step_time": 28.172511434999933
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.07153313838352915,
"epoch": 0.0004,
"grad_norm": 0.010058136656880379,
"kl": 0.01704683385832595,
"learning_rate": 4.342857142857142e-06,
"loss": -0.0,
"step": 20,
"step_time": 11.798744088000149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07958520320244133,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015031801536679268,
"kl": 0.02134023218428638,
"learning_rate": 4.571428571428571e-06,
"loss": 0.0,
"num_tokens": 1146440.0,
"reward": 2.2259719371795654,
"reward_std": 0.4264923334121704,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9849475622177124,
"rewards/probe_shaping_dominance/std": 0.08514932543039322,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.505850613117218,
"rewards/rollout_reward_func/std": 0.22946372628211975,
"sampling/importance_sampling_ratio/max": 1.8730424642562866,
"sampling/importance_sampling_ratio/mean": 1.0450382232666016,
"sampling/importance_sampling_ratio/min": 0.6261028051376343,
"sampling/sampling_logp_difference/max": 0.6275629997253418,
"sampling/sampling_logp_difference/mean": 0.033233314752578735,
"step": 21,
"step_time": 27.33938806800029
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.08325157128274441,
"epoch": 0.00044,
"grad_norm": 0.01334489043802023,
"kl": 0.01684667149083907,
"learning_rate": 4.8e-06,
"loss": 0.0,
"step": 22,
"step_time": 11.840854454999999
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06853798200609162,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01755128987133503,
"kl": 0.003467819899402258,
"learning_rate": 5.0285714285714285e-06,
"loss": 0.0001,
"num_tokens": 1248638.0,
"reward": 2.270667552947998,
"reward_std": 0.47502174973487854,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.978266716003418,
"rewards/probe_shaping_dominance/std": 0.08902076631784439,
"rewards/probe_terminal_raw/mean": 0.025406504049897194,
"rewards/probe_terminal_raw/std": 0.10275533050298691,
"rewards/rollout_reward_func/mean": -0.495505690574646,
"rewards/rollout_reward_func/std": 0.24283160269260406,
"sampling/importance_sampling_ratio/max": 2.039003610610962,
"sampling/importance_sampling_ratio/mean": 1.0263185501098633,
"sampling/importance_sampling_ratio/min": 0.6725395321846008,
"sampling/sampling_logp_difference/max": 0.8136651515960693,
"sampling/sampling_logp_difference/mean": 0.02945869043469429,
"step": 23,
"step_time": 27.97098964299971
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.0757814844546374,
"epoch": 0.00048,
"grad_norm": 0.02817094884812832,
"kl": 0.009625433100154623,
"learning_rate": 5.257142857142857e-06,
"loss": 0.0001,
"step": 24,
"step_time": 11.866423993000353
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.046443949002423324,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002482979791238904,
"kl": 0.011937914369631542,
"learning_rate": 5.485714285714286e-06,
"loss": -0.0,
"num_tokens": 1348967.0,
"reward": 2.4115562438964844,
"reward_std": 0.4029836654663086,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.987568199634552,
"rewards/probe_shaping_dominance/std": 0.07032480090856552,
"rewards/probe_terminal_raw/mean": 0.011941056698560715,
"rewards/probe_terminal_raw/std": 0.06754881888628006,
"rewards/rollout_reward_func/mean": -0.4754529595375061,
"rewards/rollout_reward_func/std": 0.20119507610797882,
"sampling/importance_sampling_ratio/max": 1.2200837135314941,
"sampling/importance_sampling_ratio/mean": 0.9975783824920654,
"sampling/importance_sampling_ratio/min": 0.8279879689216614,
"sampling/sampling_logp_difference/max": 0.1989191770553589,
"sampling/sampling_logp_difference/mean": 0.011062754318118095,
"step": 25,
"step_time": 26.57660025700011
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.048878253597649746,
"epoch": 0.00052,
"grad_norm": 0.009242719039320946,
"kl": 0.008345632606265863,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0,
"step": 26,
"step_time": 11.446816336000438
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.03407000357401557,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0032159958500415087,
"kl": 0.0009551170151098631,
"learning_rate": 5.942857142857143e-06,
"loss": 0.0001,
"num_tokens": 1454840.0,
"reward": 2.308957099914551,
"reward_std": 0.35809147357940674,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9919047951698303,
"rewards/probe_shaping_dominance/std": 0.04579342529177666,
"rewards/probe_terminal_raw/mean": 0.00825711339712143,
"rewards/probe_terminal_raw/std": 0.04670928418636322,
"rewards/rollout_reward_func/mean": -0.4849545955657959,
"rewards/rollout_reward_func/std": 0.17723596096038818,
"sampling/importance_sampling_ratio/max": 1.3277825117111206,
"sampling/importance_sampling_ratio/mean": 1.03197181224823,
"sampling/importance_sampling_ratio/min": 0.9784432053565979,
"sampling/sampling_logp_difference/max": 0.2835111618041992,
"sampling/sampling_logp_difference/mean": 0.010589659214019775,
"step": 27,
"step_time": 27.828797529999974
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.03843506844714284,
"epoch": 0.00056,
"grad_norm": 0.001164909452199936,
"kl": 0.0005121690442896343,
"learning_rate": 6.171428571428571e-06,
"loss": 0.0001,
"step": 28,
"step_time": 11.809285704000104
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04930314904777333,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002748744795098901,
"kl": 0.004907883932952495,
"learning_rate": 6.4e-06,
"loss": -0.0,
"num_tokens": 1556979.0,
"reward": 2.240399122238159,
"reward_std": 0.4602973461151123,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9860905408859253,
"rewards/probe_shaping_dominance/std": 0.0786839947104454,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5863164663314819,
"rewards/rollout_reward_func/std": 0.2140309065580368,
"sampling/importance_sampling_ratio/max": 1.2453359365463257,
"sampling/importance_sampling_ratio/mean": 0.9654719233512878,
"sampling/importance_sampling_ratio/min": 0.4166664183139801,
"sampling/sampling_logp_difference/max": 0.8754727840423584,
"sampling/sampling_logp_difference/mean": 0.023819994181394577,
"step": 29,
"step_time": 26.907328863000203
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.04787830199347809,
"epoch": 0.0006,
"grad_norm": 0.004575630649924278,
"kl": 0.021033072499267114,
"learning_rate": 6.628571428571428e-06,
"loss": -0.0,
"step": 30,
"step_time": 12.03838489500049
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06083334801951423,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013260968960821629,
"kl": 0.0185297402889546,
"learning_rate": 6.857142857142856e-06,
"loss": 0.0001,
"num_tokens": 1662740.0,
"reward": 2.1973555088043213,
"reward_std": 0.43850135803222656,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.991719663143158,
"rewards/probe_shaping_dominance/std": 0.046840641647577286,
"rewards/probe_terminal_raw/mean": 0.008384146727621555,
"rewards/probe_terminal_raw/std": 0.04742789641022682,
"rewards/rollout_reward_func/mean": -0.5964983701705933,
"rewards/rollout_reward_func/std": 0.296856164932251,
"sampling/importance_sampling_ratio/max": 2.8883938789367676,
"sampling/importance_sampling_ratio/mean": 1.041499376296997,
"sampling/importance_sampling_ratio/min": 0.611585795879364,
"sampling/sampling_logp_difference/max": 0.9767682552337646,
"sampling/sampling_logp_difference/mean": 0.02332986891269684,
"step": 31,
"step_time": 27.415389096000126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.03750000149011612,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03750000149011612,
"entropy": 0.06045454426202923,
"epoch": 0.00064,
"grad_norm": 0.014426084235310555,
"kl": 0.027800074360129656,
"learning_rate": 7.085714285714285e-06,
"loss": 0.0001,
"step": 32,
"step_time": 11.844893076999824
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0422610079695005,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005128172226250172,
"kl": 0.009348716392499568,
"learning_rate": 7.314285714285714e-06,
"loss": 0.0,
"num_tokens": 1765521.0,
"reward": 2.3525331020355225,
"reward_std": 0.3403870165348053,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9837720394134521,
"rewards/probe_shaping_dominance/std": 0.06571495532989502,
"rewards/probe_terminal_raw/mean": 0.01880081370472908,
"rewards/probe_terminal_raw/std": 0.0745616927742958,
"rewards/rollout_reward_func/mean": -0.5375398397445679,
"rewards/rollout_reward_func/std": 0.22309184074401855,
"sampling/importance_sampling_ratio/max": 1.275700569152832,
"sampling/importance_sampling_ratio/mean": 0.994273841381073,
"sampling/importance_sampling_ratio/min": 0.600629448890686,
"sampling/sampling_logp_difference/max": 0.5097755193710327,
"sampling/sampling_logp_difference/mean": 0.011872323229908943,
"step": 33,
"step_time": 27.277823512999475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.0630603444587905,
"epoch": 0.00068,
"grad_norm": 0.007451063022017479,
"kl": 0.007260499390742581,
"learning_rate": 7.542857142857142e-06,
"loss": 0.0,
"step": 34,
"step_time": 12.15706381699988
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4.0,
"completions/max_terminated_length": 4.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06165817377041094,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01730046235024929,
"kl": 0.007911830088153327,
"learning_rate": 7.771428571428572e-06,
"loss": 0.0,
"num_tokens": 1868519.0,
"reward": 2.275172233581543,
"reward_std": 0.48706814646720886,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.42121174931526184,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9609175324440002,
"rewards/probe_shaping_dominance/std": 0.12447085976600647,
"rewards/probe_terminal_raw/mean": 0.042174797505140305,
"rewards/probe_terminal_raw/std": 0.13503843545913696,
"rewards/rollout_reward_func/mean": -0.552919864654541,
"rewards/rollout_reward_func/std": 0.20079734921455383,
"sampling/importance_sampling_ratio/max": 2.4695143699645996,
"sampling/importance_sampling_ratio/mean": 1.0170851945877075,
"sampling/importance_sampling_ratio/min": 0.5358201861381531,
"sampling/sampling_logp_difference/max": 0.9040230512619019,
"sampling/sampling_logp_difference/mean": 0.023447973653674126,
"step": 35,
"step_time": 26.740296546999843
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.058829510177019984,
"epoch": 0.00072,
"grad_norm": 0.0026921494863927364,
"kl": 0.008077224918185522,
"learning_rate": 8e-06,
"loss": 0.0,
"step": 36,
"step_time": 11.526741372999822
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0314667156167161,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0024028760381042957,
"kl": 0.00625098004627489,
"learning_rate": 7.999999998518522e-06,
"loss": -0.0,
"num_tokens": 1970124.0,
"reward": 2.264838933944702,
"reward_std": 0.5270799994468689,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9424034953117371,
"rewards/probe_shaping_dominance/std": 0.13680323958396912,
"rewards/probe_terminal_raw/mean": 0.05856199190020561,
"rewards/probe_terminal_raw/std": 0.1405627578496933,
"rewards/rollout_reward_func/mean": -0.4673765003681183,
"rewards/rollout_reward_func/std": 0.2097388207912445,
"sampling/importance_sampling_ratio/max": 1.8680520057678223,
"sampling/importance_sampling_ratio/mean": 1.0426936149597168,
"sampling/importance_sampling_ratio/min": 0.9883837103843689,
"sampling/sampling_logp_difference/max": 0.6248946189880371,
"sampling/sampling_logp_difference/mean": 0.012692131102085114,
"step": 37,
"step_time": 26.3523716899997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.0313223133775864,
"epoch": 0.00076,
"grad_norm": 0.0023324843496084213,
"kl": 0.0035868614445746516,
"learning_rate": 7.99999999407409e-06,
"loss": -0.0,
"step": 38,
"step_time": 12.628685679
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05816701124422252,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007494654040783644,
"kl": 0.03421914212867705,
"learning_rate": 7.999999986666703e-06,
"loss": -0.0,
"num_tokens": 2076598.0,
"reward": 2.311230182647705,
"reward_std": 0.36618658900260925,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9932495951652527,
"rewards/probe_shaping_dominance/std": 0.03818599134683609,
"rewards/probe_terminal_raw/mean": 0.00889227632433176,
"rewards/probe_terminal_raw/std": 0.05030231550335884,
"rewards/rollout_reward_func/mean": -0.6096617579460144,
"rewards/rollout_reward_func/std": 0.20722205936908722,
"sampling/importance_sampling_ratio/max": 1.4155004024505615,
"sampling/importance_sampling_ratio/mean": 0.9876462817192078,
"sampling/importance_sampling_ratio/min": 0.7839126586914062,
"sampling/sampling_logp_difference/max": 0.3471514582633972,
"sampling/sampling_logp_difference/mean": 0.0168665312230587,
"step": 39,
"step_time": 26.542540336999764
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.057803097704891115,
"epoch": 0.0008,
"grad_norm": 0.004047502297908068,
"kl": 0.02604524488651805,
"learning_rate": 7.99999997629636e-06,
"loss": -0.0,
"step": 40,
"step_time": 11.67055183600064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.030615816707722843,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002531230915337801,
"kl": 0.0002023791248291218,
"learning_rate": 7.999999962963062e-06,
"loss": 0.0,
"num_tokens": 2182025.0,
"reward": 2.3659095764160156,
"reward_std": 0.3363305926322937,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.970511794090271,
"rewards/probe_shaping_dominance/std": 0.11608950048685074,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.49210208654403687,
"rewards/rollout_reward_func/std": 0.19491301476955414,
"sampling/importance_sampling_ratio/max": 1.0795150995254517,
"sampling/importance_sampling_ratio/mean": 1.0009956359863281,
"sampling/importance_sampling_ratio/min": 0.9117990136146545,
"sampling/sampling_logp_difference/max": 0.09234827756881714,
"sampling/sampling_logp_difference/mean": 0.004786844830960035,
"step": 41,
"step_time": 26.691086626000242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.029701224237214774,
"epoch": 0.00084,
"grad_norm": 0.0024189443793147802,
"kl": 0.0002964178702313802,
"learning_rate": 7.999999946666809e-06,
"loss": 0.0,
"step": 42,
"step_time": 12.699684607000108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4.0,
"completions/max_terminated_length": 4.0,
"completions/mean_length": 3.0,
"completions/mean_terminated_length": 3.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05056236406426251,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01209098007529974,
"kl": 0.010471812368450628,
"learning_rate": 7.999999927407602e-06,
"loss": -0.0,
"num_tokens": 2286142.0,
"reward": 2.469311237335205,
"reward_std": 0.4115804135799408,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.0,
"rewards/probe_completion_length/std": 0.2540002465248108,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9852168560028076,
"rewards/probe_shaping_dominance/std": 0.0836259201169014,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.48153066635131836,
"rewards/rollout_reward_func/std": 0.24715669453144073,
"sampling/importance_sampling_ratio/max": 2.0913164615631104,
"sampling/importance_sampling_ratio/mean": 1.0417256355285645,
"sampling/importance_sampling_ratio/min": 0.8711547255516052,
"sampling/sampling_logp_difference/max": 0.7377924919128418,
"sampling/sampling_logp_difference/mean": 0.016645925119519234,
"step": 43,
"step_time": 26.97034319699992
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.06137717463207082,
"epoch": 0.00088,
"grad_norm": 0.004214904736727476,
"kl": 0.02022934940032428,
"learning_rate": 7.99999990518544e-06,
"loss": -0.0,
"step": 44,
"step_time": 11.70073124500027
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.028125000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0973230431554839,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008132712915539742,
"kl": 0.012426901788174405,
"learning_rate": 7.999999880000322e-06,
"loss": 0.0,
"num_tokens": 2390804.0,
"reward": 2.2431583404541016,
"reward_std": 0.5248546600341797,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9669345021247864,
"rewards/probe_shaping_dominance/std": 0.10819793492555618,
"rewards/probe_terminal_raw/mean": 0.038998983800411224,
"rewards/probe_terminal_raw/std": 0.1286177635192871,
"rewards/rollout_reward_func/mean": -0.4940252900123596,
"rewards/rollout_reward_func/std": 0.255024790763855,
"sampling/importance_sampling_ratio/max": 1.6163866519927979,
"sampling/importance_sampling_ratio/mean": 0.9977768659591675,
"sampling/importance_sampling_ratio/min": 0.3879617154598236,
"sampling/sampling_logp_difference/max": 0.9467527270317078,
"sampling/sampling_logp_difference/mean": 0.02932477556169033,
"step": 45,
"step_time": 26.472695325999894
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.043750000186264515,
"entropy": 0.09132259455509484,
"epoch": 0.00092,
"grad_norm": 0.004103749990463257,
"kl": 0.02156046110090415,
"learning_rate": 7.99999985185225e-06,
"loss": 0.0,
"step": 46,
"step_time": 12.17020241299997
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08302483463194221,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00962382648140192,
"kl": 0.05296483388110573,
"learning_rate": 7.999999820741223e-06,
"loss": 0.0,
"num_tokens": 2498950.0,
"reward": 2.3484296798706055,
"reward_std": 0.40232396125793457,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9905115365982056,
"rewards/probe_shaping_dominance/std": 0.05367483198642731,
"rewards/probe_terminal_raw/mean": 0.009019308723509312,
"rewards/probe_terminal_raw/std": 0.05102091282606125,
"rewards/rollout_reward_func/mean": -0.507351279258728,
"rewards/rollout_reward_func/std": 0.22662682831287384,
"sampling/importance_sampling_ratio/max": 1.3692384958267212,
"sampling/importance_sampling_ratio/mean": 0.9901071786880493,
"sampling/importance_sampling_ratio/min": 0.3076327443122864,
"sampling/sampling_logp_difference/max": 1.179471731185913,
"sampling/sampling_logp_difference/mean": 0.03242562711238861,
"step": 47,
"step_time": 26.895124169999463
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.07248554454417899,
"epoch": 0.00096,
"grad_norm": 0.01555224135518074,
"kl": 0.039988372170228104,
"learning_rate": 7.99999978666724e-06,
"loss": -0.0,
"step": 48,
"step_time": 11.803917615999808
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06659889499132987,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007047319784760475,
"kl": 0.038143942947499454,
"learning_rate": 7.999999749630303e-06,
"loss": 0.0001,
"num_tokens": 2605752.0,
"reward": 2.304872512817383,
"reward_std": 0.4004109501838684,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5201276540756226,
"rewards/rollout_reward_func/std": 0.2584696114063263,
"sampling/importance_sampling_ratio/max": 2.615042209625244,
"sampling/importance_sampling_ratio/mean": 1.0269113779067993,
"sampling/importance_sampling_ratio/min": 0.39808669686317444,
"sampling/sampling_logp_difference/max": 0.9612793922424316,
"sampling/sampling_logp_difference/mean": 0.03832431882619858,
"step": 49,
"step_time": 26.91781551100007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.0618492451030761,
"epoch": 0.001,
"grad_norm": 0.00791104231029749,
"kl": 0.05557279207035515,
"learning_rate": 7.999999709630412e-06,
"loss": 0.0001,
"step": 50,
"step_time": 12.788009578999208
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05631835470558144,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0038132003974169493,
"kl": 0.029594353904632498,
"learning_rate": 7.999999666667564e-06,
"loss": 0.0,
"num_tokens": 2707257.0,
"reward": 2.346804618835449,
"reward_std": 0.2936249077320099,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.571945309638977,
"rewards/rollout_reward_func/std": 0.23333650827407837,
"sampling/importance_sampling_ratio/max": 1.6730494499206543,
"sampling/importance_sampling_ratio/mean": 0.9981693029403687,
"sampling/importance_sampling_ratio/min": 0.40917959809303284,
"sampling/sampling_logp_difference/max": 0.9063196182250977,
"sampling/sampling_logp_difference/mean": 0.024803204461932182,
"step": 51,
"step_time": 26.73268852599972
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.05670045691658743,
"epoch": 0.00104,
"grad_norm": 0.003768681548535824,
"kl": 0.030258090482694455,
"learning_rate": 7.999999620741765e-06,
"loss": 0.0,
"step": 52,
"step_time": 11.579914525999584
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1101932916790247,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0030547163914889097,
"kl": 0.01951221001081649,
"learning_rate": 7.999999571853009e-06,
"loss": 0.0,
"num_tokens": 2811393.0,
"reward": 2.1927480697631836,
"reward_std": 0.406143456697464,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9884125590324402,
"rewards/probe_shaping_dominance/std": 0.0655483826994896,
"rewards/probe_terminal_raw/mean": 0.01143292710185051,
"rewards/probe_terminal_raw/std": 0.06467439979314804,
"rewards/rollout_reward_func/mean": -0.5695973038673401,
"rewards/rollout_reward_func/std": 0.16589799523353577,
"sampling/importance_sampling_ratio/max": 1.0527032613754272,
"sampling/importance_sampling_ratio/mean": 0.9693626165390015,
"sampling/importance_sampling_ratio/min": 0.5484977960586548,
"sampling/sampling_logp_difference/max": 0.6245040893554688,
"sampling/sampling_logp_difference/mean": 0.023702893406152725,
"step": 53,
"step_time": 27.233809398999938
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.11004186974605545,
"epoch": 0.00108,
"grad_norm": 0.006082055624574423,
"kl": 0.04293493747854882,
"learning_rate": 7.999999520001299e-06,
"loss": 0.0,
"step": 54,
"step_time": 12.14583877500013
},
{
"clip_ratio/high_max": 0.05208333395421505,
"clip_ratio/high_mean": 0.026041666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0866635709971888,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005786150228232145,
"kl": 0.045153988463084715,
"learning_rate": 7.999999465186634e-06,
"loss": 0.0,
"num_tokens": 2914367.0,
"reward": 2.3385372161865234,
"reward_std": 0.3273521363735199,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5177128314971924,
"rewards/rollout_reward_func/std": 0.2579730451107025,
"sampling/importance_sampling_ratio/max": 1.2267568111419678,
"sampling/importance_sampling_ratio/mean": 0.9484584331512451,
"sampling/importance_sampling_ratio/min": 0.5135900378227234,
"sampling/sampling_logp_difference/max": 0.6663306355476379,
"sampling/sampling_logp_difference/mean": 0.0320717915892601,
"step": 55,
"step_time": 26.36462075400027
},
{
"clip_ratio/high_max": 0.0729166679084301,
"clip_ratio/high_mean": 0.046875000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.046875000931322575,
"entropy": 0.09378209740680177,
"epoch": 0.00112,
"grad_norm": 0.007270520552992821,
"kl": 0.05788560025212064,
"learning_rate": 7.999999407409014e-06,
"loss": 0.0,
"step": 56,
"step_time": 11.583988187999921
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08144025912042707,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006977744400501251,
"kl": 0.16513798182256778,
"learning_rate": 7.99999934666844e-06,
"loss": -0.0,
"num_tokens": 3018848.0,
"reward": 2.2243924140930176,
"reward_std": 0.4345919191837311,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9774075150489807,
"rewards/probe_shaping_dominance/std": 0.08980447798967361,
"rewards/probe_terminal_raw/mean": 0.02489837259054184,
"rewards/probe_terminal_raw/std": 0.1013173907995224,
"rewards/rollout_reward_func/mean": -0.540413498878479,
"rewards/rollout_reward_func/std": 0.20110559463500977,
"sampling/importance_sampling_ratio/max": 2.1173288822174072,
"sampling/importance_sampling_ratio/mean": 1.0253949165344238,
"sampling/importance_sampling_ratio/min": 0.34861743450164795,
"sampling/sampling_logp_difference/max": 1.0653817653656006,
"sampling/sampling_logp_difference/mean": 0.03663061559200287,
"step": 57,
"step_time": 27.63207101699959
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.08666177466511726,
"epoch": 0.00116,
"grad_norm": 0.00648898771032691,
"kl": 0.14551325980573893,
"learning_rate": 7.999999282964912e-06,
"loss": 0.0,
"step": 58,
"step_time": 12.149218646000236
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0776638601673767,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006341388914734125,
"kl": 0.1438233179026156,
"learning_rate": 7.999999216298429e-06,
"loss": 0.0,
"num_tokens": 3118313.0,
"reward": 2.337385654449463,
"reward_std": 0.40537285804748535,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9689397215843201,
"rewards/probe_shaping_dominance/std": 0.09820227324962616,
"rewards/probe_terminal_raw/mean": 0.03201219439506531,
"rewards/probe_terminal_raw/std": 0.10123317688703537,
"rewards/rollout_reward_func/mean": -0.5198163986206055,
"rewards/rollout_reward_func/std": 0.24933888018131256,
"sampling/importance_sampling_ratio/max": 1.642152190208435,
"sampling/importance_sampling_ratio/mean": 0.9745345115661621,
"sampling/importance_sampling_ratio/min": 0.32652705907821655,
"sampling/sampling_logp_difference/max": 1.1220024824142456,
"sampling/sampling_logp_difference/mean": 0.04093600809574127,
"step": 59,
"step_time": 26.148361385999806
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.08309375832322985,
"epoch": 0.0012,
"grad_norm": 0.009624861180782318,
"kl": 0.15202067893005733,
"learning_rate": 7.999999146668991e-06,
"loss": 0.0,
"step": 60,
"step_time": 11.512923075000117
},
{
"clip_ratio/high_max": 0.07083333469927311,
"clip_ratio/high_mean": 0.035416667349636555,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.056250001303851604,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10362166631966829,
"epoch": 0.00122,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012691067531704903,
"kl": 0.22026659833500162,
"learning_rate": 7.999999074076601e-06,
"loss": 0.0001,
"num_tokens": 3227556.0,
"reward": 2.3282229900360107,
"reward_std": 0.4200522303581238,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.972678005695343,
"rewards/probe_shaping_dominance/std": 0.10808944702148438,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.4694550037384033,
"rewards/rollout_reward_func/std": 0.2165255844593048,
"sampling/importance_sampling_ratio/max": 1.6590189933776855,
"sampling/importance_sampling_ratio/mean": 0.9916884899139404,
"sampling/importance_sampling_ratio/min": 0.47236600518226624,
"sampling/sampling_logp_difference/max": 0.7500003576278687,
"sampling/sampling_logp_difference/mean": 0.045740097761154175,
"step": 61,
"step_time": 28.188819883999713
},
{
"clip_ratio/high_max": 0.07083333469927311,
"clip_ratio/high_mean": 0.035416667349636555,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04583333432674408,
"entropy": 0.10312735941261053,
"epoch": 0.00124,
"grad_norm": 0.019286708906292915,
"kl": 0.11081840936094522,
"learning_rate": 7.999998998521257e-06,
"loss": 0.0001,
"step": 62,
"step_time": 11.837706676999915
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08664211053110193,
"epoch": 0.00126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015839533880352974,
"kl": 0.10350155318124621,
"learning_rate": 7.999998920002956e-06,
"loss": -0.0,
"num_tokens": 3332394.0,
"reward": 2.405167579650879,
"reward_std": 0.46130281686782837,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9712571501731873,
"rewards/probe_shaping_dominance/std": 0.11318810284137726,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.48483964800834656,
"rewards/rollout_reward_func/std": 0.24800339341163635,
"sampling/importance_sampling_ratio/max": 1.9499810934066772,
"sampling/importance_sampling_ratio/mean": 0.9958123564720154,
"sampling/importance_sampling_ratio/min": 0.30673947930336,
"sampling/sampling_logp_difference/max": 0.8753989338874817,
"sampling/sampling_logp_difference/mean": 0.03312094882130623,
"step": 63,
"step_time": 26.684526995999477
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.08623273627017625,
"epoch": 0.00128,
"grad_norm": 0.022980431094765663,
"kl": 0.11929617358450173,
"learning_rate": 7.999998838521705e-06,
"loss": -0.0,
"step": 64,
"step_time": 12.258063536000009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07359768182504922,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011483771726489067,
"kl": 0.10457528214246281,
"learning_rate": 7.999998754077496e-06,
"loss": -0.0,
"num_tokens": 3436726.0,
"reward": 2.377361297607422,
"reward_std": 0.5483381748199463,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.41638875007629395,
"rewards/rollout_reward_func/std": 0.2915210723876953,
"sampling/importance_sampling_ratio/max": 1.1877729892730713,
"sampling/importance_sampling_ratio/mean": 0.9874942898750305,
"sampling/importance_sampling_ratio/min": 0.26991596817970276,
"sampling/sampling_logp_difference/max": 1.309645414352417,
"sampling/sampling_logp_difference/mean": 0.027806004509329796,
"step": 65,
"step_time": 27.115505474999736
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.06954771315213293,
"epoch": 0.00132,
"grad_norm": 0.011225158348679543,
"kl": 0.4594924821127222,
"learning_rate": 7.999998666670336e-06,
"loss": -0.0,
"step": 66,
"step_time": 11.664916763999372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05904226377606392,
"epoch": 0.00134,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012288263067603111,
"kl": 0.0946728276903741,
"learning_rate": 7.999998576300222e-06,
"loss": -0.0,
"num_tokens": 3541291.0,
"reward": 2.2826719284057617,
"reward_std": 0.36464667320251465,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9738304615020752,
"rewards/probe_shaping_dominance/std": 0.0877876877784729,
"rewards/probe_terminal_raw/mean": 0.03137703239917755,
"rewards/probe_terminal_raw/std": 0.10557617992162704,
"rewards/rollout_reward_func/mean": -0.6100356578826904,
"rewards/rollout_reward_func/std": 0.23593732714653015,
"sampling/importance_sampling_ratio/max": 1.271332859992981,
"sampling/importance_sampling_ratio/mean": 0.9844968914985657,
"sampling/importance_sampling_ratio/min": 0.3530118763446808,
"sampling/sampling_logp_difference/max": 1.0369465351104736,
"sampling/sampling_logp_difference/mean": 0.02148618921637535,
"step": 67,
"step_time": 26.421768857000643
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.06591829867102206,
"epoch": 0.00136,
"grad_norm": 0.01136076170951128,
"kl": 0.09406092630524654,
"learning_rate": 7.999998482967154e-06,
"loss": -0.0,
"step": 68,
"step_time": 12.272947167999973
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09870199719443917,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017969369888305664,
"kl": 0.16196376640436938,
"learning_rate": 7.999998386671134e-06,
"loss": 0.0,
"num_tokens": 3645068.0,
"reward": 2.2971627712249756,
"reward_std": 0.3776472806930542,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9890751838684082,
"rewards/probe_shaping_dominance/std": 0.06179998442530632,
"rewards/probe_terminal_raw/mean": 0.01092479657381773,
"rewards/probe_terminal_raw/std": 0.06179998070001602,
"rewards/rollout_reward_func/mean": -0.5590872764587402,
"rewards/rollout_reward_func/std": 0.19611209630966187,
"sampling/importance_sampling_ratio/max": 2.4048268795013428,
"sampling/importance_sampling_ratio/mean": 0.9662601947784424,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.3840640783309937,
"sampling/sampling_logp_difference/mean": 0.0624161995947361,
"step": 69,
"step_time": 26.791781901999457
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.1055372767150402,
"epoch": 0.0014,
"grad_norm": 0.006739933043718338,
"kl": 0.17029937845654786,
"learning_rate": 7.999998287412158e-06,
"loss": 0.0,
"step": 70,
"step_time": 11.527228552999532
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0416666679084301,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0755673204548657,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0038206640165299177,
"kl": 0.27058742146891746,
"learning_rate": 7.99999818519023e-06,
"loss": -0.0,
"num_tokens": 3745050.0,
"reward": 2.4418420791625977,
"reward_std": 0.3276258409023285,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9443497061729431,
"rewards/probe_shaping_dominance/std": 0.17115506529808044,
"rewards/probe_terminal_raw/mean": 0.05487804859876633,
"rewards/probe_terminal_raw/std": 0.15910547971725464,
"rewards/rollout_reward_func/mean": -0.4761357307434082,
"rewards/rollout_reward_func/std": 0.27386248111724854,
"sampling/importance_sampling_ratio/max": 1.2027363777160645,
"sampling/importance_sampling_ratio/mean": 0.9526693224906921,
"sampling/importance_sampling_ratio/min": 0.26859819889068604,
"sampling/sampling_logp_difference/max": 1.314541220664978,
"sampling/sampling_logp_difference/mean": 0.04236820340156555,
"step": 71,
"step_time": 25.810139078000248
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.07833350286819041,
"epoch": 0.00144,
"grad_norm": 0.006155087612569332,
"kl": 0.15766439647995867,
"learning_rate": 7.999998080005348e-06,
"loss": -0.0,
"step": 72,
"step_time": 11.807300304999444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4.0,
"completions/max_terminated_length": 4.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.12527845823206007,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012825227342545986,
"kl": 0.4211071440950036,
"learning_rate": 7.999997971857512e-06,
"loss": 0.0001,
"num_tokens": 3846778.0,
"reward": 2.290764570236206,
"reward_std": 0.5837900042533875,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.4908435642719269,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9892492890357971,
"rewards/probe_shaping_dominance/std": 0.06081530824303627,
"rewards/probe_terminal_raw/mean": 0.010797764174640179,
"rewards/probe_terminal_raw/std": 0.06108137592673302,
"rewards/rollout_reward_func/mean": -0.4405323565006256,
"rewards/rollout_reward_func/std": 0.3242381811141968,
"sampling/importance_sampling_ratio/max": 1.6338335275650024,
"sampling/importance_sampling_ratio/mean": 0.9540376663208008,
"sampling/importance_sampling_ratio/min": 0.19394879043102264,
"sampling/sampling_logp_difference/max": 1.26481294631958,
"sampling/sampling_logp_difference/mean": 0.07170334458351135,
"step": 73,
"step_time": 27.727274773000772
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.035416667349636555,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.035416667349636555,
"entropy": 0.1369485834147781,
"epoch": 0.00148,
"grad_norm": 0.006000218912959099,
"kl": 0.3834730681264773,
"learning_rate": 7.999997860746726e-06,
"loss": 0.0,
"step": 74,
"step_time": 11.550198297999486
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05670425167772919,
"epoch": 0.0015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004246127791702747,
"kl": 0.26258886672280823,
"learning_rate": 7.999997746672985e-06,
"loss": 0.0001,
"num_tokens": 3952684.0,
"reward": 2.3076558113098145,
"reward_std": 0.2708474397659302,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5798441171646118,
"rewards/rollout_reward_func/std": 0.21061494946479797,
"sampling/importance_sampling_ratio/max": 1.4762965440750122,
"sampling/importance_sampling_ratio/mean": 0.9765973091125488,
"sampling/importance_sampling_ratio/min": 0.1482001394033432,
"sampling/sampling_logp_difference/max": 1.9091930389404297,
"sampling/sampling_logp_difference/mean": 0.034642815589904785,
"step": 75,
"step_time": 27.424144634000186
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.06237753387540579,
"epoch": 0.00152,
"grad_norm": 0.005785573739558458,
"kl": 0.34405436088127317,
"learning_rate": 7.999997629636291e-06,
"loss": 0.0001,
"step": 76,
"step_time": 12.303879873000824
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08415639377199113,
"epoch": 0.00154,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005243807099759579,
"kl": 0.17415540551155573,
"learning_rate": 7.999997509636644e-06,
"loss": 0.0,
"num_tokens": 4058589.0,
"reward": 2.46805739402771,
"reward_std": 0.32934877276420593,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9650155901908875,
"rewards/probe_shaping_dominance/std": 0.11404264718294144,
"rewards/probe_terminal_raw/mean": 0.04090446978807449,
"rewards/probe_terminal_raw/std": 0.13221491873264313,
"rewards/rollout_reward_func/mean": -0.45661279559135437,
"rewards/rollout_reward_func/std": 0.2438260018825531,
"sampling/importance_sampling_ratio/max": 1.467045783996582,
"sampling/importance_sampling_ratio/mean": 0.9993070363998413,
"sampling/importance_sampling_ratio/min": 0.5919517874717712,
"sampling/sampling_logp_difference/max": 0.5126774311065674,
"sampling/sampling_logp_difference/mean": 0.021975167095661163,
"step": 77,
"step_time": 27.026433300999997
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.08070441009476781,
"epoch": 0.00156,
"grad_norm": 0.0065447925589978695,
"kl": 0.1744868414461962,
"learning_rate": 7.999997386674047e-06,
"loss": 0.0,
"step": 78,
"step_time": 11.744910646999415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07671235466841608,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007739327382296324,
"kl": 0.10829602145804529,
"learning_rate": 7.999997260748495e-06,
"loss": 0.0,
"num_tokens": 4163362.0,
"reward": 2.291594982147217,
"reward_std": 0.39855584502220154,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9897778034210205,
"rewards/probe_shaping_dominance/std": 0.05782533064484596,
"rewards/probe_terminal_raw/mean": 0.009908536449074745,
"rewards/probe_terminal_raw/std": 0.05605114996433258,
"rewards/rollout_reward_func/mean": -0.5330914855003357,
"rewards/rollout_reward_func/std": 0.2664976716041565,
"sampling/importance_sampling_ratio/max": 1.3343223333358765,
"sampling/importance_sampling_ratio/mean": 0.9947078227996826,
"sampling/importance_sampling_ratio/min": 0.4244631230831146,
"sampling/sampling_logp_difference/max": 0.9074487686157227,
"sampling/sampling_logp_difference/mean": 0.022345466539263725,
"step": 79,
"step_time": 27.107436816999325
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.08035149308852851,
"epoch": 0.0016,
"grad_norm": 0.00506787933409214,
"kl": 0.1221858259250439,
"learning_rate": 7.999997131859992e-06,
"loss": 0.0,
"step": 80,
"step_time": 12.165714977000334
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04375000111758709,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1357073881663382,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008707523345947266,
"kl": 0.19407588429749012,
"learning_rate": 7.999997000008536e-06,
"loss": 0.0,
"num_tokens": 4264863.0,
"reward": 2.4384140968322754,
"reward_std": 0.4922390580177307,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.963716983795166,
"rewards/probe_shaping_dominance/std": 0.11659354716539383,
"rewards/probe_terminal_raw/mean": 0.03658536449074745,
"rewards/probe_terminal_raw/std": 0.11809173226356506,
"rewards/rollout_reward_func/mean": -0.44938817620277405,
"rewards/rollout_reward_func/std": 0.28418225049972534,
"sampling/importance_sampling_ratio/max": 1.7522894144058228,
"sampling/importance_sampling_ratio/mean": 0.9879751205444336,
"sampling/importance_sampling_ratio/min": 0.4941127300262451,
"sampling/sampling_logp_difference/max": 0.5609221458435059,
"sampling/sampling_logp_difference/mean": 0.03759397938847542,
"step": 81,
"step_time": 26.34822328099972
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.14159703021869063,
"epoch": 0.00164,
"grad_norm": 0.009574824012815952,
"kl": 0.1771204932992987,
"learning_rate": 7.999996865194129e-06,
"loss": 0.0,
"step": 82,
"step_time": 11.777719495999463
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04375000111758709,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11876969272270799,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010034332983195782,
"kl": 0.36267855847108876,
"learning_rate": 7.99999672741677e-06,
"loss": 0.0001,
"num_tokens": 4371298.0,
"reward": 2.316115379333496,
"reward_std": 0.4054742753505707,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9912324547767639,
"rewards/probe_shaping_dominance/std": 0.049596767872571945,
"rewards/probe_terminal_raw/mean": 0.009273373521864414,
"rewards/probe_terminal_raw/std": 0.05245811864733696,
"rewards/rollout_reward_func/mean": -0.5093902349472046,
"rewards/rollout_reward_func/std": 0.24608401954174042,
"sampling/importance_sampling_ratio/max": 1.394594430923462,
"sampling/importance_sampling_ratio/mean": 0.9233759045600891,
"sampling/importance_sampling_ratio/min": 0.08404743671417236,
"sampling/sampling_logp_difference/max": 2.4710586071014404,
"sampling/sampling_logp_difference/mean": 0.07214178144931793,
"step": 83,
"step_time": 27.42874688900065
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.11967162042856216,
"epoch": 0.00168,
"grad_norm": 0.009677170775830746,
"kl": 0.30461428755370434,
"learning_rate": 7.999996586676458e-06,
"loss": 0.0001,
"step": 84,
"step_time": 12.210796541999116
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08633493585512042,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009309964254498482,
"kl": 0.34726120328798515,
"learning_rate": 7.999996442973193e-06,
"loss": -0.0,
"num_tokens": 4476938.0,
"reward": 2.3256678581237793,
"reward_std": 0.3970645070075989,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5930821895599365,
"rewards/rollout_reward_func/std": 0.20994225144386292,
"sampling/importance_sampling_ratio/max": 2.7198355197906494,
"sampling/importance_sampling_ratio/mean": 0.965837836265564,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.636561870574951,
"sampling/sampling_logp_difference/mean": 0.07213791459798813,
"step": 85,
"step_time": 26.77135907899992
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0416666679084301,
"entropy": 0.08549185702577233,
"epoch": 0.00172,
"grad_norm": 0.00986558198928833,
"kl": 0.6476581503327452,
"learning_rate": 7.99999629630698e-06,
"loss": -0.0,
"step": 86,
"step_time": 11.659285754999019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08913910732371733,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005745335482060909,
"kl": 0.21945283197192111,
"learning_rate": 7.999996146677813e-06,
"loss": -0.0001,
"num_tokens": 4579856.0,
"reward": 2.2342212200164795,
"reward_std": 0.5761978030204773,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.985537052154541,
"rewards/probe_shaping_dominance/std": 0.08181492984294891,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4981907904148102,
"rewards/rollout_reward_func/std": 0.2684464752674103,
"sampling/importance_sampling_ratio/max": 1.1302220821380615,
"sampling/importance_sampling_ratio/mean": 0.9439641833305359,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.921440839767456,
"sampling/sampling_logp_difference/mean": 0.047181740403175354,
"step": 87,
"step_time": 27.09005630599995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.08115326758706942,
"epoch": 0.00176,
"grad_norm": 0.003665071912109852,
"kl": 0.22057799324602456,
"learning_rate": 7.999995994085696e-06,
"loss": -0.0001,
"step": 88,
"step_time": 12.136771756998769
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07007716363295913,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007810859940946102,
"kl": 0.6949258089686055,
"learning_rate": 7.999995838530628e-06,
"loss": -0.0,
"num_tokens": 4685612.0,
"reward": 2.3873391151428223,
"reward_std": 0.4150564968585968,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5001606941223145,
"rewards/rollout_reward_func/std": 0.2632400095462799,
"sampling/importance_sampling_ratio/max": 1.329830527305603,
"sampling/importance_sampling_ratio/mean": 0.9396188259124756,
"sampling/importance_sampling_ratio/min": 0.09286217391490936,
"sampling/sampling_logp_difference/max": 2.376638174057007,
"sampling/sampling_logp_difference/mean": 0.05502761900424957,
"step": 89,
"step_time": 26.554008219000025
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.07465687586227432,
"epoch": 0.0018,
"grad_norm": 0.009502755478024483,
"kl": 0.22063382680062205,
"learning_rate": 7.99999568001261e-06,
"loss": -0.0,
"step": 90,
"step_time": 12.219043876999876
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.14270146866329014,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008744009770452976,
"kl": 0.11013963767254609,
"learning_rate": 7.999995518531638e-06,
"loss": -0.0001,
"num_tokens": 4789951.0,
"reward": 2.567716360092163,
"reward_std": 0.9114633798599243,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.125,
"rewards/probe_completion_length/std": 0.9069623351097107,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9740893840789795,
"rewards/probe_shaping_dominance/std": 0.10259100794792175,
"rewards/probe_terminal_raw/mean": 0.02515243925154209,
"rewards/probe_terminal_raw/std": 0.10202876478433609,
"rewards/rollout_reward_func/mean": -0.5065252184867859,
"rewards/rollout_reward_func/std": 0.20758704841136932,
"sampling/importance_sampling_ratio/max": 1.6487281322479248,
"sampling/importance_sampling_ratio/mean": 0.9680857062339783,
"sampling/importance_sampling_ratio/min": 0.3606947958469391,
"sampling/sampling_logp_difference/max": 0.7544957399368286,
"sampling/sampling_logp_difference/mean": 0.04080694913864136,
"step": 91,
"step_time": 26.54145688799963
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.04375000111758709,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.054166668094694614,
"entropy": 0.1536610189359635,
"epoch": 0.00184,
"grad_norm": 0.0049968562088906765,
"kl": 0.21468755277851415,
"learning_rate": 7.999995354087718e-06,
"loss": -0.0001,
"step": 92,
"step_time": 12.239923568000904
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09390545927453786,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00847246777266264,
"kl": 0.4723499550793804,
"learning_rate": 7.999995186680847e-06,
"loss": -0.0,
"num_tokens": 4891817.0,
"reward": 2.240363121032715,
"reward_std": 0.4286558926105499,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9915216565132141,
"rewards/probe_shaping_dominance/std": 0.04796085134148598,
"rewards/probe_terminal_raw/mean": 0.008511179126799107,
"rewards/probe_terminal_raw/std": 0.04814650118350983,
"rewards/rollout_reward_func/mean": -0.5221695899963379,
"rewards/rollout_reward_func/std": 0.18585550785064697,
"sampling/importance_sampling_ratio/max": 1.2803471088409424,
"sampling/importance_sampling_ratio/mean": 0.9798120856285095,
"sampling/importance_sampling_ratio/min": 0.28233107924461365,
"sampling/sampling_logp_difference/max": 1.2646756172180176,
"sampling/sampling_logp_difference/mean": 0.03255663067102432,
"step": 93,
"step_time": 26.499364807999882
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.09494142327457666,
"epoch": 0.00188,
"grad_norm": 0.005891559179872274,
"kl": 0.4762792717665434,
"learning_rate": 7.999995016311026e-06,
"loss": -0.0,
"step": 94,
"step_time": 11.590511038999466
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.78125,
"completions/mean_terminated_length": 2.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0855805806349963,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010784839279949665,
"kl": 0.5285673206672072,
"learning_rate": 7.999994842978255e-06,
"loss": 0.0,
"num_tokens": 4999030.0,
"reward": 2.307888984680176,
"reward_std": 0.558517575263977,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.78125,
"rewards/probe_completion_length/std": 0.420013427734375,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4233608841896057,
"rewards/rollout_reward_func/std": 0.2430049329996109,
"sampling/importance_sampling_ratio/max": 2.3040266036987305,
"sampling/importance_sampling_ratio/mean": 1.0930638313293457,
"sampling/importance_sampling_ratio/min": 0.26607653498649597,
"sampling/sampling_logp_difference/max": 1.3239718675613403,
"sampling/sampling_logp_difference/mean": 0.0572347566485405,
"step": 95,
"step_time": 27.32456371700073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.08265005028806627,
"epoch": 0.00192,
"grad_norm": 0.009639889933168888,
"kl": 0.5285577713511884,
"learning_rate": 7.999994666682534e-06,
"loss": 0.0,
"step": 96,
"step_time": 12.08934896799974
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10442803846672177,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007832064293324947,
"kl": 1.2743625693256035,
"learning_rate": 7.999994487423863e-06,
"loss": 0.0002,
"num_tokens": 5101617.0,
"reward": 2.3278391361236572,
"reward_std": 0.21062178909778595,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5909109115600586,
"rewards/rollout_reward_func/std": 0.17344380915164948,
"sampling/importance_sampling_ratio/max": 1.2738028764724731,
"sampling/importance_sampling_ratio/mean": 0.8911948204040527,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.880244493484497,
"sampling/sampling_logp_difference/mean": 0.08490461856126785,
"step": 97,
"step_time": 26.761640363000424
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.09561855113133788,
"epoch": 0.00196,
"grad_norm": 0.0042576780542731285,
"kl": 0.8573908178368583,
"learning_rate": 7.999994305202242e-06,
"loss": 0.0002,
"step": 98,
"step_time": 12.239888331999737
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.12256050202995539,
"epoch": 0.00198,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03982119634747505,
"kl": 0.4613347239792347,
"learning_rate": 7.999994120017672e-06,
"loss": 0.0,
"num_tokens": 5208185.0,
"reward": 2.3622024059295654,
"reward_std": 0.3201013505458832,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9496574401855469,
"rewards/probe_shaping_dominance/std": 0.13700911402702332,
"rewards/probe_terminal_raw/mean": 0.0570375993847847,
"rewards/probe_terminal_raw/std": 0.15571396052837372,
"rewards/rollout_reward_func/mean": -0.5007427334785461,
"rewards/rollout_reward_func/std": 0.2577684223651886,
"sampling/importance_sampling_ratio/max": 2.246042490005493,
"sampling/importance_sampling_ratio/mean": 1.0854158401489258,
"sampling/importance_sampling_ratio/min": 0.0747772604227066,
"sampling/sampling_logp_difference/max": 2.5932421684265137,
"sampling/sampling_logp_difference/mean": 0.07237481325864792,
"step": 99,
"step_time": 28.563245160000406
},
{
"clip_ratio/high_max": 0.05000000074505806,
"clip_ratio/high_mean": 0.02500000037252903,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04583333432674408,
"entropy": 0.11652607470750809,
"epoch": 0.002,
"grad_norm": 0.013196082785725594,
"kl": 1.1047777848725673,
"learning_rate": 7.999993931870152e-06,
"loss": -0.0,
"step": 100,
"step_time": 11.832685018998745
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11155627248808742,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011043570004403591,
"kl": 0.8486065305769444,
"learning_rate": 7.999993740759685e-06,
"loss": 0.0,
"num_tokens": 5312092.0,
"reward": 2.469048261642456,
"reward_std": 0.296406090259552,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 0.9914976358413696,
"rewards/probe_shaping_dominance/std": 0.04809650778770447,
"rewards/probe_terminal_raw/mean": 0.00889227632433176,
"rewards/probe_terminal_raw/std": 0.05030231550335884,
"rewards/rollout_reward_func/mean": -0.5125917196273804,
"rewards/rollout_reward_func/std": 0.1837811917066574,
"sampling/importance_sampling_ratio/max": 1.2519433498382568,
"sampling/importance_sampling_ratio/mean": 0.8515626192092896,
"sampling/importance_sampling_ratio/min": 0.08545338362455368,
"sampling/sampling_logp_difference/max": 2.4583053588867188,
"sampling/sampling_logp_difference/mean": 0.1055741012096405,
"step": 101,
"step_time": 28.246981163999408
},
{
"clip_ratio/high_max": 0.0833333358168602,
"clip_ratio/high_mean": 0.0416666679084301,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.07291666883975267,
"entropy": 0.10925065912306309,
"epoch": 0.00204,
"grad_norm": 0.008332287892699242,
"kl": 0.7459432929754257,
"learning_rate": 7.999993546686268e-06,
"loss": 0.0,
"step": 102,
"step_time": 12.24685298599934
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09578724391758442,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005429553799331188,
"kl": 0.3181111275916919,
"learning_rate": 7.999993349649902e-06,
"loss": 0.0001,
"num_tokens": 5417356.0,
"reward": 2.296133279800415,
"reward_std": 0.48034343123435974,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.983701229095459,
"rewards/probe_shaping_dominance/std": 0.0652911514043808,
"rewards/probe_terminal_raw/mean": 0.021214431151747704,
"rewards/probe_terminal_raw/std": 0.08383625000715256,
"rewards/rollout_reward_func/mean": -0.5025323629379272,
"rewards/rollout_reward_func/std": 0.23934274911880493,
"sampling/importance_sampling_ratio/max": 1.7521827220916748,
"sampling/importance_sampling_ratio/mean": 1.0161978006362915,
"sampling/importance_sampling_ratio/min": 0.559285044670105,
"sampling/sampling_logp_difference/max": 0.5810226202011108,
"sampling/sampling_logp_difference/mean": 0.03578226640820503,
"step": 103,
"step_time": 28.179791414999727
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.09543535858392715,
"epoch": 0.00208,
"grad_norm": 0.005383977200835943,
"kl": 0.31692405231297016,
"learning_rate": 7.999993149650587e-06,
"loss": 0.0,
"step": 104,
"step_time": 11.594287923999673
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10437362408265471,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006486265454441309,
"kl": 0.4273503478616476,
"learning_rate": 7.999992946688324e-06,
"loss": -0.0,
"num_tokens": 5522766.0,
"reward": 2.39151668548584,
"reward_std": 0.39364051818847656,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5272334814071655,
"rewards/rollout_reward_func/std": 0.2972264289855957,
"sampling/importance_sampling_ratio/max": 1.9010006189346313,
"sampling/importance_sampling_ratio/mean": 1.0246827602386475,
"sampling/importance_sampling_ratio/min": 0.3678455054759979,
"sampling/sampling_logp_difference/max": 1.0000989437103271,
"sampling/sampling_logp_difference/mean": 0.03773331269621849,
"step": 105,
"step_time": 26.660096251999676
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.09778932714834809,
"epoch": 0.00212,
"grad_norm": 0.005733635742217302,
"kl": 0.36536745447665453,
"learning_rate": 7.999992740763114e-06,
"loss": -0.0,
"step": 106,
"step_time": 12.020263065000563
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09189990477170795,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006950075738132,
"kl": 0.37158518051728606,
"learning_rate": 7.999992531874955e-06,
"loss": 0.0,
"num_tokens": 5624278.0,
"reward": 2.3239517211914062,
"reward_std": 0.4278637170791626,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9810667037963867,
"rewards/probe_shaping_dominance/std": 0.0745052620768547,
"rewards/probe_terminal_raw/mean": 0.021214431151747704,
"rewards/probe_terminal_raw/std": 0.08405215293169022,
"rewards/rollout_reward_func/mean": -0.472079336643219,
"rewards/rollout_reward_func/std": 0.24182648956775665,
"sampling/importance_sampling_ratio/max": 1.8587580919265747,
"sampling/importance_sampling_ratio/mean": 0.9948133230209351,
"sampling/importance_sampling_ratio/min": 0.488203763961792,
"sampling/sampling_logp_difference/max": 0.6990102529525757,
"sampling/sampling_logp_difference/mean": 0.03366800397634506,
"step": 107,
"step_time": 27.280253950999395
},
{
"clip_ratio/high_max": 0.06666666828095913,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04375000111758709,
"entropy": 0.07755104900570586,
"epoch": 0.00216,
"grad_norm": 0.0029529579915106297,
"kl": 0.3871547483528275,
"learning_rate": 7.99999232002385e-06,
"loss": 0.0,
"step": 108,
"step_time": 11.582099404000473
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04583333432674408,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08607161836698651,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004912302363663912,
"kl": 0.3110020191234071,
"learning_rate": 7.999992105209796e-06,
"loss": 0.0,
"num_tokens": 5730240.0,
"reward": 2.3713436126708984,
"reward_std": 0.34508299827575684,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9868378639221191,
"rewards/probe_shaping_dominance/std": 0.07445620000362396,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5186194181442261,
"rewards/rollout_reward_func/std": 0.22763106226921082,
"sampling/importance_sampling_ratio/max": 2.4666221141815186,
"sampling/importance_sampling_ratio/mean": 0.9437046051025391,
"sampling/importance_sampling_ratio/min": 0.16313567757606506,
"sampling/sampling_logp_difference/max": 1.8131763935089111,
"sampling/sampling_logp_difference/mean": 0.07055296003818512,
"step": 109,
"step_time": 27.85804966900014
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.08376848418265581,
"epoch": 0.0022,
"grad_norm": 0.021030370146036148,
"kl": 0.3346872879192233,
"learning_rate": 7.999991887432795e-06,
"loss": 0.0,
"step": 110,
"step_time": 12.221424097000181
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03645833395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09564799422514625,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010623163543641567,
"kl": 1.25646445970051,
"learning_rate": 7.999991666692848e-06,
"loss": 0.0001,
"num_tokens": 5834866.0,
"reward": 2.371830463409424,
"reward_std": 0.455732524394989,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9698338508605957,
"rewards/probe_shaping_dominance/std": 0.118809275329113,
"rewards/probe_terminal_raw/mean": 0.02909044735133648,
"rewards/probe_terminal_raw/std": 0.11480555683374405,
"rewards/rollout_reward_func/mean": -0.45209401845932007,
"rewards/rollout_reward_func/std": 0.2390637993812561,
"sampling/importance_sampling_ratio/max": 2.435302972793579,
"sampling/importance_sampling_ratio/mean": 0.9616929292678833,
"sampling/importance_sampling_ratio/min": 0.18086190521717072,
"sampling/sampling_logp_difference/max": 1.7100262641906738,
"sampling/sampling_logp_difference/mean": 0.06157621741294861,
"step": 111,
"step_time": 27.536669213000096
},
{
"clip_ratio/high_max": 0.05625000037252903,
"clip_ratio/high_mean": 0.028125000186264515,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05937500111758709,
"entropy": 0.09540150425164029,
"epoch": 0.00224,
"grad_norm": 0.005310059990733862,
"kl": 0.7572433853056282,
"learning_rate": 7.999991442989953e-06,
"loss": 0.0001,
"step": 112,
"step_time": 11.58020766800064
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05377835238323314,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0054777092300355434,
"kl": 0.2139036045409739,
"learning_rate": 7.999991216324112e-06,
"loss": 0.0,
"num_tokens": 5941971.0,
"reward": 2.3715004920959473,
"reward_std": 0.3570369482040405,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9854661822319031,
"rewards/probe_shaping_dominance/std": 0.08221564441919327,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5483407378196716,
"rewards/rollout_reward_func/std": 0.21500766277313232,
"sampling/importance_sampling_ratio/max": 1.468092441558838,
"sampling/importance_sampling_ratio/mean": 1.0448389053344727,
"sampling/importance_sampling_ratio/min": 0.9520513415336609,
"sampling/sampling_logp_difference/max": 0.38396334648132324,
"sampling/sampling_logp_difference/mean": 0.014699834398925304,
"step": 113,
"step_time": 26.95208743199919
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.05231661406651256,
"epoch": 0.00228,
"grad_norm": 0.005958650726824999,
"kl": 0.20708634098750167,
"learning_rate": 7.999990986695325e-06,
"loss": 0.0,
"step": 114,
"step_time": 12.898005667000234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1277365549467504,
"epoch": 0.0023,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010303696617484093,
"kl": 0.5558968242257833,
"learning_rate": 7.999990754103591e-06,
"loss": -0.0,
"num_tokens": 6048989.0,
"reward": 2.3545703887939453,
"reward_std": 0.32267555594444275,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5329294204711914,
"rewards/rollout_reward_func/std": 0.1960861086845398,
"sampling/importance_sampling_ratio/max": 2.528221368789673,
"sampling/importance_sampling_ratio/mean": 0.9982080459594727,
"sampling/importance_sampling_ratio/min": 0.042695675045251846,
"sampling/sampling_logp_difference/max": 3.153654098510742,
"sampling/sampling_logp_difference/mean": 0.08483341336250305,
"step": 115,
"step_time": 28.715120017999652
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.02083333395421505,
"clip_ratio/region_mean": 0.0416666679084301,
"entropy": 0.1117813317105174,
"epoch": 0.00232,
"grad_norm": 0.006610220763832331,
"kl": 0.6069826502352953,
"learning_rate": 7.99999051854891e-06,
"loss": -0.0,
"step": 116,
"step_time": 12.037885646000177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.035416667349636555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.035416667349636555,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08630842622369528,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.023052336648106575,
"kl": 4.202049997946233,
"learning_rate": 7.999990280031285e-06,
"loss": -0.0,
"num_tokens": 6156241.0,
"reward": 2.3509585857391357,
"reward_std": 0.3719061613082886,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5052914619445801,
"rewards/rollout_reward_func/std": 0.26930469274520874,
"sampling/importance_sampling_ratio/max": 1.4201393127441406,
"sampling/importance_sampling_ratio/mean": 0.9191266298294067,
"sampling/importance_sampling_ratio/min": 0.04002097621560097,
"sampling/sampling_logp_difference/max": 3.218353271484375,
"sampling/sampling_logp_difference/mean": 0.08381534367799759,
"step": 117,
"step_time": 27.4478307280001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.08493484603241086,
"epoch": 0.00236,
"grad_norm": 0.005157412961125374,
"kl": 0.8633453572015242,
"learning_rate": 7.999990038550715e-06,
"loss": -0.0001,
"step": 118,
"step_time": 12.410220233000018
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1009751778037753,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007704886142164469,
"kl": 1.133708338191262,
"learning_rate": 7.9999897941072e-06,
"loss": -0.0,
"num_tokens": 6261608.0,
"reward": 2.272282600402832,
"reward_std": 0.4321046769618988,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5527174472808838,
"rewards/rollout_reward_func/std": 0.2261652648448944,
"sampling/importance_sampling_ratio/max": 1.9247888326644897,
"sampling/importance_sampling_ratio/mean": 0.9601424932479858,
"sampling/importance_sampling_ratio/min": 0.10850485414266586,
"sampling/sampling_logp_difference/max": 2.221635580062866,
"sampling/sampling_logp_difference/mean": 0.06387770175933838,
"step": 119,
"step_time": 27.243004307998945
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.10485226087621413,
"epoch": 0.0024,
"grad_norm": 0.005486879497766495,
"kl": 0.7662449008450487,
"learning_rate": 7.999989546700739e-06,
"loss": -0.0,
"step": 120,
"step_time": 11.642901553001138
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05734692560508847,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0022395749110728502,
"kl": 0.4620458657536801,
"learning_rate": 7.999989296331334e-06,
"loss": 0.0,
"num_tokens": 6364884.0,
"reward": 2.300528049468994,
"reward_std": 0.3925109803676605,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9786701202392578,
"rewards/probe_shaping_dominance/std": 0.08399670571088791,
"rewards/probe_terminal_raw/mean": 0.020452234894037247,
"rewards/probe_terminal_raw/std": 0.08055972307920456,
"rewards/rollout_reward_func/mean": -0.5235942602157593,
"rewards/rollout_reward_func/std": 0.19283899664878845,
"sampling/importance_sampling_ratio/max": 1.684720754623413,
"sampling/importance_sampling_ratio/mean": 0.9979562163352966,
"sampling/importance_sampling_ratio/min": 0.3297406733036041,
"sampling/sampling_logp_difference/max": 1.109449863433838,
"sampling/sampling_logp_difference/mean": 0.03222563862800598,
"step": 121,
"step_time": 27.102160742999786
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.06036481284536421,
"epoch": 0.00244,
"grad_norm": 0.0021346518769860268,
"kl": 0.460031573350534,
"learning_rate": 7.999989042998983e-06,
"loss": 0.0,
"step": 122,
"step_time": 12.627941945999737
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 3.0,
"completions/mean_terminated_length": 3.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.08197857672348619,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005835927091538906,
"kl": 0.3058228840382071,
"learning_rate": 7.99998878670369e-06,
"loss": -0.0,
"num_tokens": 6470259.0,
"reward": 2.4272561073303223,
"reward_std": 0.2215338796377182,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.0,
"rewards/probe_completion_length/std": 0.0,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9598830342292786,
"rewards/probe_shaping_dominance/std": 0.13210204243659973,
"rewards/probe_terminal_raw/mean": 0.04026930779218674,
"rewards/probe_terminal_raw/std": 0.13092826306819916,
"rewards/rollout_reward_func/mean": -0.5228960514068604,
"rewards/rollout_reward_func/std": 0.22377446293830872,
"sampling/importance_sampling_ratio/max": 1.2321637868881226,
"sampling/importance_sampling_ratio/mean": 0.9182083606719971,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2927324771881104,
"sampling/sampling_logp_difference/mean": 0.04780565947294235,
"step": 123,
"step_time": 27.481588907000514
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.0761918865609914,
"epoch": 0.00248,
"grad_norm": 0.005192534998059273,
"kl": 0.32337066042236984,
"learning_rate": 7.999988527445453e-06,
"loss": -0.0,
"step": 124,
"step_time": 11.74153527999988
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.061301857323996956,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004524994175881147,
"kl": 0.20127144705232547,
"learning_rate": 7.99998826522427e-06,
"loss": -0.0,
"num_tokens": 6573122.0,
"reward": 2.5412168502807617,
"reward_std": 0.4934008717536926,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.3535533845424652,
"rewards/probe_shaping_dominance/mean": 0.9729976058006287,
"rewards/probe_shaping_dominance/std": 0.10630916804075241,
"rewards/probe_terminal_raw/mean": 0.028963414952158928,
"rewards/probe_terminal_raw/std": 0.11434794962406158,
"rewards/rollout_reward_func/mean": -0.44199419021606445,
"rewards/rollout_reward_func/std": 0.23288173973560333,
"sampling/importance_sampling_ratio/max": 2.8899707794189453,
"sampling/importance_sampling_ratio/mean": 1.0233311653137207,
"sampling/importance_sampling_ratio/min": 0.5645219683647156,
"sampling/sampling_logp_difference/max": 1.0612452030181885,
"sampling/sampling_logp_difference/mean": 0.02934853918850422,
"step": 125,
"step_time": 26.56314809100013
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.0587867568974616,
"epoch": 0.00252,
"grad_norm": 0.003286329098045826,
"kl": 0.23132333873703226,
"learning_rate": 7.999988000040144e-06,
"loss": -0.0,
"step": 126,
"step_time": 12.704706686999543
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06930449209176004,
"epoch": 0.00254,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0038534458726644516,
"kl": 0.8923099512467161,
"learning_rate": 7.999987731893076e-06,
"loss": -0.0001,
"num_tokens": 6674759.0,
"reward": 2.476976156234741,
"reward_std": 0.5018807053565979,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.09375,
"rewards/probe_invalid_count/std": 0.39015090465545654,
"rewards/probe_shaping_dominance/mean": 0.9897805452346802,
"rewards/probe_shaping_dominance/std": 0.057810164988040924,
"rewards/probe_terminal_raw/mean": 0.010797764174640179,
"rewards/probe_terminal_raw/std": 0.06108137592673302,
"rewards/rollout_reward_func/mean": -0.5048520565032959,
"rewards/rollout_reward_func/std": 0.23183932900428772,
"sampling/importance_sampling_ratio/max": 2.6555376052856445,
"sampling/importance_sampling_ratio/mean": 1.037369728088379,
"sampling/importance_sampling_ratio/min": 0.18285271525382996,
"sampling/sampling_logp_difference/max": 1.6990761756896973,
"sampling/sampling_logp_difference/mean": 0.04799798130989075,
"step": 127,
"step_time": 26.519593818999965
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.07739454251714051,
"epoch": 0.00256,
"grad_norm": 0.0046963742934167385,
"kl": 0.8950551702291705,
"learning_rate": 7.999987460783066e-06,
"loss": -0.0001,
"step": 128,
"step_time": 11.701040565999392
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04215445008594543,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004038817714899778,
"kl": 0.483372636698145,
"learning_rate": 7.999987186710111e-06,
"loss": -0.0001,
"num_tokens": 6778164.0,
"reward": 2.3669238090515137,
"reward_std": 0.33272045850753784,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9665718078613281,
"rewards/probe_shaping_dominance/std": 0.11594124883413315,
"rewards/probe_terminal_raw/mean": 0.033663615584373474,
"rewards/probe_terminal_raw/std": 0.11093832552433014,
"rewards/rollout_reward_func/mean": -0.5208115577697754,
"rewards/rollout_reward_func/std": 0.22583386301994324,
"sampling/importance_sampling_ratio/max": 1.324372410774231,
"sampling/importance_sampling_ratio/mean": 0.9827702045440674,
"sampling/importance_sampling_ratio/min": 0.15934889018535614,
"sampling/sampling_logp_difference/max": 1.8366597890853882,
"sampling/sampling_logp_difference/mean": 0.03050372563302517,
"step": 129,
"step_time": 29.272002608000093
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.040716532384976745,
"epoch": 0.0026,
"grad_norm": 0.004598686005920172,
"kl": 0.48791675676284285,
"learning_rate": 7.999986909674215e-06,
"loss": -0.0001,
"step": 130,
"step_time": 11.615075072000309
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07680852155863249,
"epoch": 0.00262,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004599301610141993,
"kl": 0.5561261102557182,
"learning_rate": 7.999986629675377e-06,
"loss": 0.0001,
"num_tokens": 6881343.0,
"reward": 2.428385019302368,
"reward_std": 0.35835328698158264,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.988267183303833,
"rewards/probe_shaping_dominance/std": 0.06637061387300491,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4630073308944702,
"rewards/rollout_reward_func/std": 0.23799148201942444,
"sampling/importance_sampling_ratio/max": 2.105088472366333,
"sampling/importance_sampling_ratio/mean": 1.0250680446624756,
"sampling/importance_sampling_ratio/min": 0.24339471757411957,
"sampling/sampling_logp_difference/max": 1.413072109222412,
"sampling/sampling_logp_difference/mean": 0.05859563127160072,
"step": 131,
"step_time": 27.499229768000532
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.07459181371518753,
"epoch": 0.00264,
"grad_norm": 0.0046109952963888645,
"kl": 0.4819548297673464,
"learning_rate": 7.999986346713597e-06,
"loss": 0.0001,
"step": 132,
"step_time": 11.681140706999486
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06963892979547381,
"epoch": 0.00266,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004053663462400436,
"kl": 0.29985905811190605,
"learning_rate": 7.999986060788874e-06,
"loss": -0.0001,
"num_tokens": 6984936.0,
"reward": 2.398922920227051,
"reward_std": 0.3926793932914734,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9907037019729614,
"rewards/probe_shaping_dominance/std": 0.052587706595659256,
"rewards/probe_terminal_raw/mean": 0.007876016199588776,
"rewards/probe_terminal_raw/std": 0.0445534773170948,
"rewards/rollout_reward_func/mean": -0.45590683817863464,
"rewards/rollout_reward_func/std": 0.20304201543331146,
"sampling/importance_sampling_ratio/max": 1.1057724952697754,
"sampling/importance_sampling_ratio/mean": 0.917495846748352,
"sampling/importance_sampling_ratio/min": 0.2753896415233612,
"sampling/sampling_logp_difference/max": 1.2891517877578735,
"sampling/sampling_logp_difference/mean": 0.049349602311849594,
"step": 133,
"step_time": 28.668226430000686
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.07001902349293232,
"epoch": 0.00268,
"grad_norm": 0.0046079279854893684,
"kl": 0.30660303554032,
"learning_rate": 7.999985771901212e-06,
"loss": -0.0001,
"step": 134,
"step_time": 11.78814972499913
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0837576383491978,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004887988790869713,
"kl": 0.48908784112427384,
"learning_rate": 7.999985480050609e-06,
"loss": 0.0,
"num_tokens": 7089375.0,
"reward": 2.383143901824951,
"reward_std": 0.2860008180141449,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9715699553489685,
"rewards/probe_shaping_dominance/std": 0.11188202351331711,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.5384261608123779,
"rewards/rollout_reward_func/std": 0.24632836878299713,
"sampling/importance_sampling_ratio/max": 2.175699234008789,
"sampling/importance_sampling_ratio/mean": 0.9764343500137329,
"sampling/importance_sampling_ratio/min": 0.37150871753692627,
"sampling/sampling_logp_difference/max": 1.0082650184631348,
"sampling/sampling_logp_difference/mean": 0.04385855793952942,
"step": 135,
"step_time": 27.26713926100001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07945482805371284,
"epoch": 0.00272,
"grad_norm": 0.005393319763243198,
"kl": 0.4894396271556616,
"learning_rate": 7.999985185237063e-06,
"loss": 0.0,
"step": 136,
"step_time": 11.740167015000225
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016666667070239782,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7.0,
"completions/max_terminated_length": 7.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07358018541708589,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04956609383225441,
"kl": 7.594387605204247,
"learning_rate": 7.999984887460579e-06,
"loss": 0.0,
"num_tokens": 7195651.0,
"reward": 2.523413896560669,
"reward_std": 1.283755898475647,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.0,
"rewards/probe_completion_length/std": 1.1639753580093384,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9845632314682007,
"rewards/probe_shaping_dominance/std": 0.08732341974973679,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4580242931842804,
"rewards/rollout_reward_func/std": 0.29842740297317505,
"sampling/importance_sampling_ratio/max": 1.5995361804962158,
"sampling/importance_sampling_ratio/mean": 0.9101204872131348,
"sampling/importance_sampling_ratio/min": 0.2878796458244324,
"sampling/sampling_logp_difference/max": 1.2452144622802734,
"sampling/sampling_logp_difference/mean": 0.08170486986637115,
"step": 137,
"step_time": 35.5617492829997
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04583333432674408,
"entropy": 0.0833338184747845,
"epoch": 0.00276,
"grad_norm": 0.004238603170961142,
"kl": 0.8713670628203545,
"learning_rate": 7.999984586721153e-06,
"loss": -0.0001,
"step": 138,
"step_time": 13.092057540999122
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.077066877449397,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006206016521900892,
"kl": 0.2502201258515315,
"learning_rate": 7.999984283018788e-06,
"loss": -0.0001,
"num_tokens": 7298420.0,
"reward": 2.434345006942749,
"reward_std": 0.33564823865890503,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4531550407409668,
"rewards/rollout_reward_func/std": 0.23981256783008575,
"sampling/importance_sampling_ratio/max": 1.5575754642486572,
"sampling/importance_sampling_ratio/mean": 0.9945090413093567,
"sampling/importance_sampling_ratio/min": 0.39499369263648987,
"sampling/sampling_logp_difference/max": 0.9288842678070068,
"sampling/sampling_logp_difference/mean": 0.0369817316532135,
"step": 139,
"step_time": 26.636275078999915
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.08048825367586687,
"epoch": 0.0028,
"grad_norm": 0.004995269235223532,
"kl": 0.1949386877240613,
"learning_rate": 7.999983976353484e-06,
"loss": -0.0001,
"step": 140,
"step_time": 11.886442712999724
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09608687367290258,
"epoch": 0.00282,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010193965397775173,
"kl": 1.043814627239044,
"learning_rate": 7.99998366672524e-06,
"loss": 0.0001,
"num_tokens": 7400213.0,
"reward": 2.357463836669922,
"reward_std": 0.45996955037117004,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9802613258361816,
"rewards/probe_shaping_dominance/std": 0.0787685438990593,
"rewards/probe_terminal_raw/mean": 0.017403453588485718,
"rewards/probe_terminal_raw/std": 0.06857709586620331,
"rewards/rollout_reward_func/mean": -0.46520087122917175,
"rewards/rollout_reward_func/std": 0.23765753209590912,
"sampling/importance_sampling_ratio/max": 2.0903208255767822,
"sampling/importance_sampling_ratio/mean": 1.064300775527954,
"sampling/importance_sampling_ratio/min": 0.2817336320877075,
"sampling/sampling_logp_difference/max": 1.266794204711914,
"sampling/sampling_logp_difference/mean": 0.04518420994281769,
"step": 141,
"step_time": 27.64493636099951
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.10309219686314464,
"epoch": 0.00284,
"grad_norm": 0.01219659112393856,
"kl": 0.6812123054987751,
"learning_rate": 7.999983354134058e-06,
"loss": 0.0,
"step": 142,
"step_time": 11.569478897000408
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07752494711894542,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004447088576853275,
"kl": 0.28799188635699124,
"learning_rate": 7.999983038579937e-06,
"loss": -0.0002,
"num_tokens": 7502202.0,
"reward": 2.4029557704925537,
"reward_std": 0.41433292627334595,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4220443069934845,
"rewards/rollout_reward_func/std": 0.2347659468650818,
"sampling/importance_sampling_ratio/max": 2.925204277038574,
"sampling/importance_sampling_ratio/mean": 1.0200954675674438,
"sampling/importance_sampling_ratio/min": 0.2386324405670166,
"sampling/sampling_logp_difference/max": 1.4322543144226074,
"sampling/sampling_logp_difference/mean": 0.04332014173269272,
"step": 143,
"step_time": 27.17438340000035
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07426338468212634,
"epoch": 0.00288,
"grad_norm": 0.004469662439078093,
"kl": 0.2410876297701634,
"learning_rate": 7.999982720062878e-06,
"loss": -0.0002,
"step": 144,
"step_time": 12.213636597999539
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08634203940164298,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002921090926975012,
"kl": 0.230285348889538,
"learning_rate": 7.99998239858288e-06,
"loss": 0.0,
"num_tokens": 7607649.0,
"reward": 2.3042469024658203,
"reward_std": 0.4113651216030121,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5207530856132507,
"rewards/rollout_reward_func/std": 0.2033592164516449,
"sampling/importance_sampling_ratio/max": 1.081487774848938,
"sampling/importance_sampling_ratio/mean": 0.961658239364624,
"sampling/importance_sampling_ratio/min": 0.3403857946395874,
"sampling/sampling_logp_difference/max": 0.7405810356140137,
"sampling/sampling_logp_difference/mean": 0.02413717657327652,
"step": 145,
"step_time": 28.17627675400081
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.08849173790076748,
"epoch": 0.00292,
"grad_norm": 0.0025327985640615225,
"kl": 0.24220079024462393,
"learning_rate": 7.999982074139944e-06,
"loss": 0.0,
"step": 146,
"step_time": 11.552079900000535
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11541430978104472,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0037195596378296614,
"kl": 0.24169684358639643,
"learning_rate": 7.999981746734073e-06,
"loss": -0.0001,
"num_tokens": 7714926.0,
"reward": 2.362529754638672,
"reward_std": 0.3588845729827881,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9859415292739868,
"rewards/probe_shaping_dominance/std": 0.07952678948640823,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5265366435050964,
"rewards/rollout_reward_func/std": 0.2366112768650055,
"sampling/importance_sampling_ratio/max": 1.8165228366851807,
"sampling/importance_sampling_ratio/mean": 1.0579065084457397,
"sampling/importance_sampling_ratio/min": 0.4353120028972626,
"sampling/sampling_logp_difference/max": 0.826627790927887,
"sampling/sampling_logp_difference/mean": 0.04029189795255661,
"step": 147,
"step_time": 27.175546237000162
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.1161738510709256,
"epoch": 0.00296,
"grad_norm": 0.0037887210492044687,
"kl": 0.23712664423510432,
"learning_rate": 7.999981416365263e-06,
"loss": -0.0,
"step": 148,
"step_time": 12.20823843899916
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.105490946007194,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005545547232031822,
"kl": 0.10429394743793807,
"learning_rate": 7.999981083033518e-06,
"loss": -0.0,
"num_tokens": 7820271.0,
"reward": 2.2831099033355713,
"reward_std": 0.39255067706108093,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5418901443481445,
"rewards/rollout_reward_func/std": 0.2250201553106308,
"sampling/importance_sampling_ratio/max": 1.449048399925232,
"sampling/importance_sampling_ratio/mean": 0.9792050719261169,
"sampling/importance_sampling_ratio/min": 0.2817993760108948,
"sampling/sampling_logp_difference/max": 1.2665607929229736,
"sampling/sampling_logp_difference/mean": 0.03002801164984703,
"step": 149,
"step_time": 27.53580150099924
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.10439014286384918,
"epoch": 0.003,
"grad_norm": 0.00822756253182888,
"kl": 0.11194274778247859,
"learning_rate": 7.999980746738835e-06,
"loss": -0.0,
"step": 150,
"step_time": 11.669001740000112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1254521356895566,
"epoch": 0.00302,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008205846883356571,
"kl": 0.2568075335584581,
"learning_rate": 7.999980407481217e-06,
"loss": -0.0,
"num_tokens": 7922328.0,
"reward": 2.4083704948425293,
"reward_std": 0.3905543088912964,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9785879850387573,
"rewards/probe_shaping_dominance/std": 0.08963118493556976,
"rewards/probe_terminal_raw/mean": 0.0260416679084301,
"rewards/probe_terminal_raw/std": 0.1046360433101654,
"rewards/rollout_reward_func/mean": -0.45250916481018066,
"rewards/rollout_reward_func/std": 0.25463223457336426,
"sampling/importance_sampling_ratio/max": 1.165947437286377,
"sampling/importance_sampling_ratio/mean": 0.9090801477432251,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.9794785976409912,
"sampling/sampling_logp_difference/mean": 0.06048261374235153,
"step": 151,
"step_time": 25.965173581000272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.11868520639836788,
"epoch": 0.00304,
"grad_norm": 0.008953132666647434,
"kl": 0.6233456870540977,
"learning_rate": 7.999980065260663e-06,
"loss": -0.0001,
"step": 152,
"step_time": 12.843935258000784
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11048904561903328,
"epoch": 0.00306,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00968129187822342,
"kl": 0.14061896470107627,
"learning_rate": 7.999979720077173e-06,
"loss": -0.0,
"num_tokens": 8026423.0,
"reward": 2.419642925262451,
"reward_std": 0.30986252427101135,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9758727550506592,
"rewards/probe_shaping_dominance/std": 0.10938115417957306,
"rewards/probe_terminal_raw/mean": 0.0209603663533926,
"rewards/probe_terminal_raw/std": 0.09247327595949173,
"rewards/rollout_reward_func/mean": -0.49594029784202576,
"rewards/rollout_reward_func/std": 0.2378591150045395,
"sampling/importance_sampling_ratio/max": 1.1600902080535889,
"sampling/importance_sampling_ratio/mean": 0.9520583152770996,
"sampling/importance_sampling_ratio/min": 0.5003088712692261,
"sampling/sampling_logp_difference/max": 0.6657150983810425,
"sampling/sampling_logp_difference/mean": 0.025925474241375923,
"step": 153,
"step_time": 26.941947170000276
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.1122089575510472,
"epoch": 0.00308,
"grad_norm": 0.00867843721061945,
"kl": 0.15484224071647645,
"learning_rate": 7.99997937193075e-06,
"loss": -0.0,
"step": 154,
"step_time": 11.658896313999776
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0629729179199785,
"epoch": 0.0031,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003953923936933279,
"kl": 0.03362982640601331,
"learning_rate": 7.99997902082139e-06,
"loss": 0.0,
"num_tokens": 8134364.0,
"reward": 2.304103374481201,
"reward_std": 0.3902580142021179,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9852296113967896,
"rewards/probe_shaping_dominance/std": 0.08355414122343063,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5217512845993042,
"rewards/rollout_reward_func/std": 0.20511233806610107,
"sampling/importance_sampling_ratio/max": 1.2205973863601685,
"sampling/importance_sampling_ratio/mean": 0.9658781290054321,
"sampling/importance_sampling_ratio/min": 0.46778079867362976,
"sampling/sampling_logp_difference/max": 0.7597565650939941,
"sampling/sampling_logp_difference/mean": 0.021998237818479538,
"step": 155,
"step_time": 27.223922481999125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.06562891032081097,
"epoch": 0.00312,
"grad_norm": 0.004405137151479721,
"kl": 0.038039611198541934,
"learning_rate": 7.999978666749097e-06,
"loss": 0.0,
"step": 156,
"step_time": 12.512135376999595
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05827112344559282,
"epoch": 0.00314,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004053921438753605,
"kl": 0.22048271807530284,
"learning_rate": 7.99997830971387e-06,
"loss": -0.0,
"num_tokens": 8238748.0,
"reward": 2.4397072792053223,
"reward_std": 0.3176124691963196,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9892078638076782,
"rewards/probe_shaping_dominance/std": 0.061049580574035645,
"rewards/probe_terminal_raw/mean": 0.010670731775462627,
"rewards/probe_terminal_raw/std": 0.06036277487874031,
"rewards/rollout_reward_func/mean": -0.5101712346076965,
"rewards/rollout_reward_func/std": 0.20784814655780792,
"sampling/importance_sampling_ratio/max": 1.6952624320983887,
"sampling/importance_sampling_ratio/mean": 0.9711546301841736,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1546943187713623,
"sampling/sampling_logp_difference/mean": 0.03182876855134964,
"step": 157,
"step_time": 27.540745071999936
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.031250000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.058620097348466516,
"epoch": 0.00316,
"grad_norm": 0.0032319524325430393,
"kl": 0.2064171105599364,
"learning_rate": 7.999977949715709e-06,
"loss": -0.0,
"step": 158,
"step_time": 11.632630814000095
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08823958231369033,
"epoch": 0.00318,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005462405737489462,
"kl": 0.09290702206544665,
"learning_rate": 7.999977586754615e-06,
"loss": 0.0001,
"num_tokens": 8341164.0,
"reward": 2.443883180618286,
"reward_std": 0.2663474678993225,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9600480794906616,
"rewards/probe_shaping_dominance/std": 0.12650074064731598,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.48178985714912415,
"rewards/rollout_reward_func/std": 0.22425328195095062,
"sampling/importance_sampling_ratio/max": 1.382658839225769,
"sampling/importance_sampling_ratio/mean": 1.018369197845459,
"sampling/importance_sampling_ratio/min": 0.8050516247749329,
"sampling/sampling_logp_difference/max": 0.3240091800689697,
"sampling/sampling_logp_difference/mean": 0.023685907945036888,
"step": 159,
"step_time": 27.411928095999883
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.08342378272209316,
"epoch": 0.0032,
"grad_norm": 0.0198823194950819,
"kl": 0.08883899757620384,
"learning_rate": 7.999977220830588e-06,
"loss": 0.0001,
"step": 160,
"step_time": 12.353684361999967
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06228045103489421,
"epoch": 0.00322,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002511651022359729,
"kl": 0.1462944263475947,
"learning_rate": 7.999976851943628e-06,
"loss": -0.0,
"num_tokens": 8445224.0,
"reward": 2.391735076904297,
"reward_std": 0.3887004256248474,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4645148813724518,
"rewards/rollout_reward_func/std": 0.24512337148189545,
"sampling/importance_sampling_ratio/max": 1.2499885559082031,
"sampling/importance_sampling_ratio/mean": 0.964512288570404,
"sampling/importance_sampling_ratio/min": 0.2849932909011841,
"sampling/sampling_logp_difference/max": 1.2552961111068726,
"sampling/sampling_logp_difference/mean": 0.02673853561282158,
"step": 161,
"step_time": 26.90330324300021
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07126606599922525,
"epoch": 0.00324,
"grad_norm": 0.00517527898773551,
"kl": 0.13863739833080524,
"learning_rate": 7.999976480093737e-06,
"loss": -0.0,
"step": 162,
"step_time": 11.688447676000578
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07072257142863236,
"epoch": 0.00326,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004865987226366997,
"kl": 0.1391429503753443,
"learning_rate": 7.999976105280914e-06,
"loss": -0.0,
"num_tokens": 8551746.0,
"reward": 2.3334262371063232,
"reward_std": 0.42871803045272827,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9307848215103149,
"rewards/probe_shaping_dominance/std": 0.1679239422082901,
"rewards/probe_terminal_raw/mean": 0.07113821059465408,
"rewards/probe_terminal_raw/std": 0.1717527210712433,
"rewards/rollout_reward_func/mean": -0.5247467756271362,
"rewards/rollout_reward_func/std": 0.24572212994098663,
"sampling/importance_sampling_ratio/max": 1.3134804964065552,
"sampling/importance_sampling_ratio/mean": 1.0010151863098145,
"sampling/importance_sampling_ratio/min": 0.42815467715263367,
"sampling/sampling_logp_difference/max": 0.8482714891433716,
"sampling/sampling_logp_difference/mean": 0.01988227292895317,
"step": 163,
"step_time": 28.07267034399956
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07096506169182248,
"epoch": 0.00328,
"grad_norm": 0.004104274325072765,
"kl": 0.13155441358685493,
"learning_rate": 7.99997572750516e-06,
"loss": -0.0,
"step": 164,
"step_time": 11.647160391999023
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.078909770467817,
"epoch": 0.0033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004251962527632713,
"kl": 0.09027766038946083,
"learning_rate": 7.999975346766472e-06,
"loss": -0.0,
"num_tokens": 8658732.0,
"reward": 2.414771795272827,
"reward_std": 0.3757838010787964,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9784373044967651,
"rewards/probe_shaping_dominance/std": 0.08622659742832184,
"rewards/probe_terminal_raw/mean": 0.024517275393009186,
"rewards/probe_terminal_raw/std": 0.10027948766946793,
"rewards/rollout_reward_func/mean": -0.47568273544311523,
"rewards/rollout_reward_func/std": 0.19167323410511017,
"sampling/importance_sampling_ratio/max": 1.1542701721191406,
"sampling/importance_sampling_ratio/mean": 0.9669894576072693,
"sampling/importance_sampling_ratio/min": 0.6857547163963318,
"sampling/sampling_logp_difference/max": 0.37537309527397156,
"sampling/sampling_logp_difference/mean": 0.017938656732439995,
"step": 165,
"step_time": 27.2606650000007
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.07565992117088172,
"epoch": 0.00332,
"grad_norm": 0.006961170118302107,
"kl": 0.08890455095081506,
"learning_rate": 7.999974963064855e-06,
"loss": -0.0,
"step": 166,
"step_time": 11.698157390000233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07434030482545495,
"epoch": 0.00334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004832255654036999,
"kl": 0.15626501338783783,
"learning_rate": 7.999974576400308e-06,
"loss": -0.0,
"num_tokens": 8765380.0,
"reward": 2.2938361167907715,
"reward_std": 0.4383181631565094,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.960378885269165,
"rewards/probe_shaping_dominance/std": 0.12596461176872253,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.5071678757667542,
"rewards/rollout_reward_func/std": 0.2304636836051941,
"sampling/importance_sampling_ratio/max": 1.6727243661880493,
"sampling/importance_sampling_ratio/mean": 1.0108327865600586,
"sampling/importance_sampling_ratio/min": 0.4802703857421875,
"sampling/sampling_logp_difference/max": 0.737343966960907,
"sampling/sampling_logp_difference/mean": 0.023180868476629257,
"step": 167,
"step_time": 28.38192438599981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.0768249062821269,
"epoch": 0.00336,
"grad_norm": 0.0052077267318964005,
"kl": 0.15163502033101395,
"learning_rate": 7.999974186772832e-06,
"loss": -0.0,
"step": 168,
"step_time": 11.745391591000953
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11408041534014046,
"epoch": 0.00338,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005346886347979307,
"kl": 0.05663721589365878,
"learning_rate": 7.999973794182426e-06,
"loss": 0.0,
"num_tokens": 8871458.0,
"reward": 2.347496271133423,
"reward_std": 0.37117481231689453,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.984038233757019,
"rewards/probe_shaping_dominance/std": 0.09029316157102585,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.477167010307312,
"rewards/rollout_reward_func/std": 0.2263534814119339,
"sampling/importance_sampling_ratio/max": 1.2048081159591675,
"sampling/importance_sampling_ratio/mean": 0.967424750328064,
"sampling/importance_sampling_ratio/min": 0.7366955280303955,
"sampling/sampling_logp_difference/max": 0.3062773644924164,
"sampling/sampling_logp_difference/mean": 0.022138062864542007,
"step": 169,
"step_time": 26.940671711000505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.11010659678140655,
"epoch": 0.0034,
"grad_norm": 0.006358719430863857,
"kl": 0.05905036644250572,
"learning_rate": 7.99997339862909e-06,
"loss": 0.0,
"step": 170,
"step_time": 12.187279679998937
},
{
"clip_ratio/high_max": 0.06666666828095913,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.035416667349636555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06875000149011612,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10378921253141016,
"epoch": 0.00342,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004866220988333225,
"kl": 0.3513250324758701,
"learning_rate": 7.999973000112826e-06,
"loss": -0.0,
"num_tokens": 8977121.0,
"reward": 2.3662233352661133,
"reward_std": 0.36591798067092896,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.490026593208313,
"rewards/rollout_reward_func/std": 0.1713269054889679,
"sampling/importance_sampling_ratio/max": 2.4813146591186523,
"sampling/importance_sampling_ratio/mean": 1.0544798374176025,
"sampling/importance_sampling_ratio/min": 0.5539883375167847,
"sampling/sampling_logp_difference/max": 0.9087880849838257,
"sampling/sampling_logp_difference/mean": 0.04017889127135277,
"step": 171,
"step_time": 27.655318435999106
},
{
"clip_ratio/high_max": 0.06666666828095913,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04375000111758709,
"entropy": 0.10798206774052233,
"epoch": 0.00344,
"grad_norm": 0.012118767946958542,
"kl": 0.39312139721005224,
"learning_rate": 7.999972598633632e-06,
"loss": -0.0,
"step": 172,
"step_time": 11.631308623997938
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06574001582339406,
"epoch": 0.00346,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004147569183260202,
"kl": 0.01835462471728988,
"learning_rate": 7.999972194191514e-06,
"loss": 0.0001,
"num_tokens": 9080753.0,
"reward": 2.3741204738616943,
"reward_std": 0.33386632800102234,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9828835725784302,
"rewards/probe_shaping_dominance/std": 0.06857176870107651,
"rewards/probe_terminal_raw/mean": 0.016895325854420662,
"rewards/probe_terminal_raw/std": 0.067360520362854,
"rewards/rollout_reward_func/mean": -0.48190829157829285,
"rewards/rollout_reward_func/std": 0.23477764427661896,
"sampling/importance_sampling_ratio/max": 2.0903360843658447,
"sampling/importance_sampling_ratio/mean": 1.0450650453567505,
"sampling/importance_sampling_ratio/min": 0.8843300342559814,
"sampling/sampling_logp_difference/max": 0.7373225688934326,
"sampling/sampling_logp_difference/mean": 0.01723039150238037,
"step": 173,
"step_time": 26.502221221999207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.06844324560370296,
"epoch": 0.00348,
"grad_norm": 0.0040916260331869125,
"kl": 0.022212313354311064,
"learning_rate": 7.999971786786465e-06,
"loss": 0.0001,
"step": 174,
"step_time": 11.897189610000169
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07524242554791272,
"epoch": 0.0035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005887735169380903,
"kl": 0.22349138231948018,
"learning_rate": 7.99997137641849e-06,
"loss": -0.0,
"num_tokens": 9185715.0,
"reward": 2.4274468421936035,
"reward_std": 0.30020296573638916,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9728903770446777,
"rewards/probe_shaping_dominance/std": 0.10675826668739319,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.46419334411621094,
"rewards/rollout_reward_func/std": 0.211602121591568,
"sampling/importance_sampling_ratio/max": 1.1768231391906738,
"sampling/importance_sampling_ratio/mean": 0.9632406830787659,
"sampling/importance_sampling_ratio/min": 0.32605040073394775,
"sampling/sampling_logp_difference/max": 1.1148320436477661,
"sampling/sampling_logp_difference/mean": 0.02662883885204792,
"step": 175,
"step_time": 27.599337874999037
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.0704325451515615,
"epoch": 0.00352,
"grad_norm": 0.004202236421406269,
"kl": 0.2313449110952206,
"learning_rate": 7.999970963087587e-06,
"loss": -0.0,
"step": 176,
"step_time": 11.622392715999013
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.13769991835579276,
"epoch": 0.00354,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008693602867424488,
"kl": 0.1878440118744038,
"learning_rate": 7.99997054679376e-06,
"loss": -0.0001,
"num_tokens": 9289277.0,
"reward": 2.358966588973999,
"reward_std": 0.3925982713699341,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9915453791618347,
"rewards/probe_shaping_dominance/std": 0.04782645031809807,
"rewards/probe_terminal_raw/mean": 0.006986788474023342,
"rewards/probe_terminal_raw/std": 0.03952324390411377,
"rewards/rollout_reward_func/mean": -0.4958154261112213,
"rewards/rollout_reward_func/std": 0.18107342720031738,
"sampling/importance_sampling_ratio/max": 1.5426419973373413,
"sampling/importance_sampling_ratio/mean": 0.9988285303115845,
"sampling/importance_sampling_ratio/min": 0.43416687846183777,
"sampling/sampling_logp_difference/max": 0.5040676593780518,
"sampling/sampling_logp_difference/mean": 0.04200742021203041,
"step": 177,
"step_time": 27.01749301200016
},
{
"clip_ratio/high_max": 0.06666666828095913,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04375000111758709,
"entropy": 0.13739392068237066,
"epoch": 0.00356,
"grad_norm": 0.00472621712833643,
"kl": 0.1952200917294249,
"learning_rate": 7.999970127537005e-06,
"loss": -0.0001,
"step": 178,
"step_time": 12.335309556999164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07692393008619547,
"epoch": 0.00358,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028508227318525314,
"kl": 5.385302404543381,
"learning_rate": 7.999969705317325e-06,
"loss": 0.0001,
"num_tokens": 9389166.0,
"reward": 2.4562783241271973,
"reward_std": 0.2598528265953064,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4624716639518738,
"rewards/rollout_reward_func/std": 0.18222831189632416,
"sampling/importance_sampling_ratio/max": 1.1737666130065918,
"sampling/importance_sampling_ratio/mean": 0.9517749547958374,
"sampling/importance_sampling_ratio/min": 0.2871549129486084,
"sampling/sampling_logp_difference/max": 1.2477340698242188,
"sampling/sampling_logp_difference/mean": 0.038661930710077286,
"step": 179,
"step_time": 26.827888970999993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.08128447085618973,
"epoch": 0.0036,
"grad_norm": 0.00963876023888588,
"kl": 2.0179060684172327,
"learning_rate": 7.99996928013472e-06,
"loss": 0.0001,
"step": 180,
"step_time": 11.37347329900058
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10767973656766117,
"epoch": 0.00362,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010538225993514061,
"kl": 1.1300342498579994,
"learning_rate": 7.999968851989192e-06,
"loss": 0.0,
"num_tokens": 9494689.0,
"reward": 2.297545909881592,
"reward_std": 0.3879827558994293,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.98197340965271,
"rewards/probe_shaping_dominance/std": 0.10197389870882034,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5250524282455444,
"rewards/rollout_reward_func/std": 0.19950900971889496,
"sampling/importance_sampling_ratio/max": 1.4360560178756714,
"sampling/importance_sampling_ratio/mean": 0.9875404834747314,
"sampling/importance_sampling_ratio/min": 0.18539370596408844,
"sampling/sampling_logp_difference/max": 1.6852741241455078,
"sampling/sampling_logp_difference/mean": 0.049665287137031555,
"step": 181,
"step_time": 26.69406858900038
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.11389242531731725,
"epoch": 0.00364,
"grad_norm": 0.003970544785261154,
"kl": 0.6293696188367903,
"learning_rate": 7.999968420880736e-06,
"loss": 0.0,
"step": 182,
"step_time": 12.197639549000996
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09248453052714467,
"epoch": 0.00366,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0045459093526005745,
"kl": 0.13789485239249188,
"learning_rate": 7.99996798680936e-06,
"loss": -0.0001,
"num_tokens": 9599380.0,
"reward": 2.4226768016815186,
"reward_std": 0.3249405324459076,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9769073724746704,
"rewards/probe_shaping_dominance/std": 0.09389247745275497,
"rewards/probe_terminal_raw/mean": 0.024263210594654083,
"rewards/probe_terminal_raw/std": 0.09960746020078659,
"rewards/rollout_reward_func/mean": -0.4659937620162964,
"rewards/rollout_reward_func/std": 0.19758032262325287,
"sampling/importance_sampling_ratio/max": 1.1653671264648438,
"sampling/importance_sampling_ratio/mean": 0.9370558261871338,
"sampling/importance_sampling_ratio/min": 0.46233388781547546,
"sampling/sampling_logp_difference/max": 0.7714686393737793,
"sampling/sampling_logp_difference/mean": 0.038370583206415176,
"step": 183,
"step_time": 26.904572651000308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.0933451559394598,
"epoch": 0.00368,
"grad_norm": 0.004598891828209162,
"kl": 0.12106670817593113,
"learning_rate": 7.999967549775057e-06,
"loss": -0.0001,
"step": 184,
"step_time": 11.607436572001461
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.15387224033474922,
"epoch": 0.0037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010245956480503082,
"kl": 0.5030446688178927,
"learning_rate": 7.999967109777834e-06,
"loss": -0.0,
"num_tokens": 9707382.0,
"reward": 2.4315314292907715,
"reward_std": 0.47317853569984436,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 0.9855233430862427,
"rewards/probe_shaping_dominance/std": 0.0818924754858017,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4883667528629303,
"rewards/rollout_reward_func/std": 0.20319455862045288,
"sampling/importance_sampling_ratio/max": 1.2542879581451416,
"sampling/importance_sampling_ratio/mean": 0.9586943984031677,
"sampling/importance_sampling_ratio/min": 0.3715563118457794,
"sampling/sampling_logp_difference/max": 0.9900554418563843,
"sampling/sampling_logp_difference/mean": 0.04447564482688904,
"step": 185,
"step_time": 27.28605421100019
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.056250001303851604,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06875000149011612,
"entropy": 0.15163114294409752,
"epoch": 0.00372,
"grad_norm": 0.0044283876195549965,
"kl": 0.7128359689377248,
"learning_rate": 7.999966666817687e-06,
"loss": -0.0,
"step": 186,
"step_time": 12.221499876998678
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1375539805740118,
"epoch": 0.00374,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006928480230271816,
"kl": 0.14416655764216557,
"learning_rate": 7.999966220894617e-06,
"loss": -0.0,
"num_tokens": 9814422.0,
"reward": 2.40926456451416,
"reward_std": 0.47349250316619873,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9848357439041138,
"rewards/probe_shaping_dominance/std": 0.08578190207481384,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.3849461078643799,
"rewards/rollout_reward_func/std": 0.28888392448425293,
"sampling/importance_sampling_ratio/max": 1.243560791015625,
"sampling/importance_sampling_ratio/mean": 0.9681116342544556,
"sampling/importance_sampling_ratio/min": 0.665830671787262,
"sampling/sampling_logp_difference/max": 0.37914347648620605,
"sampling/sampling_logp_difference/mean": 0.03084658458828926,
"step": 187,
"step_time": 28.931869071998335
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.13996880408376455,
"epoch": 0.00376,
"grad_norm": 0.009369016624987125,
"kl": 0.15229893615469337,
"learning_rate": 7.999965772008627e-06,
"loss": -0.0,
"step": 188,
"step_time": 11.766830096999001
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.035416667349636555,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10358174092834815,
"epoch": 0.00378,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03559152036905289,
"kl": 0.39252137734001735,
"learning_rate": 7.999965320159715e-06,
"loss": 0.0,
"num_tokens": 9914246.0,
"reward": 2.483328342437744,
"reward_std": 0.3890749216079712,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.950668215751648,
"rewards/probe_shaping_dominance/std": 0.1378525346517563,
"rewards/probe_terminal_raw/mean": 0.056783534586429596,
"rewards/probe_terminal_raw/std": 0.15526829659938812,
"rewards/rollout_reward_func/mean": -0.44287341833114624,
"rewards/rollout_reward_func/std": 0.26299041509628296,
"sampling/importance_sampling_ratio/max": 1.2944039106369019,
"sampling/importance_sampling_ratio/mean": 0.9779493808746338,
"sampling/importance_sampling_ratio/min": 0.5075531005859375,
"sampling/sampling_logp_difference/max": 0.6781981587409973,
"sampling/sampling_logp_difference/mean": 0.026978708803653717,
"step": 189,
"step_time": 27.036868832000437
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"entropy": 0.10481282410910353,
"epoch": 0.0038,
"grad_norm": 0.0055263713002204895,
"kl": 0.39047255569312256,
"learning_rate": 7.999964865347883e-06,
"loss": 0.0001,
"step": 190,
"step_time": 11.940458628999295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1198381851427257,
"epoch": 0.00382,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0030459309928119183,
"kl": 0.34787876208429225,
"learning_rate": 7.999964407573131e-06,
"loss": 0.0,
"num_tokens": 10017338.0,
"reward": 2.2820868492126465,
"reward_std": 0.4749685525894165,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.966022253036499,
"rewards/probe_shaping_dominance/std": 0.13379566371440887,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.47768545150756836,
"rewards/rollout_reward_func/std": 0.2837761640548706,
"sampling/importance_sampling_ratio/max": 1.7857273817062378,
"sampling/importance_sampling_ratio/mean": 1.0156748294830322,
"sampling/importance_sampling_ratio/min": 0.514444887638092,
"sampling/sampling_logp_difference/max": 0.6646687984466553,
"sampling/sampling_logp_difference/mean": 0.03443087264895439,
"step": 191,
"step_time": 27.4936487089999
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.12180041195824742,
"epoch": 0.00384,
"grad_norm": 0.009600832127034664,
"kl": 0.3496675969581702,
"learning_rate": 7.999963946835458e-06,
"loss": 0.0,
"step": 192,
"step_time": 11.71437842100022
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07277914439328015,
"epoch": 0.00386,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005412722937762737,
"kl": 0.6871760921980012,
"learning_rate": 7.999963483134866e-06,
"loss": 0.0001,
"num_tokens": 10123551.0,
"reward": 2.4312024116516113,
"reward_std": 0.31741824746131897,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.954460859298706,
"rewards/probe_shaping_dominance/std": 0.1439775824546814,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.48888325691223145,
"rewards/rollout_reward_func/std": 0.2712078392505646,
"sampling/importance_sampling_ratio/max": 1.8100159168243408,
"sampling/importance_sampling_ratio/mean": 1.0015695095062256,
"sampling/importance_sampling_ratio/min": 0.4417291283607483,
"sampling/sampling_logp_difference/max": 0.817058801651001,
"sampling/sampling_logp_difference/mean": 0.03453746810555458,
"step": 193,
"step_time": 26.960456193001846
},
{
"clip_ratio/high_max": 0.06250000186264515,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0416666679084301,
"entropy": 0.07823239883873612,
"epoch": 0.00388,
"grad_norm": 0.01935429498553276,
"kl": 0.6307496229807157,
"learning_rate": 7.999963016471355e-06,
"loss": 0.0001,
"step": 194,
"step_time": 12.808481609999944
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02500000037252903,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09379608882591128,
"epoch": 0.0039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0030668650288134813,
"kl": 0.2814688477665186,
"learning_rate": 7.999962546844924e-06,
"loss": 0.0001,
"num_tokens": 10225590.0,
"reward": 2.361347198486328,
"reward_std": 0.32310429215431213,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9668688178062439,
"rewards/probe_shaping_dominance/std": 0.13289061188697815,
"rewards/probe_terminal_raw/mean": 0.028328251093626022,
"rewards/probe_terminal_raw/std": 0.11210102587938309,
"rewards/rollout_reward_func/mean": -0.49010002613067627,
"rewards/rollout_reward_func/std": 0.24613085389137268,
"sampling/importance_sampling_ratio/max": 1.3004266023635864,
"sampling/importance_sampling_ratio/mean": 0.9684375524520874,
"sampling/importance_sampling_ratio/min": 0.5094537734985352,
"sampling/sampling_logp_difference/max": 0.6744171380996704,
"sampling/sampling_logp_difference/mean": 0.028151309117674828,
"step": 195,
"step_time": 25.599357043000964
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.09113423456437886,
"epoch": 0.00392,
"grad_norm": 0.003771732561290264,
"kl": 0.27635849734906515,
"learning_rate": 7.999962074255578e-06,
"loss": 0.0001,
"step": 196,
"step_time": 11.204666337999697
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04920864764972066,
"epoch": 0.00394,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0027844668366014957,
"kl": 0.3839081407932099,
"learning_rate": 7.999961598703312e-06,
"loss": -0.0,
"num_tokens": 10330063.0,
"reward": 2.415410041809082,
"reward_std": 0.4154632091522217,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9844269156455994,
"rewards/probe_shaping_dominance/std": 0.08809469640254974,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.44089192152023315,
"rewards/rollout_reward_func/std": 0.2551630139350891,
"sampling/importance_sampling_ratio/max": 1.1653680801391602,
"sampling/importance_sampling_ratio/mean": 0.9744973182678223,
"sampling/importance_sampling_ratio/min": 0.20111165940761566,
"sampling/sampling_logp_difference/max": 1.6039009094238281,
"sampling/sampling_logp_difference/mean": 0.030936850234866142,
"step": 197,
"step_time": 26.98998093100181
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.0472417699656944,
"epoch": 0.00396,
"grad_norm": 0.0009492259123362601,
"kl": 0.3996036083844956,
"learning_rate": 7.99996112018813e-06,
"loss": -0.0,
"step": 198,
"step_time": 12.02342930299983
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06072818394750357,
"epoch": 0.00398,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0025375511031597853,
"kl": 0.2914491758947406,
"learning_rate": 7.999960638710032e-06,
"loss": 0.0,
"num_tokens": 10431419.0,
"reward": 2.499394178390503,
"reward_std": 0.29632288217544556,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9628201127052307,
"rewards/probe_shaping_dominance/std": 0.12069481611251831,
"rewards/probe_terminal_raw/mean": 0.04026930779218674,
"rewards/probe_terminal_raw/std": 0.13092826306819916,
"rewards/rollout_reward_func/mean": -0.42244523763656616,
"rewards/rollout_reward_func/std": 0.24739933013916016,
"sampling/importance_sampling_ratio/max": 1.3507100343704224,
"sampling/importance_sampling_ratio/mean": 1.0147151947021484,
"sampling/importance_sampling_ratio/min": 0.9091832637786865,
"sampling/sampling_logp_difference/max": 0.338870108127594,
"sampling/sampling_logp_difference/mean": 0.010294873267412186,
"step": 199,
"step_time": 27.086743224999736
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.058446566108614206,
"epoch": 0.004,
"grad_norm": 0.0024834321811795235,
"kl": 0.2936624846115592,
"learning_rate": 7.999960154269017e-06,
"loss": 0.0,
"step": 200,
"step_time": 11.463394613998389
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09987628925591707,
"epoch": 0.00402,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0032795185688883066,
"kl": 0.33637799334246665,
"learning_rate": 7.999959666865086e-06,
"loss": -0.0,
"num_tokens": 10533498.0,
"reward": 2.4651217460632324,
"reward_std": 0.32078394293785095,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9926146268844604,
"rewards/probe_shaping_dominance/std": 0.04177792742848396,
"rewards/probe_terminal_raw/mean": 0.008003048598766327,
"rewards/probe_terminal_raw/std": 0.04527207836508751,
"rewards/rollout_reward_func/mean": -0.4229958653450012,
"rewards/rollout_reward_func/std": 0.19672146439552307,
"sampling/importance_sampling_ratio/max": 1.195106863975525,
"sampling/importance_sampling_ratio/mean": 0.9418940544128418,
"sampling/importance_sampling_ratio/min": 0.318993479013443,
"sampling/sampling_logp_difference/max": 0.9258831739425659,
"sampling/sampling_logp_difference/mean": 0.038004204630851746,
"step": 201,
"step_time": 26.624555751001026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.10252866102382541,
"epoch": 0.00404,
"grad_norm": 0.0035051219165325165,
"kl": 0.3395325805176981,
"learning_rate": 7.99995917649824e-06,
"loss": -0.0,
"step": 202,
"step_time": 12.736442242999146
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10787439718842506,
"epoch": 0.00406,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00344108697026968,
"kl": 0.40611333276319783,
"learning_rate": 7.999958683168479e-06,
"loss": 0.0,
"num_tokens": 10637062.0,
"reward": 2.5038881301879883,
"reward_std": 0.22744759917259216,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4148617684841156,
"rewards/rollout_reward_func/std": 0.18833006918430328,
"sampling/importance_sampling_ratio/max": 1.1548116207122803,
"sampling/importance_sampling_ratio/mean": 0.9753589630126953,
"sampling/importance_sampling_ratio/min": 0.7033773064613342,
"sampling/sampling_logp_difference/max": 0.35186219215393066,
"sampling/sampling_logp_difference/mean": 0.019522543996572495,
"step": 203,
"step_time": 26.715982574999543
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.1049330742098391,
"epoch": 0.00408,
"grad_norm": 0.0019796311389654875,
"kl": 0.4593061124905944,
"learning_rate": 7.999958186875805e-06,
"loss": -0.0,
"step": 204,
"step_time": 11.646448757999678
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 3.0,
"completions/mean_terminated_length": 3.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.055373367242282256,
"epoch": 0.0041,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006926523055881262,
"kl": 0.05150494979155518,
"learning_rate": 7.999957687620215e-06,
"loss": -0.0,
"num_tokens": 10738428.0,
"reward": 2.550138473510742,
"reward_std": 0.22538912296295166,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.0,
"rewards/probe_completion_length/std": 0.0,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9910282492637634,
"rewards/probe_shaping_dominance/std": 0.05075191706418991,
"rewards/probe_terminal_raw/mean": 0.00940040685236454,
"rewards/probe_terminal_raw/std": 0.05317673459649086,
"rewards/rollout_reward_func/mean": -0.4002901315689087,
"rewards/rollout_reward_func/std": 0.22546610236167908,
"sampling/importance_sampling_ratio/max": 1.2517437934875488,
"sampling/importance_sampling_ratio/mean": 0.9799097180366516,
"sampling/importance_sampling_ratio/min": 0.5997620224952698,
"sampling/sampling_logp_difference/max": 0.5112212896347046,
"sampling/sampling_logp_difference/mean": 0.01782449334859848,
"step": 205,
"step_time": 26.197072295999533
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.06316551179043017,
"epoch": 0.00412,
"grad_norm": 0.0017257543513551354,
"kl": 0.053950335964449536,
"learning_rate": 7.999957185401714e-06,
"loss": -0.0,
"step": 206,
"step_time": 12.549622151999756
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11197321023792028,
"epoch": 0.00414,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031097499653697014,
"kl": 3.89400917571038,
"learning_rate": 7.9999566802203e-06,
"loss": 0.0001,
"num_tokens": 10840689.0,
"reward": 2.345735549926758,
"reward_std": 0.5137441754341125,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 0.9725180268287659,
"rewards/probe_shaping_dominance/std": 0.10821773111820221,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.5142826437950134,
"rewards/rollout_reward_func/std": 0.19152681529521942,
"sampling/importance_sampling_ratio/max": 1.9924818277359009,
"sampling/importance_sampling_ratio/mean": 0.9943416118621826,
"sampling/importance_sampling_ratio/min": 0.39203470945358276,
"sampling/sampling_logp_difference/max": 0.9361467361450195,
"sampling/sampling_logp_difference/mean": 0.053312450647354126,
"step": 207,
"step_time": 26.643122880999726
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.035416667349636555,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.058333334513008595,
"entropy": 0.11182145914062858,
"epoch": 0.00416,
"grad_norm": 0.007240073289722204,
"kl": 1.6443076208233833,
"learning_rate": 7.999956172075974e-06,
"loss": 0.0,
"step": 208,
"step_time": 11.64378536300228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.12927352613769472,
"epoch": 0.00418,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004229975864291191,
"kl": 0.6016647743063004,
"learning_rate": 7.999955660968735e-06,
"loss": -0.0,
"num_tokens": 10944113.0,
"reward": 2.364624261856079,
"reward_std": 0.36824679374694824,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.49162572622299194,
"rewards/rollout_reward_func/std": 0.21871674060821533,
"sampling/importance_sampling_ratio/max": 1.3223011493682861,
"sampling/importance_sampling_ratio/mean": 0.9632259607315063,
"sampling/importance_sampling_ratio/min": 0.3602616786956787,
"sampling/sampling_logp_difference/max": 0.6850378513336182,
"sampling/sampling_logp_difference/mean": 0.04301746189594269,
"step": 209,
"step_time": 26.264724693000062
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.12698473082855344,
"epoch": 0.0042,
"grad_norm": 0.004611098673194647,
"kl": 0.6409582832593514,
"learning_rate": 7.999955146898586e-06,
"loss": -0.0001,
"step": 210,
"step_time": 12.728916892999223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04340869339648634,
"epoch": 0.00422,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0008402821840718389,
"kl": 0.035792879805057964,
"learning_rate": 7.999954629865525e-06,
"loss": -0.0,
"num_tokens": 11047946.0,
"reward": 2.3281283378601074,
"reward_std": 0.43589621782302856,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9857661724090576,
"rewards/probe_shaping_dominance/std": 0.08051877468824387,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4982629418373108,
"rewards/rollout_reward_func/std": 0.20851053297519684,
"sampling/importance_sampling_ratio/max": 1.0012203454971313,
"sampling/importance_sampling_ratio/mean": 0.9677799940109253,
"sampling/importance_sampling_ratio/min": 0.4670157730579376,
"sampling/sampling_logp_difference/max": 0.7613925933837891,
"sampling/sampling_logp_difference/mean": 0.014532409608364105,
"step": 211,
"step_time": 26.491312149000805
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.04452452051918954,
"epoch": 0.00424,
"grad_norm": 0.0009245733381249011,
"kl": 0.039327465879523515,
"learning_rate": 7.999954109869554e-06,
"loss": -0.0,
"step": 212,
"step_time": 11.690953868999713
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10158436209894717,
"epoch": 0.00426,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0037761996500194073,
"kl": 0.43266808055341244,
"learning_rate": 7.999953586910674e-06,
"loss": -0.0,
"num_tokens": 11155145.0,
"reward": 2.33209490776062,
"reward_std": 0.3974522352218628,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9518005847930908,
"rewards/probe_shaping_dominance/std": 0.15248610079288483,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.4603305459022522,
"rewards/rollout_reward_func/std": 0.2795467674732208,
"sampling/importance_sampling_ratio/max": 1.5568536520004272,
"sampling/importance_sampling_ratio/mean": 1.0121254920959473,
"sampling/importance_sampling_ratio/min": 0.6084503531455994,
"sampling/sampling_logp_difference/max": 0.49602431058883667,
"sampling/sampling_logp_difference/mean": 0.017653338611125946,
"step": 213,
"step_time": 26.773649626000406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.10326709412038326,
"epoch": 0.00428,
"grad_norm": 0.004299989901483059,
"kl": 0.4246340822428465,
"learning_rate": 7.999953060988884e-06,
"loss": 0.0,
"step": 214,
"step_time": 12.393828191000466
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10672931908629835,
"epoch": 0.0043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0029812948778271675,
"kl": 0.5036190063692629,
"learning_rate": 7.999952532104185e-06,
"loss": 0.0,
"num_tokens": 11256499.0,
"reward": 2.3668174743652344,
"reward_std": 0.4220028221607208,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.984410285949707,
"rewards/probe_shaping_dominance/std": 0.08818867057561874,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.458217978477478,
"rewards/rollout_reward_func/std": 0.1993042230606079,
"sampling/importance_sampling_ratio/max": 1.2048288583755493,
"sampling/importance_sampling_ratio/mean": 0.9700103998184204,
"sampling/importance_sampling_ratio/min": 0.2804865837097168,
"sampling/sampling_logp_difference/max": 1.2170777320861816,
"sampling/sampling_logp_difference/mean": 0.027440235018730164,
"step": 215,
"step_time": 26.241349470000387
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.10663612652570009,
"epoch": 0.00432,
"grad_norm": 0.0025962339714169502,
"kl": 0.514960631611757,
"learning_rate": 7.99995200025658e-06,
"loss": 0.0,
"step": 216,
"step_time": 11.455195212000945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.13120519556105137,
"epoch": 0.00434,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00685320096090436,
"kl": 0.5306107758951839,
"learning_rate": 7.999951465446065e-06,
"loss": 0.0,
"num_tokens": 11358760.0,
"reward": 2.4137301445007324,
"reward_std": 0.38182157278060913,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9719860553741455,
"rewards/probe_shaping_dominance/std": 0.1105431467294693,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.4770059883594513,
"rewards/rollout_reward_func/std": 0.27089011669158936,
"sampling/importance_sampling_ratio/max": 1.8946123123168945,
"sampling/importance_sampling_ratio/mean": 1.0106232166290283,
"sampling/importance_sampling_ratio/min": 0.6873172521591187,
"sampling/sampling_logp_difference/max": 0.6602880954742432,
"sampling/sampling_logp_difference/mean": 0.026765936985611916,
"step": 217,
"step_time": 28.19653884499894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.1328302058391273,
"epoch": 0.00436,
"grad_norm": 0.006467514205724001,
"kl": 0.5236879177391529,
"learning_rate": 7.999950927672645e-06,
"loss": 0.0,
"step": 218,
"step_time": 11.548230411000986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0558876832947135,
"epoch": 0.00438,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002873801626265049,
"kl": 0.43705418131622764,
"learning_rate": 7.999950386936317e-06,
"loss": 0.0001,
"num_tokens": 11459134.0,
"reward": 2.4926953315734863,
"reward_std": 0.2576614320278168,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9532788395881653,
"rewards/probe_shaping_dominance/std": 0.1259964108467102,
"rewards/probe_terminal_raw/mean": 0.049288615584373474,
"rewards/probe_terminal_raw/std": 0.13439743220806122,
"rewards/rollout_reward_func/mean": -0.4286222755908966,
"rewards/rollout_reward_func/std": 0.13808076083660126,
"sampling/importance_sampling_ratio/max": 2.167020320892334,
"sampling/importance_sampling_ratio/mean": 1.0488494634628296,
"sampling/importance_sampling_ratio/min": 0.5981054306030273,
"sampling/sampling_logp_difference/max": 0.773352861404419,
"sampling/sampling_logp_difference/mean": 0.021742573007941246,
"step": 219,
"step_time": 26.59079552000003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.05215576570481062,
"epoch": 0.0044,
"grad_norm": 0.013386573642492294,
"kl": 0.4328960892962641,
"learning_rate": 7.999949843237083e-06,
"loss": 0.0001,
"step": 220,
"step_time": 11.575578054999824
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 3.0,
"completions/mean_terminated_length": 3.0,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.10125815495848656,
"epoch": 0.00442,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003916088026016951,
"kl": 0.22720737754934817,
"learning_rate": 7.999949296574944e-06,
"loss": 0.0,
"num_tokens": 11564110.0,
"reward": 2.5024495124816895,
"reward_std": 0.21472422778606415,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 2.0,
"rewards/probe_completion_length/std": 0.0,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9687739014625549,
"rewards/probe_shaping_dominance/std": 0.12289554625749588,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.4475744962692261,
"rewards/rollout_reward_func/std": 0.21471136808395386,
"sampling/importance_sampling_ratio/max": 1.2565096616744995,
"sampling/importance_sampling_ratio/mean": 0.9851142168045044,
"sampling/importance_sampling_ratio/min": 0.7785980701446533,
"sampling/sampling_logp_difference/max": 0.25026071071624756,
"sampling/sampling_logp_difference/mean": 0.014336168766021729,
"step": 221,
"step_time": 28.308517722000943
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.10386610007844865,
"epoch": 0.00444,
"grad_norm": 0.0038715400733053684,
"kl": 0.2309217918664217,
"learning_rate": 7.9999487469499e-06,
"loss": 0.0,
"step": 222,
"step_time": 11.59164219199829
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08839935716241598,
"epoch": 0.00446,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0031392991077154875,
"kl": 0.3969584498627228,
"learning_rate": 7.999948194361951e-06,
"loss": 0.0,
"num_tokens": 11670791.0,
"reward": 2.504007339477539,
"reward_std": 0.40813401341438293,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.4459925591945648,
"rewards/rollout_reward_func/std": 0.22923637926578522,
"sampling/importance_sampling_ratio/max": 1.2424126863479614,
"sampling/importance_sampling_ratio/mean": 1.0054875612258911,
"sampling/importance_sampling_ratio/min": 0.8022926449775696,
"sampling/sampling_logp_difference/max": 0.2571254372596741,
"sampling/sampling_logp_difference/mean": 0.01522812806069851,
"step": 223,
"step_time": 27.01184939599989
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.08901654137298465,
"epoch": 0.00448,
"grad_norm": 0.0026675413828343153,
"kl": 0.3970091380215308,
"learning_rate": 7.999947638811098e-06,
"loss": 0.0,
"step": 224,
"step_time": 12.880684480999662
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06860345043241978,
"epoch": 0.0045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005898882634937763,
"kl": 0.2994147054851055,
"learning_rate": 7.999947080297344e-06,
"loss": 0.0001,
"num_tokens": 11778059.0,
"reward": 2.442521095275879,
"reward_std": 0.44092267751693726,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.3535533845424652,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5387288331985474,
"rewards/rollout_reward_func/std": 0.17624567449092865,
"sampling/importance_sampling_ratio/max": 1.9132263660430908,
"sampling/importance_sampling_ratio/mean": 1.0267926454544067,
"sampling/importance_sampling_ratio/min": 0.2760489583015442,
"sampling/sampling_logp_difference/max": 1.2855275869369507,
"sampling/sampling_logp_difference/mean": 0.03292452543973923,
"step": 225,
"step_time": 26.894577987999583
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.0715375836007297,
"epoch": 0.00452,
"grad_norm": 0.004127421882003546,
"kl": 0.2991956745972857,
"learning_rate": 7.999946518820686e-06,
"loss": 0.0001,
"step": 226,
"step_time": 11.7451522450001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07084862189367414,
"epoch": 0.00454,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007534612435847521,
"kl": 0.3083134523330955,
"learning_rate": 7.999945954381125e-06,
"loss": -0.0,
"num_tokens": 11885416.0,
"reward": 2.2896175384521484,
"reward_std": 0.4199885129928589,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9857305884361267,
"rewards/probe_shaping_dominance/std": 0.080719955265522,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.5367380380630493,
"rewards/rollout_reward_func/std": 0.2644577920436859,
"sampling/importance_sampling_ratio/max": 1.2167645692825317,
"sampling/importance_sampling_ratio/mean": 0.9729256629943848,
"sampling/importance_sampling_ratio/min": 0.5702285766601562,
"sampling/sampling_logp_difference/max": 0.556563138961792,
"sampling/sampling_logp_difference/mean": 0.01854308322072029,
"step": 227,
"step_time": 26.478597906999312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07101618498563766,
"epoch": 0.00456,
"grad_norm": 0.005244475323706865,
"kl": 0.275350460462505,
"learning_rate": 7.999945386978663e-06,
"loss": -0.0,
"step": 228,
"step_time": 12.815234450999014
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.10753743472741917,
"epoch": 0.00458,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002793548395857215,
"kl": 0.3363812413687519,
"learning_rate": 7.999944816613299e-06,
"loss": 0.0,
"num_tokens": 11990346.0,
"reward": 2.4647884368896484,
"reward_std": 0.3218696117401123,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9882341623306274,
"rewards/probe_shaping_dominance/std": 0.06655776500701904,
"rewards/probe_terminal_raw/mean": 0.011559959501028061,
"rewards/probe_terminal_raw/std": 0.06539300829172134,
"rewards/rollout_reward_func/mean": -0.45375561714172363,
"rewards/rollout_reward_func/std": 0.26721474528312683,
"sampling/importance_sampling_ratio/max": 1.7522544860839844,
"sampling/importance_sampling_ratio/mean": 1.0056817531585693,
"sampling/importance_sampling_ratio/min": 0.39151322841644287,
"sampling/sampling_logp_difference/max": 0.9377517700195312,
"sampling/sampling_logp_difference/mean": 0.030310627073049545,
"step": 229,
"step_time": 26.652824122999846
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.10178712871856987,
"epoch": 0.0046,
"grad_norm": 0.0023058054503053427,
"kl": 0.3472972925131521,
"learning_rate": 7.999944243285035e-06,
"loss": 0.0,
"step": 230,
"step_time": 11.641791465999631
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11396907176822424,
"epoch": 0.00462,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0053448486141860485,
"kl": 0.23751085135154426,
"learning_rate": 7.999943666993872e-06,
"loss": -0.0,
"num_tokens": 12094123.0,
"reward": 2.3231983184814453,
"reward_std": 0.4537913501262665,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9484962224960327,
"rewards/probe_shaping_dominance/std": 0.14161793887615204,
"rewards/probe_terminal_raw/mean": 0.05538617819547653,
"rewards/probe_terminal_raw/std": 0.15303537249565125,
"rewards/rollout_reward_func/mean": -0.4744342267513275,
"rewards/rollout_reward_func/std": 0.27888038754463196,
"sampling/importance_sampling_ratio/max": 1.2306643724441528,
"sampling/importance_sampling_ratio/mean": 0.9789013862609863,
"sampling/importance_sampling_ratio/min": 0.5588669180870056,
"sampling/sampling_logp_difference/max": 0.5087692737579346,
"sampling/sampling_logp_difference/mean": 0.027260489761829376,
"step": 231,
"step_time": 27.108853302998796
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.11494649667292833,
"epoch": 0.00464,
"grad_norm": 0.0034225336275994778,
"kl": 0.2446515706833452,
"learning_rate": 7.999943087739808e-06,
"loss": -0.0,
"step": 232,
"step_time": 12.437156906999007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09780422016046941,
"epoch": 0.00466,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00331493909470737,
"kl": 0.29221273493021727,
"learning_rate": 7.999942505522845e-06,
"loss": 0.0,
"num_tokens": 12202392.0,
"reward": 2.31793212890625,
"reward_std": 0.4711916446685791,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9865642786026001,
"rewards/probe_shaping_dominance/std": 0.07600414007902145,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.44675713777542114,
"rewards/rollout_reward_func/std": 0.27934086322784424,
"sampling/importance_sampling_ratio/max": 1.2045822143554688,
"sampling/importance_sampling_ratio/mean": 0.9702666997909546,
"sampling/importance_sampling_ratio/min": 0.5390675067901611,
"sampling/sampling_logp_difference/max": 0.6179147958755493,
"sampling/sampling_logp_difference/mean": 0.02464653179049492,
"step": 233,
"step_time": 27.07101158400019
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.10010742908343673,
"epoch": 0.00468,
"grad_norm": 0.00394394900649786,
"kl": 0.28515962581150234,
"learning_rate": 7.999941920342986e-06,
"loss": 0.0,
"step": 234,
"step_time": 11.908877233997373
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09357268398161978,
"epoch": 0.0047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003001472679898143,
"kl": 0.4120303535989933,
"learning_rate": 7.999941332200228e-06,
"loss": 0.0,
"num_tokens": 12307473.0,
"reward": 2.356600761413574,
"reward_std": 0.39092886447906494,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9900838732719421,
"rewards/probe_shaping_dominance/std": 0.05609414726495743,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.47410792112350464,
"rewards/rollout_reward_func/std": 0.2651825547218323,
"sampling/importance_sampling_ratio/max": 1.2125083208084106,
"sampling/importance_sampling_ratio/mean": 0.9483182430267334,
"sampling/importance_sampling_ratio/min": 0.5642846822738647,
"sampling/sampling_logp_difference/max": 0.5796399116516113,
"sampling/sampling_logp_difference/mean": 0.029487669467926025,
"step": 235,
"step_time": 27.473293748998913
},
{
"clip_ratio/high_max": 0.06666666828095913,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.08885149616980925,
"epoch": 0.00472,
"grad_norm": 0.004106747917830944,
"kl": 0.39987785345859805,
"learning_rate": 7.999940741094573e-06,
"loss": 0.0,
"step": 236,
"step_time": 11.607714889999443
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0416666679084301,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0790116679854691,
"epoch": 0.00474,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0017510356847196817,
"kl": 0.49183082331728656,
"learning_rate": 7.999940147026021e-06,
"loss": 0.0,
"num_tokens": 12410261.0,
"reward": 2.362030029296875,
"reward_std": 0.48628348112106323,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9550326466560364,
"rewards/probe_shaping_dominance/std": 0.1423776000738144,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.464877724647522,
"rewards/rollout_reward_func/std": 0.2927810847759247,
"sampling/importance_sampling_ratio/max": 1.2767555713653564,
"sampling/importance_sampling_ratio/mean": 1.0007102489471436,
"sampling/importance_sampling_ratio/min": 0.5674677491188049,
"sampling/sampling_logp_difference/max": 0.564541220664978,
"sampling/sampling_logp_difference/mean": 0.017719101160764694,
"step": 237,
"step_time": 26.277223889999732
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.02083333395421505,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 0.08091688924469054,
"epoch": 0.00476,
"grad_norm": 0.0037676175124943256,
"kl": 0.4987390860915184,
"learning_rate": 7.999939549994574e-06,
"loss": 0.0,
"step": 238,
"step_time": 11.42589379400033
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08572797977831215,
"epoch": 0.00478,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0028349068015813828,
"kl": 0.29074460588162765,
"learning_rate": 7.99993895000023e-06,
"loss": -0.0001,
"num_tokens": 12515046.0,
"reward": 2.3852663040161133,
"reward_std": 0.48509836196899414,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9448926448822021,
"rewards/probe_shaping_dominance/std": 0.15274296700954437,
"rewards/probe_terminal_raw/mean": 0.05525914579629898,
"rewards/probe_terminal_raw/std": 0.15285103023052216,
"rewards/rollout_reward_func/mean": -0.43988555669784546,
"rewards/rollout_reward_func/std": 0.28072717785835266,
"sampling/importance_sampling_ratio/max": 1.2809064388275146,
"sampling/importance_sampling_ratio/mean": 0.9681559801101685,
"sampling/importance_sampling_ratio/min": 0.417494535446167,
"sampling/sampling_logp_difference/max": 0.8734843134880066,
"sampling/sampling_logp_difference/mean": 0.02679057978093624,
"step": 239,
"step_time": 27.850705083998037
},
{
"clip_ratio/high_max": 0.05208333395421505,
"clip_ratio/high_mean": 0.026041666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041666977107525,
"entropy": 0.09145444841124117,
"epoch": 0.0048,
"grad_norm": 0.003533316310495138,
"kl": 0.276357589289546,
"learning_rate": 7.999938347042993e-06,
"loss": -0.0001,
"step": 240,
"step_time": 11.650785684000766
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05321495997486636,
"epoch": 0.00482,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002123113488778472,
"kl": 0.1996255109550784,
"learning_rate": 7.999937741122862e-06,
"loss": 0.0,
"num_tokens": 12618608.0,
"reward": 2.31355619430542,
"reward_std": 0.3297788202762604,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5426939129829407,
"rewards/rollout_reward_func/std": 0.22457517683506012,
"sampling/importance_sampling_ratio/max": 1.1050293445587158,
"sampling/importance_sampling_ratio/mean": 1.0058460235595703,
"sampling/importance_sampling_ratio/min": 0.9022819995880127,
"sampling/sampling_logp_difference/max": 0.10648787021636963,
"sampling/sampling_logp_difference/mean": 0.005735831335186958,
"step": 241,
"step_time": 26.73763404300007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.05486724083311856,
"epoch": 0.00484,
"grad_norm": 0.003093272214755416,
"kl": 0.1941228064047955,
"learning_rate": 7.999937132239836e-06,
"loss": 0.0,
"step": 242,
"step_time": 11.670754389999274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07577884336933494,
"epoch": 0.00486,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0036162908654659986,
"kl": 0.4399729967590247,
"learning_rate": 7.999936520393918e-06,
"loss": 0.0,
"num_tokens": 12726447.0,
"reward": 2.3645379543304443,
"reward_std": 0.41120022535324097,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9753913879394531,
"rewards/probe_shaping_dominance/std": 0.09771986305713654,
"rewards/probe_terminal_raw/mean": 0.0260416679084301,
"rewards/probe_terminal_raw/std": 0.1046360433101654,
"rewards/rollout_reward_func/mean": -0.4618951678276062,
"rewards/rollout_reward_func/std": 0.1977241188287735,
"sampling/importance_sampling_ratio/max": 1.1149406433105469,
"sampling/importance_sampling_ratio/mean": 0.9780128002166748,
"sampling/importance_sampling_ratio/min": 0.7354345321655273,
"sampling/sampling_logp_difference/max": 0.18633489310741425,
"sampling/sampling_logp_difference/mean": 0.013524588197469711,
"step": 243,
"step_time": 27.977090622001015
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.07133703003637493,
"epoch": 0.00488,
"grad_norm": 0.002898427424952388,
"kl": 0.44227540418796707,
"learning_rate": 7.999935905585108e-06,
"loss": 0.0,
"step": 244,
"step_time": 11.75723793999805
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0315001527142158,
"epoch": 0.0049,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.001392417005263269,
"kl": 0.23886053822934628,
"learning_rate": 7.999935287813407e-06,
"loss": -0.0,
"num_tokens": 12827575.0,
"reward": 2.4073498249053955,
"reward_std": 0.42101356387138367,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9554626941680908,
"rewards/probe_shaping_dominance/std": 0.14310474693775177,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.4199880063533783,
"rewards/rollout_reward_func/std": 0.2148957997560501,
"sampling/importance_sampling_ratio/max": 1.0394365787506104,
"sampling/importance_sampling_ratio/mean": 0.995591402053833,
"sampling/importance_sampling_ratio/min": 0.8603565096855164,
"sampling/sampling_logp_difference/max": 0.1303640604019165,
"sampling/sampling_logp_difference/mean": 0.004159946460276842,
"step": 245,
"step_time": 26.077412141000423
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.032327667491699685,
"epoch": 0.00492,
"grad_norm": 0.0010727684712037444,
"kl": 0.23855953469561797,
"learning_rate": 7.999934667078813e-06,
"loss": -0.0,
"step": 246,
"step_time": 11.513740063000114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0816163292620331,
"epoch": 0.00494,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0027896249666810036,
"kl": 0.4679242782876827,
"learning_rate": 7.999934043381328e-06,
"loss": 0.0,
"num_tokens": 12935730.0,
"reward": 2.46283221244812,
"reward_std": 0.36876291036605835,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9373108148574829,
"rewards/probe_shaping_dominance/std": 0.1728522628545761,
"rewards/probe_terminal_raw/mean": 0.0625,
"rewards/probe_terminal_raw/std": 0.16800537705421448,
"rewards/rollout_reward_func/mean": -0.3932287096977234,
"rewards/rollout_reward_func/std": 0.24200834333896637,
"sampling/importance_sampling_ratio/max": 1.2427064180374146,
"sampling/importance_sampling_ratio/mean": 1.0063412189483643,
"sampling/importance_sampling_ratio/min": 0.8085158467292786,
"sampling/sampling_logp_difference/max": 0.21965795755386353,
"sampling/sampling_logp_difference/mean": 0.01280665211379528,
"step": 247,
"step_time": 28.041720137000084
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.08206989825703204,
"epoch": 0.00496,
"grad_norm": 0.00293480372056365,
"kl": 0.46830739825963974,
"learning_rate": 7.999933416720957e-06,
"loss": 0.0,
"step": 248,
"step_time": 11.713867525000751
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06533269377541728,
"epoch": 0.00498,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003347411984577775,
"kl": 0.36843465792230745,
"learning_rate": 7.999932787097692e-06,
"loss": 0.0001,
"num_tokens": 13041381.0,
"reward": 2.382171630859375,
"reward_std": 0.4231238067150116,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9549021124839783,
"rewards/probe_shaping_dominance/std": 0.14646287262439728,
"rewards/probe_terminal_raw/mean": 0.04255589470267296,
"rewards/probe_terminal_raw/std": 0.13594815135002136,
"rewards/rollout_reward_func/mean": -0.50278639793396,
"rewards/rollout_reward_func/std": 0.27676716446876526,
"sampling/importance_sampling_ratio/max": 1.3422638177871704,
"sampling/importance_sampling_ratio/mean": 0.9941832423210144,
"sampling/importance_sampling_ratio/min": 0.6115661263465881,
"sampling/sampling_logp_difference/max": 0.4917324185371399,
"sampling/sampling_logp_difference/mean": 0.018511097878217697,
"step": 249,
"step_time": 26.64282015000026
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07043309864820912,
"epoch": 0.005,
"grad_norm": 0.0035562312696129084,
"kl": 0.359963540629451,
"learning_rate": 7.999932154511542e-06,
"loss": 0.0,
"step": 250,
"step_time": 11.727345789000537
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08174855704419315,
"epoch": 0.00502,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003543607424944639,
"kl": 0.5413316028789268,
"learning_rate": 7.999931518962502e-06,
"loss": 0.0,
"num_tokens": 13146021.0,
"reward": 2.4559497833251953,
"reward_std": 0.3885264992713928,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9621438384056091,
"rewards/probe_shaping_dominance/std": 0.1238301619887352,
"rewards/probe_terminal_raw/mean": 0.03963414579629898,
"rewards/probe_terminal_raw/std": 0.12972840666770935,
"rewards/rollout_reward_func/mean": -0.40207818150520325,
"rewards/rollout_reward_func/std": 0.2555524408817291,
"sampling/importance_sampling_ratio/max": 1.1064826250076294,
"sampling/importance_sampling_ratio/mean": 0.954660177230835,
"sampling/importance_sampling_ratio/min": 0.41962218284606934,
"sampling/sampling_logp_difference/max": 0.7979011535644531,
"sampling/sampling_logp_difference/mean": 0.023729108273983,
"step": 251,
"step_time": 27.992850227999952
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07990033202804625,
"epoch": 0.00504,
"grad_norm": 0.003231135895475745,
"kl": 0.524783481414488,
"learning_rate": 7.999930880450575e-06,
"loss": 0.0,
"step": 252,
"step_time": 11.643585757999972
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.07902092937729321,
"epoch": 0.00506,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006585233379155397,
"kl": 0.37969694038247326,
"learning_rate": 7.99993023897576e-06,
"loss": 0.0,
"num_tokens": 13246298.0,
"reward": 2.4005722999572754,
"reward_std": 0.3679780662059784,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.974242091178894,
"rewards/probe_shaping_dominance/std": 0.10219167917966843,
"rewards/probe_terminal_raw/mean": 0.026549797505140305,
"rewards/probe_terminal_raw/std": 0.10620416700839996,
"rewards/rollout_reward_func/mean": -0.42521971464157104,
"rewards/rollout_reward_func/std": 0.21645236015319824,
"sampling/importance_sampling_ratio/max": 1.969668984413147,
"sampling/importance_sampling_ratio/mean": 1.0500105619430542,
"sampling/importance_sampling_ratio/min": 0.7689392566680908,
"sampling/sampling_logp_difference/max": 0.6780328750610352,
"sampling/sampling_logp_difference/mean": 0.02139047347009182,
"step": 253,
"step_time": 26.232431414999155
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.07854951097397134,
"epoch": 0.00508,
"grad_norm": 0.005968212615698576,
"kl": 0.3778405386647137,
"learning_rate": 7.99992959453806e-06,
"loss": 0.0,
"step": 254,
"step_time": 12.017544923999594
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04191483659815276,
"epoch": 0.0051,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004378916695713997,
"kl": 0.3174490866222186,
"learning_rate": 7.999928947137475e-06,
"loss": -0.0,
"num_tokens": 13351235.0,
"reward": 2.3821582794189453,
"reward_std": 0.4624309539794922,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9869383573532104,
"rewards/probe_shaping_dominance/std": 0.07388784736394882,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.41415512561798096,
"rewards/rollout_reward_func/std": 0.23873184621334076,
"sampling/importance_sampling_ratio/max": 1.257253885269165,
"sampling/importance_sampling_ratio/mean": 1.011238932609558,
"sampling/importance_sampling_ratio/min": 0.9685202836990356,
"sampling/sampling_logp_difference/max": 0.2289290428161621,
"sampling/sampling_logp_difference/mean": 0.005532183218747377,
"step": 255,
"step_time": 28.14665811800114
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.044428632616472896,
"epoch": 0.00512,
"grad_norm": 0.001523565617389977,
"kl": 0.3174588828405831,
"learning_rate": 7.999928296774006e-06,
"loss": -0.0,
"step": 256,
"step_time": 11.396023698001045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11287707928568125,
"epoch": 0.00514,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0049528395757079124,
"kl": 0.3751811153779272,
"learning_rate": 7.999927643447652e-06,
"loss": -0.0001,
"num_tokens": 13453732.0,
"reward": 2.2990427017211914,
"reward_std": 0.4729869067668915,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.8994619250297546,
"rewards/probe_shaping_dominance/std": 0.23433801531791687,
"rewards/probe_terminal_raw/mean": 0.08892276883125305,
"rewards/probe_terminal_raw/std": 0.1897670477628708,
"rewards/rollout_reward_func/mean": -0.451841801404953,
"rewards/rollout_reward_func/std": 0.3020572066307068,
"sampling/importance_sampling_ratio/max": 1.7735323905944824,
"sampling/importance_sampling_ratio/mean": 1.0311025381088257,
"sampling/importance_sampling_ratio/min": 0.48170769214630127,
"sampling/sampling_logp_difference/max": 0.5872056484222412,
"sampling/sampling_logp_difference/mean": 0.03187928348779678,
"step": 257,
"step_time": 27.428863920001277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.11026706825941801,
"epoch": 0.00516,
"grad_norm": 0.0036789914593100548,
"kl": 0.37549637774645817,
"learning_rate": 7.999926987158413e-06,
"loss": -0.0001,
"step": 258,
"step_time": 12.307902244997422
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09494227101095021,
"epoch": 0.00518,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004995207767933607,
"kl": 0.5894506504137098,
"learning_rate": 7.999926327906292e-06,
"loss": 0.0,
"num_tokens": 13559320.0,
"reward": 2.3814258575439453,
"reward_std": 0.36968865990638733,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9921875,
"rewards/probe_shaping_dominance/std": 0.04419417306780815,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.4826367497444153,
"rewards/rollout_reward_func/std": 0.231715127825737,
"sampling/importance_sampling_ratio/max": 1.2988759279251099,
"sampling/importance_sampling_ratio/mean": 0.989588737487793,
"sampling/importance_sampling_ratio/min": 0.3728586435317993,
"sampling/sampling_logp_difference/max": 0.9864900708198547,
"sampling/sampling_logp_difference/mean": 0.030208630487322807,
"step": 259,
"step_time": 28.526762178002173
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.09542630659416318,
"epoch": 0.0052,
"grad_norm": 0.009572784416377544,
"kl": 0.5865388629335939,
"learning_rate": 7.999925665691289e-06,
"loss": 0.0,
"step": 260,
"step_time": 11.52395996999985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.042740301505546086,
"epoch": 0.00522,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0034757580142468214,
"kl": 0.16234587341508444,
"learning_rate": 7.999925000513405e-06,
"loss": 0.0001,
"num_tokens": 13662277.0,
"reward": 2.3550405502319336,
"reward_std": 0.3789060413837433,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9763131737709045,
"rewards/probe_shaping_dominance/std": 0.09356633573770523,
"rewards/probe_terminal_raw/mean": 0.023119919002056122,
"rewards/probe_terminal_raw/std": 0.0910695344209671,
"rewards/rollout_reward_func/mean": -0.4693926274776459,
"rewards/rollout_reward_func/std": 0.27393800020217896,
"sampling/importance_sampling_ratio/max": 1.9132373332977295,
"sampling/importance_sampling_ratio/mean": 1.0334219932556152,
"sampling/importance_sampling_ratio/min": 0.8748363256454468,
"sampling/sampling_logp_difference/max": 0.648794412612915,
"sampling/sampling_logp_difference/mean": 0.015361637808382511,
"step": 261,
"step_time": 27.68740953500128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.043997991022479255,
"epoch": 0.00524,
"grad_norm": 0.0034889201633632183,
"kl": 0.1585660980490502,
"learning_rate": 7.999924332372639e-06,
"loss": 0.0,
"step": 262,
"step_time": 12.369422526000562
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.06730000481184106,
"epoch": 0.00526,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0019632691983133554,
"kl": 0.2906430190632818,
"learning_rate": 7.999923661268994e-06,
"loss": -0.0,
"num_tokens": 13768535.0,
"reward": 2.461604356765747,
"reward_std": 0.28569555282592773,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9790951013565063,
"rewards/probe_shaping_dominance/std": 0.08612176775932312,
"rewards/probe_terminal_raw/mean": 0.023373983800411224,
"rewards/probe_terminal_raw/std": 0.09738598018884659,
"rewards/rollout_reward_func/mean": -0.42836469411849976,
"rewards/rollout_reward_func/std": 0.21179892122745514,
"sampling/importance_sampling_ratio/max": 1.027362585067749,
"sampling/importance_sampling_ratio/mean": 0.911888837814331,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2283318042755127,
"sampling/sampling_logp_difference/mean": 0.04068940505385399,
"step": 263,
"step_time": 28.125288621000436
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.0668873688664462,
"epoch": 0.00528,
"grad_norm": 0.0020422539673745632,
"kl": 0.30596065653662663,
"learning_rate": 7.999922987202466e-06,
"loss": -0.0,
"step": 264,
"step_time": 11.507015873000455
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05058241146616638,
"epoch": 0.0053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0018712878227233887,
"kl": 0.39055716490838677,
"learning_rate": 7.999922310173063e-06,
"loss": -0.0,
"num_tokens": 13871840.0,
"reward": 2.4825406074523926,
"reward_std": 0.31064870953559875,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9752524495124817,
"rewards/probe_shaping_dominance/std": 0.09777678549289703,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.41146183013916016,
"rewards/rollout_reward_func/std": 0.21425116062164307,
"sampling/importance_sampling_ratio/max": 1.5599281787872314,
"sampling/importance_sampling_ratio/mean": 1.0341243743896484,
"sampling/importance_sampling_ratio/min": 0.8953186869621277,
"sampling/sampling_logp_difference/max": 0.4449194669723511,
"sampling/sampling_logp_difference/mean": 0.013410702347755432,
"step": 265,
"step_time": 27.96838706700055
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.04936367901973426,
"epoch": 0.00532,
"grad_norm": 0.006141372956335545,
"kl": 0.3867563092110231,
"learning_rate": 7.99992163018078e-06,
"loss": -0.0,
"step": 266,
"step_time": 12.308435358998395
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05740413888270268,
"epoch": 0.00534,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0028442663606256247,
"kl": 0.3010439347126521,
"learning_rate": 7.99992094722562e-06,
"loss": -0.0,
"num_tokens": 13974703.0,
"reward": 2.375330924987793,
"reward_std": 0.3971181809902191,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9330692291259766,
"rewards/probe_shaping_dominance/std": 0.15932095050811768,
"rewards/probe_terminal_raw/mean": 0.06885162740945816,
"rewards/probe_terminal_raw/std": 0.1653386801481247,
"rewards/rollout_reward_func/mean": -0.42034000158309937,
"rewards/rollout_reward_func/std": 0.19739177823066711,
"sampling/importance_sampling_ratio/max": 1.2114074230194092,
"sampling/importance_sampling_ratio/mean": 0.9802918434143066,
"sampling/importance_sampling_ratio/min": 0.3451912999153137,
"sampling/sampling_logp_difference/max": 1.0613338947296143,
"sampling/sampling_logp_difference/mean": 0.018370507284998894,
"step": 267,
"step_time": 27.86067632400045
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.055183965210744645,
"epoch": 0.00536,
"grad_norm": 0.0022630670573562384,
"kl": 0.344313826324651,
"learning_rate": 7.999920261307583e-06,
"loss": -0.0,
"step": 268,
"step_time": 11.746586444000059
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.08427908451994881,
"epoch": 0.00538,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0037011466920375824,
"kl": 0.412635525688529,
"learning_rate": 7.999919572426668e-06,
"loss": -0.0,
"num_tokens": 14078089.0,
"reward": 2.4167308807373047,
"reward_std": 0.32326242327690125,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9451819658279419,
"rewards/probe_shaping_dominance/std": 0.147642120718956,
"rewards/probe_terminal_raw/mean": 0.05843495950102806,
"rewards/probe_terminal_raw/std": 0.15837596356868744,
"rewards/rollout_reward_func/mean": -0.44313597679138184,
"rewards/rollout_reward_func/std": 0.24654169380664825,
"sampling/importance_sampling_ratio/max": 1.858984112739563,
"sampling/importance_sampling_ratio/mean": 0.9879124164581299,
"sampling/importance_sampling_ratio/min": 0.6056866645812988,
"sampling/sampling_logp_difference/max": 0.6200296878814697,
"sampling/sampling_logp_difference/mean": 0.027817152440547943,
"step": 269,
"step_time": 26.451382616000046
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.07716414582682773,
"epoch": 0.0054,
"grad_norm": 0.0030677285976707935,
"kl": 0.4153696422581561,
"learning_rate": 7.999918880582879e-06,
"loss": -0.0,
"step": 270,
"step_time": 12.785874016998605
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04053633386229194,
"epoch": 0.00542,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.001796143944375217,
"kl": 0.5015344847925007,
"learning_rate": 7.999918185776215e-06,
"loss": 0.0,
"num_tokens": 14181503.0,
"reward": 2.4646096229553223,
"reward_std": 0.2045918107032776,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9679263234138489,
"rewards/probe_shaping_dominance/std": 0.1049569845199585,
"rewards/probe_terminal_raw/mean": 0.0364583358168602,
"rewards/probe_terminal_raw/std": 0.11773227155208588,
"rewards/rollout_reward_func/mean": -0.4585248529911041,
"rewards/rollout_reward_func/std": 0.16162419319152832,
"sampling/importance_sampling_ratio/max": 1.4571605920791626,
"sampling/importance_sampling_ratio/mean": 1.0197436809539795,
"sampling/importance_sampling_ratio/min": 0.8846800923347473,
"sampling/sampling_logp_difference/max": 0.3764890432357788,
"sampling/sampling_logp_difference/mean": 0.012306122109293938,
"step": 271,
"step_time": 26.693239825001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.038741875116102165,
"epoch": 0.00544,
"grad_norm": 0.0020677302964031696,
"kl": 0.5029990994371474,
"learning_rate": 7.999917488006676e-06,
"loss": 0.0,
"step": 272,
"step_time": 11.444299719997616
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04818115712259896,
"epoch": 0.00546,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008343451656401157,
"kl": 0.7089566249400381,
"learning_rate": 7.999916787274264e-06,
"loss": 0.0001,
"num_tokens": 14287480.0,
"reward": 2.4599452018737793,
"reward_std": 0.38899266719818115,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9238950610160828,
"rewards/probe_shaping_dominance/std": 0.16443566977977753,
"rewards/probe_terminal_raw/mean": 0.08130080997943878,
"rewards/probe_terminal_raw/std": 0.17714287340641022,
"rewards/rollout_reward_func/mean": -0.3702506721019745,
"rewards/rollout_reward_func/std": 0.21257071197032928,
"sampling/importance_sampling_ratio/max": 2.423100471496582,
"sampling/importance_sampling_ratio/mean": 1.0725514888763428,
"sampling/importance_sampling_ratio/min": 0.8080363273620605,
"sampling/sampling_logp_difference/max": 0.8850466012954712,
"sampling/sampling_logp_difference/mean": 0.024975256994366646,
"step": 273,
"step_time": 28.09797250900101
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.04680645616099355,
"epoch": 0.00548,
"grad_norm": 0.003927062265574932,
"kl": 0.742738697305322,
"learning_rate": 7.99991608357898e-06,
"loss": 0.0001,
"step": 274,
"step_time": 11.650237371000003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04380835813935846,
"epoch": 0.0055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0025579470675438643,
"kl": 0.21995878049926887,
"learning_rate": 7.999915376920822e-06,
"loss": -0.0,
"num_tokens": 14387389.0,
"reward": 2.2633914947509766,
"reward_std": 0.42217421531677246,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9556913375854492,
"rewards/probe_shaping_dominance/std": 0.1223374605178833,
"rewards/probe_terminal_raw/mean": 0.0518292672932148,
"rewards/probe_terminal_raw/std": 0.14265993237495422,
"rewards/rollout_reward_func/mean": -0.5378788709640503,
"rewards/rollout_reward_func/std": 0.23384462296962738,
"sampling/importance_sampling_ratio/max": 1.084592580795288,
"sampling/importance_sampling_ratio/mean": 0.9922658205032349,
"sampling/importance_sampling_ratio/min": 0.7613502740859985,
"sampling/sampling_logp_difference/max": 0.2726619839668274,
"sampling/sampling_logp_difference/mean": 0.009103155694901943,
"step": 275,
"step_time": 26.459266137000668
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.03994250175310299,
"epoch": 0.00552,
"grad_norm": 0.0021381748374551535,
"kl": 0.2157795349397702,
"learning_rate": 7.999914667299794e-06,
"loss": -0.0,
"step": 276,
"step_time": 11.672075437000785
},
{
"clip_ratio/high_max": 0.05000000074505806,
"clip_ratio/high_mean": 0.02500000037252903,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037500000558793545,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09868528880178928,
"epoch": 0.00554,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0031571455765515566,
"kl": 0.4792258571833372,
"learning_rate": 7.999913954715895e-06,
"loss": 0.0,
"num_tokens": 14492025.0,
"reward": 2.2542710304260254,
"reward_std": 0.38688531517982483,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.5082289576530457,
"rewards/rollout_reward_func/std": 0.17395071685314178,
"sampling/importance_sampling_ratio/max": 1.9612770080566406,
"sampling/importance_sampling_ratio/mean": 1.0468454360961914,
"sampling/importance_sampling_ratio/min": 0.5976178646087646,
"sampling/sampling_logp_difference/max": 0.7003155946731567,
"sampling/sampling_logp_difference/mean": 0.032625701278448105,
"step": 277,
"step_time": 27.236174976000257
},
{
"clip_ratio/high_max": 0.05000000074505806,
"clip_ratio/high_mean": 0.02500000037252903,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037500000558793545,
"entropy": 0.09623363520950079,
"epoch": 0.00556,
"grad_norm": 0.0032991948537528515,
"kl": 0.4749853519606404,
"learning_rate": 7.999913239169126e-06,
"loss": 0.0,
"step": 278,
"step_time": 12.07052038799975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.8125,
"completions/mean_terminated_length": 2.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04121039004530758,
"epoch": 0.00558,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0032093473710119724,
"kl": 0.6897661700841127,
"learning_rate": 7.999912520659488e-06,
"loss": 0.0,
"num_tokens": 14593223.0,
"reward": 2.3469300270080566,
"reward_std": 0.5208548307418823,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.8125,
"rewards/probe_completion_length/std": 0.3965577781200409,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.95980304479599,
"rewards/probe_shaping_dominance/std": 0.12795211374759674,
"rewards/probe_terminal_raw/mean": 0.042174797505140305,
"rewards/probe_terminal_raw/std": 0.13503843545913696,
"rewards/rollout_reward_func/mean": -0.44879791140556335,
"rewards/rollout_reward_func/std": 0.2045743763446808,
"sampling/importance_sampling_ratio/max": 1.9838464260101318,
"sampling/importance_sampling_ratio/mean": 1.0156028270721436,
"sampling/importance_sampling_ratio/min": 0.1315358281135559,
"sampling/sampling_logp_difference/max": 2.028473377227783,
"sampling/sampling_logp_difference/mean": 0.03758270666003227,
"step": 279,
"step_time": 26.44161211500159
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.033333334140479565,
"entropy": 0.047012478462420404,
"epoch": 0.0056,
"grad_norm": 0.0013261314015835524,
"kl": 0.7127395562856691,
"learning_rate": 7.99991179918698e-06,
"loss": -0.0,
"step": 280,
"step_time": 11.634762280001269
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.012499196142016444,
"epoch": 0.00562,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0003787693567574024,
"kl": 0.669078703969717,
"learning_rate": 7.999911074751606e-06,
"loss": -0.0,
"num_tokens": 14693012.0,
"reward": 2.4939217567443848,
"reward_std": 0.381552517414093,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9186484813690186,
"rewards/probe_shaping_dominance/std": 0.195449098944664,
"rewards/probe_terminal_raw/mean": 0.078125,
"rewards/probe_terminal_raw/std": 0.18445101380348206,
"rewards/rollout_reward_func/mean": -0.3903515338897705,
"rewards/rollout_reward_func/std": 0.2618943452835083,
"sampling/importance_sampling_ratio/max": 1.0298659801483154,
"sampling/importance_sampling_ratio/mean": 0.9976564645767212,
"sampling/importance_sampling_ratio/min": 0.9420029520988464,
"sampling/sampling_logp_difference/max": 0.05974767729640007,
"sampling/sampling_logp_difference/mean": 0.0016555668553337455,
"step": 281,
"step_time": 26.723270941998635
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.012372259192488855,
"epoch": 0.00564,
"grad_norm": 0.0003435203689150512,
"kl": 0.6690934834768996,
"learning_rate": 7.999910347353363e-06,
"loss": -0.0,
"step": 282,
"step_time": 11.794659334002063
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.84375,
"completions/mean_terminated_length": 2.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.045614961185492575,
"epoch": 0.00566,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.003150342497974634,
"kl": 0.48013901670856285,
"learning_rate": 7.999909616992255e-06,
"loss": -0.0,
"num_tokens": 14799672.0,
"reward": 2.3399429321289062,
"reward_std": 0.4422038793563843,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.84375,
"rewards/probe_completion_length/std": 0.3689020276069641,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 1.0,
"rewards/probe_shaping_dominance/std": 0.0,
"rewards/probe_terminal_raw/mean": 0.0,
"rewards/probe_terminal_raw/std": 0.0,
"rewards/rollout_reward_func/mean": -0.45380693674087524,
"rewards/rollout_reward_func/std": 0.1835639625787735,
"sampling/importance_sampling_ratio/max": 1.2092225551605225,
"sampling/importance_sampling_ratio/mean": 0.9782531261444092,
"sampling/importance_sampling_ratio/min": 0.3157159686088562,
"sampling/sampling_logp_difference/max": 1.1528494358062744,
"sampling/sampling_logp_difference/mean": 0.019979460164904594,
"step": 283,
"step_time": 27.036351400000058
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.03985181718599051,
"epoch": 0.00568,
"grad_norm": 0.0033008423633873463,
"kl": 0.49970418894372415,
"learning_rate": 7.99990888366828e-06,
"loss": -0.0,
"step": 284,
"step_time": 11.668078124000203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.03972258236899506,
"epoch": 0.0057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.002630846342071891,
"kl": 0.3517824411392212,
"learning_rate": 7.99990814738144e-06,
"loss": -0.0,
"num_tokens": 14902831.0,
"reward": 2.4359757900238037,
"reward_std": 0.2911105751991272,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9871374368667603,
"rewards/probe_shaping_dominance/std": 0.07276186347007751,
"rewards/probe_terminal_raw/mean": 0.015625,
"rewards/probe_terminal_raw/std": 0.0883883461356163,
"rewards/rollout_reward_func/mean": -0.48553669452667236,
"rewards/rollout_reward_func/std": 0.2099909633398056,
"sampling/importance_sampling_ratio/max": 1.558259129524231,
"sampling/importance_sampling_ratio/mean": 1.021366834640503,
"sampling/importance_sampling_ratio/min": 0.757884681224823,
"sampling/sampling_logp_difference/max": 0.4435689449310303,
"sampling/sampling_logp_difference/mean": 0.011840267106890678,
"step": 285,
"step_time": 27.424052481000217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.040461032156599686,
"epoch": 0.00572,
"grad_norm": 0.002737229922786355,
"kl": 0.3537818659096956,
"learning_rate": 7.999907408131737e-06,
"loss": -0.0,
"step": 286,
"step_time": 12.126654321001297
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.96875,
"completions/mean_terminated_length": 2.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.020485240605921717,
"epoch": 0.00574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.000876868492923677,
"kl": 0.23688423214722576,
"learning_rate": 7.999906665919169e-06,
"loss": -0.0,
"num_tokens": 15005261.0,
"reward": 2.5098652839660645,
"reward_std": 0.30707597732543945,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.96875,
"rewards/probe_completion_length/std": 0.1767766922712326,
"rewards/probe_invalid_count/mean": 0.03125,
"rewards/probe_invalid_count/std": 0.1767766922712326,
"rewards/probe_shaping_dominance/mean": 0.9733736515045166,
"rewards/probe_shaping_dominance/std": 0.10744811594486237,
"rewards/probe_terminal_raw/mean": 0.0260416679084301,
"rewards/probe_terminal_raw/std": 0.1046360433101654,
"rewards/rollout_reward_func/mean": -0.4395501911640167,
"rewards/rollout_reward_func/std": 0.18828870356082916,
"sampling/importance_sampling_ratio/max": 1.0840176343917847,
"sampling/importance_sampling_ratio/mean": 1.0012118816375732,
"sampling/importance_sampling_ratio/min": 0.9655031561851501,
"sampling/sampling_logp_difference/max": 0.08256775140762329,
"sampling/sampling_logp_difference/mean": 0.00235398905351758,
"step": 287,
"step_time": 27.075085327001034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.022996263058303157,
"epoch": 0.00576,
"grad_norm": 0.0009354232461191714,
"kl": 0.23660576696175895,
"learning_rate": 7.99990592074374e-06,
"loss": -0.0,
"step": 288,
"step_time": 11.657212093999078
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.05316271091851377,
"epoch": 0.00578,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006305683869868517,
"kl": 0.2035164695232652,
"learning_rate": 7.999905172605446e-06,
"loss": -0.0001,
"num_tokens": 15107252.0,
"reward": 2.422664165496826,
"reward_std": 0.37807923555374146,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9658713340759277,
"rewards/probe_shaping_dominance/std": 0.10942408442497253,
"rewards/probe_terminal_raw/mean": 0.03315548598766327,
"rewards/probe_terminal_raw/std": 0.1095743477344513,
"rewards/rollout_reward_func/mean": -0.40136268734931946,
"rewards/rollout_reward_func/std": 0.2093636691570282,
"sampling/importance_sampling_ratio/max": 1.5805177688598633,
"sampling/importance_sampling_ratio/mean": 1.0220205783843994,
"sampling/importance_sampling_ratio/min": 0.7326148748397827,
"sampling/sampling_logp_difference/max": 0.4577510356903076,
"sampling/sampling_logp_difference/mean": 0.019495096057653427,
"step": 289,
"step_time": 26.987616914999307
},
{
"clip_ratio/high_max": 0.04583333432674408,
"clip_ratio/high_mean": 0.02291666716337204,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02291666716337204,
"entropy": 0.05469944020660478,
"epoch": 0.0058,
"grad_norm": 0.0032733359839767218,
"kl": 0.18666235760611016,
"learning_rate": 7.999904421504293e-06,
"loss": -0.0001,
"step": 290,
"step_time": 11.951281235001261
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.02291666716337204,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.035416667349636555,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.0591709428122158,
"epoch": 0.00582,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0026493356563150883,
"kl": 0.8575776647776365,
"learning_rate": 7.999903667440278e-06,
"loss": 0.0,
"num_tokens": 15208793.0,
"reward": 2.402831792831421,
"reward_std": 0.3910689353942871,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9471915364265442,
"rewards/probe_shaping_dominance/std": 0.1431892067193985,
"rewards/probe_terminal_raw/mean": 0.0520833358168602,
"rewards/probe_terminal_raw/std": 0.1433027982711792,
"rewards/rollout_reward_func/mean": -0.42144304513931274,
"rewards/rollout_reward_func/std": 0.21596133708953857,
"sampling/importance_sampling_ratio/max": 1.0310035943984985,
"sampling/importance_sampling_ratio/mean": 0.9701290130615234,
"sampling/importance_sampling_ratio/min": 0.5706773400306702,
"sampling/sampling_logp_difference/max": 0.5609317421913147,
"sampling/sampling_logp_difference/mean": 0.014490557834506035,
"step": 291,
"step_time": 27.075510762000704
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02500000037252903,
"entropy": 0.05919087287338698,
"epoch": 0.00584,
"grad_norm": 0.0026768911629915237,
"kl": 0.8454538804168692,
"learning_rate": 7.999902910413404e-06,
"loss": 0.0,
"step": 292,
"step_time": 12.032383580999522
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04045550918681329,
"epoch": 0.00586,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00724328076466918,
"kl": 0.80053227301687,
"learning_rate": 7.999902150423671e-06,
"loss": -0.0001,
"num_tokens": 15311233.0,
"reward": 2.4362893104553223,
"reward_std": 0.426661878824234,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9734768271446228,
"rewards/probe_shaping_dominance/std": 0.10647083818912506,
"rewards/probe_terminal_raw/mean": 0.026295732706785202,
"rewards/probe_terminal_raw/std": 0.10541322082281113,
"rewards/rollout_reward_func/mean": -0.38848331570625305,
"rewards/rollout_reward_func/std": 0.2122591733932495,
"sampling/importance_sampling_ratio/max": 1.8292688131332397,
"sampling/importance_sampling_ratio/mean": 1.001596212387085,
"sampling/importance_sampling_ratio/min": 0.44141146540641785,
"sampling/sampling_logp_difference/max": 0.8177778720855713,
"sampling/sampling_logp_difference/mean": 0.025196455419063568,
"step": 293,
"step_time": 27.23413759199957
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02083333395421505,
"entropy": 0.04230553897491518,
"epoch": 0.00588,
"grad_norm": 0.005148016382008791,
"kl": 0.6622665030881763,
"learning_rate": 7.999901387471079e-06,
"loss": -0.0001,
"step": 294,
"step_time": 11.526401772997815
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.875,
"completions/mean_terminated_length": 2.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.03366142028335162,
"epoch": 0.0059,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005694986321032047,
"kl": 0.39196249035501296,
"learning_rate": 7.99990062155563e-06,
"loss": 0.0,
"num_tokens": 15421347.0,
"reward": 2.391371726989746,
"reward_std": 0.43072906136512756,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.875,
"rewards/probe_completion_length/std": 0.33601075410842896,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9607213735580444,
"rewards/probe_shaping_dominance/std": 0.12690994143486023,
"rewards/probe_terminal_raw/mean": 0.046875,
"rewards/probe_terminal_raw/std": 0.1480722874403,
"rewards/rollout_reward_func/mean": -0.4412246346473694,
"rewards/rollout_reward_func/std": 0.21457210183143616,
"sampling/importance_sampling_ratio/max": 1.2205545902252197,
"sampling/importance_sampling_ratio/mean": 0.9986574053764343,
"sampling/importance_sampling_ratio/min": 0.7592641115188599,
"sampling/sampling_logp_difference/max": 0.2809281349182129,
"sampling/sampling_logp_difference/mean": 0.008172519505023956,
"step": 295,
"step_time": 26.643844086999707
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.030476719188300194,
"epoch": 0.00592,
"grad_norm": 0.005326179787516594,
"kl": 0.39566947892306814,
"learning_rate": 7.999899852677322e-06,
"loss": 0.0,
"step": 296,
"step_time": 12.454534126997714
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.90625,
"completions/mean_terminated_length": 2.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.04483710537169827,
"epoch": 0.00594,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0021372437477111816,
"kl": 0.4166623194081088,
"learning_rate": 7.99989908083616e-06,
"loss": 0.0,
"num_tokens": 15523076.0,
"reward": 2.4664759635925293,
"reward_std": 0.4568862318992615,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.90625,
"rewards/probe_completion_length/std": 0.2961445748806,
"rewards/probe_invalid_count/mean": 0.0625,
"rewards/probe_invalid_count/std": 0.24593468010425568,
"rewards/probe_shaping_dominance/mean": 0.9591568112373352,
"rewards/probe_shaping_dominance/std": 0.11802849918603897,
"rewards/probe_terminal_raw/mean": 0.04509654641151428,
"rewards/probe_terminal_raw/std": 0.1317683309316635,
"rewards/rollout_reward_func/mean": -0.4565274119377136,
"rewards/rollout_reward_func/std": 0.26263633370399475,
"sampling/importance_sampling_ratio/max": 1.3225888013839722,
"sampling/importance_sampling_ratio/mean": 1.0174564123153687,
"sampling/importance_sampling_ratio/min": 0.8623110055923462,
"sampling/sampling_logp_difference/max": 0.27959030866622925,
"sampling/sampling_logp_difference/mean": 0.008479975163936615,
"step": 297,
"step_time": 26.703501694998522
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.04334457405639114,
"epoch": 0.00596,
"grad_norm": 0.004324762150645256,
"kl": 0.41364979138597846,
"learning_rate": 7.999898306032144e-06,
"loss": 0.0,
"step": 298,
"step_time": 11.624797897999088
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3.0,
"completions/max_terminated_length": 3.0,
"completions/mean_length": 2.9375,
"completions/mean_terminated_length": 2.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.039461553949308836,
"epoch": 0.00598,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0021312020253390074,
"kl": 0.4295559982638224,
"learning_rate": 7.999897528265272e-06,
"loss": 0.0,
"num_tokens": 15625505.0,
"reward": 2.4885663986206055,
"reward_std": 0.32209959626197815,
"rewards/format_guard/mean": -0.05000000074505806,
"rewards/format_guard/std": 0.0,
"rewards/probe_completion_length/mean": 1.9375,
"rewards/probe_completion_length/std": 0.24593468010425568,
"rewards/probe_invalid_count/mean": 0.0,
"rewards/probe_invalid_count/std": 0.0,
"rewards/probe_shaping_dominance/mean": 0.9723982810974121,
"rewards/probe_shaping_dominance/std": 0.10864228010177612,
"rewards/probe_terminal_raw/mean": 0.03125,
"rewards/probe_terminal_raw/std": 0.12296734005212784,
"rewards/rollout_reward_func/mean": -0.40258198976516724,
"rewards/rollout_reward_func/std": 0.1721213161945343,
"sampling/importance_sampling_ratio/max": 1.015625,
"sampling/importance_sampling_ratio/mean": 0.9557619690895081,
"sampling/importance_sampling_ratio/min": 0.3387709856033325,
"sampling/sampling_logp_difference/max": 1.0839133262634277,
"sampling/sampling_logp_difference/mean": 0.021332627162337303,
"step": 299,
"step_time": 26.165180010000768
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 0.04107913846030442,
"epoch": 0.006,
"grad_norm": 0.0022343825548887253,
"kl": 0.42880946584045887,
"learning_rate": 7.999896747535546e-06,
"loss": 0.0,
"step": 300,
"step_time": 12.217135184999279
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 15625505,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}