{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.144, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2937.041748046875, "completions/mean_terminated_length": 1580.51611328125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13600684702396393, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0728, "num_tokens": 298360.0, "reward": 0.3645833432674408, "reward_std": 0.26059263944625854, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 3029.072998046875, "completions/mean_terminated_length": 1747.0, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.09845250844955444, "kl": 0.0, "learning_rate": 2e-09, "loss": 0.0361, "num_tokens": 605249.0, "reward": 0.3333333432674408, "reward_std": 0.2350771129131317, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 2942.73974609375, "completions/mean_terminated_length": 1303.9630126953125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07778529822826385, "kl": 0.00041421254475911457, "learning_rate": 4e-09, "loss": 0.0024, "num_tokens": 903910.0, "reward": 0.3020833134651184, "reward_std": 0.0852636992931366, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 2905.010498046875, "completions/mean_terminated_length": 1336.310302734375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0983043983578682, "kl": 0.00046443939208984375, "learning_rate": 5.999999999999999e-09, "loss": 0.0618, "num_tokens": 1198265.0, "reward": 0.3333333432674408, "reward_std": 0.2163209319114685, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2463.61474609375, "completions/mean_terminated_length": 1245.8043212890625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.11297014355659485, "kl": 0.0006132125854492188, "learning_rate": 8e-09, "loss": 0.0303, "num_tokens": 1450600.0, "reward": 0.4895833432674408, "reward_std": 0.19560697674751282, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 2930.34375, "completions/mean_terminated_length": 1492.300048828125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.0906430333852768, "kl": 0.000526587168375651, "learning_rate": 1e-08, "loss": 0.0469, "num_tokens": 1749601.0, "reward": 0.3229166865348816, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 3116.46875, "completions/mean_terminated_length": 1788.679931640625, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.016, "frac_reward_zero_std": 0.5, "grad_norm": 0.11311256140470505, "kl": 0.0005582173665364584, "learning_rate": 1.1999999999999998e-08, "loss": 0.0692, "num_tokens": 2064634.0, "reward": 0.2916666865348816, "reward_std": 0.21764282882213593, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 2796.9375, "completions/mean_terminated_length": 1741.121826171875, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.5, "grad_norm": 0.11352546513080597, "kl": 0.0006198883056640625, "learning_rate": 1.4000000000000001e-08, "loss": 0.0384, "num_tokens": 2348632.0, "reward": 0.4479166865348816, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 2742.21875, "completions/mean_terminated_length": 1275.1142578125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10035016387701035, "kl": 0.000530242919921875, "learning_rate": 1.6e-08, "loss": 0.017, "num_tokens": 2628511.0, "reward": 0.3854166865348816, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2841.84375, "completions/mean_terminated_length": 1357.53125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.10540636628866196, "kl": 0.0006135304768880209, "learning_rate": 1.8e-08, "loss": 0.0248, "num_tokens": 2917540.0, "reward": 0.3541666865348816, "reward_std": 0.17924454808235168, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 2882.89599609375, "completions/mean_terminated_length": 1340.466796875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.625, "grad_norm": 0.1847194880247116, "kl": 0.0006192525227864584, "learning_rate": 2e-08, "loss": 0.0474, "num_tokens": 3209934.0, "reward": 0.3333333432674408, "reward_std": 0.1618102639913559, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3319.0, "completions/mean_length": 2125.02099609375, "completions/mean_terminated_length": 990.25927734375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.625, "grad_norm": 0.1230333223938942, "kl": 0.0005963643391927084, "learning_rate": 2.2e-08, "loss": 0.0569, "num_tokens": 3428624.0, "reward": 0.5833333730697632, "reward_std": 0.16661180555820465, "rewards/format_reward/mean": 0.5833333134651184, "rewards/format_reward/std": 0.4955945909023285, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 3292.947998046875, "completions/mean_terminated_length": 2253.476318359375, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10567952692508698, "kl": 0.0005556742350260416, "learning_rate": 2.3999999999999997e-08, "loss": 0.0282, "num_tokens": 3761127.0, "reward": 0.2708333432674408, "reward_std": 0.2586348354816437, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 3178.166748046875, "completions/mean_terminated_length": 1960.666748046875, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.032, "frac_reward_zero_std": 0.5, "grad_norm": 0.09415514767169952, "kl": 0.000602563222249349, "learning_rate": 2.6e-08, "loss": 0.0457, "num_tokens": 4083169.0, "reward": 0.28125, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 3266.09375, "completions/mean_terminated_length": 2130.71435546875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.07530064880847931, "kl": 0.0005038579305013021, "learning_rate": 2.8000000000000003e-08, "loss": 0.0049, "num_tokens": 4414018.0, "reward": 0.2916666865348816, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2674.17724609375, "completions/mean_terminated_length": 1643.04443359375, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.375, "grad_norm": 0.12493874132633209, "kl": 0.0005288124084472656, "learning_rate": 3e-08, "loss": 0.0587, "num_tokens": 4687161.0, "reward": 0.5, "reward_std": 0.29090970754623413, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 3098.822998046875, "completions/mean_terminated_length": 1720.919921875, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09812473505735397, "kl": 0.0006046295166015625, "learning_rate": 3.2e-08, "loss": 0.0221, "num_tokens": 5001868.0, "reward": 0.2916666865348816, "reward_std": 0.2163209617137909, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2694.48974609375, "completions/mean_terminated_length": 1550.8333740234375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.11484291404485703, "kl": 0.0005998611450195312, "learning_rate": 3.4e-08, "loss": 0.077, "num_tokens": 5276751.0, "reward": 0.4895833432674408, "reward_std": 0.25055360794067383, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 2670.666748046875, "completions/mean_terminated_length": 1392.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09073452651500702, "kl": 0.0005601247151692709, "learning_rate": 3.6e-08, "loss": 0.0491, "num_tokens": 5549443.0, "reward": 0.4166666865348816, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3475.0, "completions/mean_length": 2887.42724609375, "completions/mean_terminated_length": 1354.966796875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.09556788951158524, "kl": 0.0005938212076822916, "learning_rate": 3.7999999999999996e-08, "loss": 0.0749, "num_tokens": 5842338.0, "reward": 0.3125, "reward_std": 0.17532894015312195, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3473.0, "completions/mean_length": 2903.916748046875, "completions/mean_terminated_length": 1663.7647705078125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.048, "frac_reward_zero_std": 0.5, "grad_norm": 0.12257043272256851, "kl": 0.0005583763122558594, "learning_rate": 4e-08, "loss": 0.0543, "num_tokens": 6136450.0, "reward": 0.3854166865348816, "reward_std": 0.24575206637382507, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 3011.75, "completions/mean_terminated_length": 1811.8709716796875, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10206293314695358, "kl": 0.0005245208740234375, "learning_rate": 4.2e-08, "loss": 0.0345, "num_tokens": 6442168.0, "reward": 0.375, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 3026.73974609375, "completions/mean_terminated_length": 1962.87890625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.625, "grad_norm": 0.08821089565753937, "kl": 0.0005661646525065104, "learning_rate": 4.4e-08, "loss": 0.0006, "num_tokens": 6749481.0, "reward": 0.3645833432674408, "reward_std": 0.18208830058574677, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 2724.0, "completions/mean_terminated_length": 1749.3333740234375, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.25, "grad_norm": 0.1669473648071289, "kl": 0.0006227493286132812, "learning_rate": 4.6e-08, "loss": 0.0681, "num_tokens": 7026327.0, "reward": 0.53125, "reward_std": 0.3574172258377075, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 3240.1875, "completions/mean_terminated_length": 2263.760009765625, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.10814571380615234, "kl": 0.0005668004353841146, "learning_rate": 4.799999999999999e-08, "loss": 0.0206, "num_tokens": 7352895.0, "reward": 0.3125, "reward_std": 0.24468021094799042, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2478.822998046875, "completions/mean_terminated_length": 931.5750122070312, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.05942857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08812437951564789, "kl": 0.0005440711975097656, "learning_rate": 5e-08, "loss": 0.039, "num_tokens": 7606390.0, "reward": 0.4375, "reward_std": 0.1430540829896927, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 3112.64599609375, "completions/mean_terminated_length": 1698.5833740234375, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 0.061714285714285715, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10746309161186218, "kl": 0.0005650520324707031, "learning_rate": 5.2e-08, "loss": 0.0464, "num_tokens": 7921542.0, "reward": 0.3125000298023224, "reward_std": 0.280870646238327, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 3297.375, "completions/mean_terminated_length": 2055.333251953125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.064, "frac_reward_zero_std": 0.5, "grad_norm": 0.10604382306337357, "kl": 0.0006319681803385416, "learning_rate": 5.4e-08, "loss": 0.056, "num_tokens": 8254734.0, "reward": 0.2604166865348816, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.2604166567325592, "rewards/format_reward/std": 0.4411657154560089, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 3198.21875, "completions/mean_terminated_length": 2261.321533203125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.06628571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08207737654447556, "kl": 0.000556786855061849, "learning_rate": 5.6000000000000005e-08, "loss": 0.0236, "num_tokens": 8578473.0, "reward": 0.34375, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3287.0, "completions/mean_length": 2793.53125, "completions/mean_terminated_length": 1415.857177734375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.09903497248888016, "kl": 0.00063323974609375, "learning_rate": 5.7999999999999997e-08, "loss": 0.0259, "num_tokens": 8862576.0, "reward": 0.40625, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3232.0, "completions/mean_length": 2894.52099609375, "completions/mean_terminated_length": 1377.666748046875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.07085714285714285, "frac_reward_zero_std": 0.375, "grad_norm": 0.11826644837856293, "kl": 0.0004975001017252604, "learning_rate": 6e-08, "loss": 0.0843, "num_tokens": 9156134.0, "reward": 0.375, "reward_std": 0.29090970754623413, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2436.1875, "completions/mean_terminated_length": 1423.411865234375, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.07314285714285715, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12378033995628357, "kl": 0.0005936622619628906, "learning_rate": 6.2e-08, "loss": 0.035, "num_tokens": 9406298.0, "reward": 0.5833333730697632, "reward_std": 0.2163209319114685, "rewards/format_reward/mean": 0.5833333134651184, "rewards/format_reward/std": 0.4955945909023285, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 2883.979248046875, "completions/mean_terminated_length": 1860.871826171875, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.07542857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09643115103244781, "kl": 0.0005831718444824219, "learning_rate": 6.4e-08, "loss": 0.0298, "num_tokens": 9699252.0, "reward": 0.4479166865348816, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3394.0, "completions/mean_length": 3245.291748046875, "completions/mean_terminated_length": 1872.631591796875, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.07771428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.1022980734705925, "kl": 0.0005474090576171875, "learning_rate": 6.6e-08, "loss": 0.0465, "num_tokens": 10027630.0, "reward": 0.2604166865348816, "reward_std": 0.2466379851102829, "rewards/format_reward/mean": 0.2604166567325592, "rewards/format_reward/std": 0.4411657154560089, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3362.0, "completions/mean_length": 2790.885498046875, "completions/mean_terminated_length": 1526.189208984375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.08, "frac_reward_zero_std": 0.625, "grad_norm": 0.09115320444107056, "kl": 0.0005726814270019531, "learning_rate": 6.8e-08, "loss": 0.0514, "num_tokens": 10311023.0, "reward": 0.3958333432674408, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 3176.09375, "completions/mean_terminated_length": 2017.6400146484375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.08228571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.09013529121875763, "kl": 0.0005197525024414062, "learning_rate": 6.999999999999999e-08, "loss": 0.0358, "num_tokens": 10632122.0, "reward": 0.2812500298023224, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 3280.61474609375, "completions/mean_terminated_length": 1870.7647705078125, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.08457142857142858, "frac_reward_zero_std": 0.5, "grad_norm": 0.09990935772657394, "kl": 0.0005869865417480469, "learning_rate": 7.2e-08, "loss": 0.0076, "num_tokens": 10963327.0, "reward": 0.2916666865348816, "reward_std": 0.243794247508049, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 3143.30224609375, "completions/mean_terminated_length": 1660.95458984375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.08685714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1052294448018074, "kl": 0.0005707740783691406, "learning_rate": 7.4e-08, "loss": 0.0365, "num_tokens": 11280684.0, "reward": 0.2604166865348816, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.2604166567325592, "rewards/format_reward/std": 0.4411657154560089, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 2676.33349609375, "completions/mean_terminated_length": 861.0, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.08914285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 0.11297257989645004, "kl": 0.0005629857381184896, "learning_rate": 7.599999999999999e-08, "loss": 0.0573, "num_tokens": 11553614.0, "reward": 0.34375, "reward_std": 0.10882141441106796, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2672.42724609375, "completions/mean_terminated_length": 1449.5853271484375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1159047782421112, "kl": 0.0005908012390136719, "learning_rate": 7.8e-08, "loss": 0.0235, "num_tokens": 11826007.0, "reward": 0.4375, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3220.0, "completions/mean_length": 3086.229248046875, "completions/mean_terminated_length": 1592.916748046875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.09371428571428571, "frac_reward_zero_std": 0.625, "grad_norm": 0.08972278982400894, "kl": 0.0006130536397298177, "learning_rate": 8e-08, "loss": 0.0427, "num_tokens": 12138461.0, "reward": 0.2916666865348816, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569157063961029, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3523.0, "completions/mean_length": 2744.979248046875, "completions/mean_terminated_length": 1666.2381591796875, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.096, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12443576008081436, "kl": 0.0005466143290201823, "learning_rate": 8.199999999999999e-08, "loss": 0.0248, "num_tokens": 12418449.0, "reward": 0.4583333432674408, "reward_std": 0.1430540680885315, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 3242.52099609375, "completions/mean_terminated_length": 1242.4285888671875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.09828571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09909730404615402, "kl": 0.0005853970845540365, "learning_rate": 8.4e-08, "loss": 0.0365, "num_tokens": 12746813.0, "reward": 0.1666666716337204, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.1666666716337204, "rewards/format_reward/std": 0.374634325504303, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3469.0, "completions/mean_length": 2681.5, "completions/mean_terminated_length": 1779.0, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.10057142857142858, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12858478724956512, "kl": 0.0006157557169596354, "learning_rate": 8.599999999999999e-08, "loss": 0.0533, "num_tokens": 13020377.0, "reward": 0.5416666865348816, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 3165.58349609375, "completions/mean_terminated_length": 2198.896484375, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.3125, "grad_norm": 0.12761904299259186, "kl": 0.000564416249593099, "learning_rate": 8.8e-08, "loss": 0.0308, "num_tokens": 13340059.0, "reward": 0.3541666865348816, "reward_std": 0.31838303804397583, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4807705879211426, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3209.0, "completions/mean_length": 2985.55224609375, "completions/mean_terminated_length": 1730.741943359375, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.10514285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 0.1372997909784317, "kl": 0.0005257924397786459, "learning_rate": 9e-08, "loss": 0.0533, "num_tokens": 13643196.0, "reward": 0.3645833432674408, "reward_std": 0.292867511510849, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2885.96875, "completions/mean_terminated_length": 1772.8919677734375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.10742857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09933201223611832, "kl": 0.0005814234415690104, "learning_rate": 9.2e-08, "loss": 0.0661, "num_tokens": 13936977.0, "reward": 0.4270833432674408, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 2161.541748046875, "completions/mean_terminated_length": 1007.4717407226562, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.10971428571428571, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12496917694807053, "kl": 0.0005776087443033854, "learning_rate": 9.4e-08, "loss": 0.039, "num_tokens": 14159347.0, "reward": 0.5729166865348816, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2724.27099609375, "completions/mean_terminated_length": 1664.6046142578125, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.112, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1356138288974762, "kl": 0.0005715688069661459, "learning_rate": 9.599999999999999e-08, "loss": 0.0539, "num_tokens": 14436279.0, "reward": 0.4479166865348816, "reward_std": 0.2828284502029419, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 3031.697998046875, "completions/mean_terminated_length": 1755.689697265625, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.10621047019958496, "kl": 0.0006009737650553385, "learning_rate": 9.799999999999999e-08, "loss": 0.0136, "num_tokens": 14743666.0, "reward": 0.3541666865348816, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.41666666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 3248.041748046875, "completions/mean_terminated_length": 1280.2857666015625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.11657142857142858, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11713991314172745, "kl": 0.0005950927734375, "learning_rate": 1e-07, "loss": 0.0354, "num_tokens": 15071078.0, "reward": 0.1770833432674408, "reward_std": 0.20084446668624878, "rewards/format_reward/mean": 0.1770833283662796, "rewards/format_reward/std": 0.3837430775165558, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2916666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2244.541748046875, "completions/mean_terminated_length": 1246.036376953125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.11885714285714286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10812873393297195, "kl": 0.0005008379618326823, "learning_rate": 9.999890338174275e-08, "loss": -0.0157, "num_tokens": 15303192.0, "reward": 0.5520833730697632, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.49989035725593567, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 2764.20849609375, "completions/mean_terminated_length": 1512.9473876953125, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.12114285714285715, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08318143337965012, "kl": 0.0005362828572591146, "learning_rate": 9.999561358041868e-08, "loss": 0.0381, "num_tokens": 15584606.0, "reward": 0.4479166865348816, "reward_std": 0.15372902154922485, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2930.70849609375, "completions/mean_terminated_length": 1683.5152587890625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.12342857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09070580452680588, "kl": 0.0005499521891276041, "learning_rate": 9.999013075636804e-08, "loss": 0.0242, "num_tokens": 15882172.0, "reward": 0.3854166865348816, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 3003.40625, "completions/mean_terminated_length": 1593.3929443359375, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 0.10172918438911438, "kl": 0.0005849202473958334, "learning_rate": 9.998245517681594e-08, "loss": 0.0171, "num_tokens": 16186891.0, "reward": 0.3020833432674408, "reward_std": 0.16856960952281952, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3470.0, "completions/mean_length": 2583.979248046875, "completions/mean_terminated_length": 1624.7755126953125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.128, "frac_reward_zero_std": 0.75, "grad_norm": 0.07545579224824905, "kl": 0.0005804697672526041, "learning_rate": 9.997258721585931e-08, "loss": 0.0244, "num_tokens": 16451051.0, "reward": 0.5208333730697632, "reward_std": 0.11077921092510223, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 3133.73974609375, "completions/mean_terminated_length": 1921.5001220703125, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.13028571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.0956675112247467, "kl": 0.0006281534830729166, "learning_rate": 9.996052735444862e-08, "loss": 0.0399, "num_tokens": 16767484.0, "reward": 0.2916666865348816, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 3043.229248046875, "completions/mean_terminated_length": 1853.533447265625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.13257142857142856, "frac_reward_zero_std": 0.5, "grad_norm": 0.09832925349473953, "kl": 0.0006004969278971354, "learning_rate": 9.994627618036453e-08, "loss": 0.045, "num_tokens": 17076140.0, "reward": 0.3333333432674408, "reward_std": 0.2350771129131317, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3482.0, "completions/mean_length": 3199.604248046875, "completions/mean_terminated_length": 2107.919921875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.13485714285714287, "frac_reward_zero_std": 0.625, "grad_norm": 0.09223364293575287, "kl": 0.0006233851114908854, "learning_rate": 9.992983438818914e-08, "loss": 0.0386, "num_tokens": 17399472.0, "reward": 0.3125, "reward_std": 0.1975647658109665, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 3025.23974609375, "completions/mean_terminated_length": 1520.8846435546875, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.13714285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 0.1303519904613495, "kl": 0.0006186167399088541, "learning_rate": 9.991120277927222e-08, "loss": 0.0723, "num_tokens": 17706173.0, "reward": 0.3020833432674408, "reward_std": 0.3015846610069275, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 3220.166748046875, "completions/mean_terminated_length": 2290.370361328125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 0.13942857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.1127300038933754, "kl": 0.0005509058634440104, "learning_rate": 9.989038226169209e-08, "loss": 0.0621, "num_tokens": 18031425.0, "reward": 0.375, "reward_std": 0.24859580397605896, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 3095.354248046875, "completions/mean_terminated_length": 1544.434814453125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.1417142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09086631238460541, "kl": 0.0006039937337239584, "learning_rate": 9.98673738502114e-08, "loss": 0.0153, "num_tokens": 18344623.0, "reward": 0.25, "reward_std": 0.1430540829896927, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.435285747051239, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3220.0, "completions/mean_length": 2315.14599609375, "completions/mean_terminated_length": 1098.0816650390625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.144, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10809347033500671, "kl": 0.0006092389424641927, "learning_rate": 9.984217866622769e-08, "loss": 0.0626, "num_tokens": 18581997.0, "reward": 0.53125, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 2666.30224609375, "completions/mean_terminated_length": 1581.75, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.1462857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11078927665948868, "kl": 0.0005582173665364584, "learning_rate": 9.981479793771865e-08, "loss": 0.0651, "num_tokens": 18853448.0, "reward": 0.4791666865348816, "reward_std": 0.2350771129131317, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20833333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3476.0, "completions/mean_length": 3188.760498046875, "completions/mean_terminated_length": 1587.0, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.14857142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.07864987850189209, "kl": 0.000557104746500651, "learning_rate": 9.978523299918239e-08, "loss": 0.0321, "num_tokens": 19176051.0, "reward": 0.2083333432674408, "reward_std": 0.17532894015312195, "rewards/format_reward/mean": 0.2083333283662796, "rewards/format_reward/std": 0.40824830532073975, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 2840.61474609375, "completions/mean_terminated_length": 1353.84375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.15085714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1085980162024498, "kl": 0.0005858739217122396, "learning_rate": 9.975348529157229e-08, "loss": 0.0418, "num_tokens": 19464632.0, "reward": 0.3958333432674408, "reward_std": 0.2163209319114685, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 2540.041748046875, "completions/mean_terminated_length": 1451.6595458984375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.15314285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.11475242674350739, "kl": 0.0005539258321126302, "learning_rate": 9.971955636222684e-08, "loss": 0.0616, "num_tokens": 19723902.0, "reward": 0.5104166865348816, "reward_std": 0.19080540537834167, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2818.17724609375, "completions/mean_terminated_length": 1483.4571533203125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.15542857142857142, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09802790731191635, "kl": 0.0005327860514322916, "learning_rate": 9.968344786479416e-08, "loss": 0.0134, "num_tokens": 20010815.0, "reward": 0.40625, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 2808.08349609375, "completions/mean_terminated_length": 1623.7894287109375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.15771428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 0.10697630792856216, "kl": 0.0005957285563151041, "learning_rate": 9.964516155915151e-08, "loss": 0.0126, "num_tokens": 20297635.0, "reward": 0.4166666865348816, "reward_std": 0.1618102639913559, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3463.0, "completions/mean_length": 3381.229248046875, "completions/mean_terminated_length": 2286.266845703125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.16, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10971253365278244, "kl": 0.0006380081176757812, "learning_rate": 9.960469931131937e-08, "loss": 0.0533, "num_tokens": 20637767.0, "reward": 0.2291666865348816, "reward_std": 0.2808706760406494, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.42250296473503113, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 3010.729248046875, "completions/mean_terminated_length": 1618.5001220703125, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.16228571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.09700077027082443, "kl": 0.0005467732747395834, "learning_rate": 9.956206309337066e-08, "loss": 0.0285, "num_tokens": 20943861.0, "reward": 0.34375, "reward_std": 0.2283177673816681, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 3115.479248046875, "completions/mean_terminated_length": 1628.434814453125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.16457142857142856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14516979455947876, "kl": 0.0005690256754557291, "learning_rate": 9.951725498333447e-08, "loss": 0.0499, "num_tokens": 21259273.0, "reward": 0.3125, "reward_std": 0.27606913447380066, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2583.729248046875, "completions/mean_terminated_length": 1401.5909423828125, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.16685714285714287, "frac_reward_zero_std": 0.8125, "grad_norm": 0.06490742415189743, "kl": 0.000545501708984375, "learning_rate": 9.947027716509488e-08, "loss": 0.0093, "num_tokens": 21523775.0, "reward": 0.4791666865348816, "reward_std": 0.09202303737401962, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4583333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 2347.875, "completions/mean_terminated_length": 1572.677978515625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.16914285714285715, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0859948992729187, "kl": 0.00044027964274088543, "learning_rate": 9.942113192828445e-08, "loss": 0.0131, "num_tokens": 21764615.0, "reward": 0.6458333730697632, "reward_std": 0.15177123248577118, "rewards/format_reward/mean": 0.6458333134651184, "rewards/format_reward/std": 0.4807705879211426, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2864.197998046875, "completions/mean_terminated_length": 2013.5228271484375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11670469492673874, "kl": 0.0005636215209960938, "learning_rate": 9.936982166817271e-08, "loss": 0.0381, "num_tokens": 22055946.0, "reward": 0.5208333730697632, "reward_std": 0.2350771278142929, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2683.17724609375, "completions/mean_terminated_length": 1572.8604736328125, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.1737142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11684764176607132, "kl": 0.0005763371785481771, "learning_rate": 9.931634888554937e-08, "loss": 0.0463, "num_tokens": 22329821.0, "reward": 0.46875, "reward_std": 0.2221943587064743, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 2772.15625, "completions/mean_terminated_length": 1357.2286376953125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.176, "frac_reward_zero_std": 0.625, "grad_norm": 0.10801287740468979, "kl": 0.0005984306335449219, "learning_rate": 9.926071618660237e-08, "loss": 0.0288, "num_tokens": 22611692.0, "reward": 0.3958333432674408, "reward_std": 0.1888476312160492, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3390.0, "completions/mean_length": 3108.86474609375, "completions/mean_terminated_length": 2112.61279296875, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.1782857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.12827573716640472, "kl": 0.0006354649861653646, "learning_rate": 9.920292628279099e-08, "loss": 0.0595, "num_tokens": 22927237.0, "reward": 0.34375, "reward_std": 0.292867511510849, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3283.0, "completions/mean_length": 2821.23974609375, "completions/mean_terminated_length": 1491.857177734375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.18057142857142858, "frac_reward_zero_std": 0.5, "grad_norm": 0.1304936707019806, "kl": 0.0006202061971028646, "learning_rate": 9.914298199071361e-08, "loss": 0.0767, "num_tokens": 23213934.0, "reward": 0.3854166865348816, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 2843.45849609375, "completions/mean_terminated_length": 1662.5946044921875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.18285714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12500052154064178, "kl": 0.0005324681599934896, "learning_rate": 9.908088623197048e-08, "loss": 0.0331, "num_tokens": 23503124.0, "reward": 0.4166666865348816, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 2606.80224609375, "completions/mean_terminated_length": 1350.40478515625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.18514285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09482422471046448, "kl": 0.0005370775858561198, "learning_rate": 9.901664203302125e-08, "loss": 0.0333, "num_tokens": 23768887.0, "reward": 0.4479166865348816, "reward_std": 0.13629475235939026, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 2587.89599609375, "completions/mean_terminated_length": 1410.681884765625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.18742857142857142, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09811008721590042, "kl": 0.0005925496419270834, "learning_rate": 9.895025252503755e-08, "loss": 0.0505, "num_tokens": 24033021.0, "reward": 0.4791666865348816, "reward_std": 0.15177121758460999, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 3143.73974609375, "completions/mean_terminated_length": 1571.3809814453125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.18971428571428572, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07115814089775085, "kl": 0.0005896886189778646, "learning_rate": 9.888172094375033e-08, "loss": 0.038, "num_tokens": 24351236.0, "reward": 0.21875, "reward_std": 0.09006524085998535, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.4155687391757965, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 2987.479248046875, "completions/mean_terminated_length": 2077.0, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.192, "frac_reward_zero_std": 0.3125, "grad_norm": 0.12423859536647797, "kl": 0.0005726814270019531, "learning_rate": 9.88110506292922e-08, "loss": 0.0804, "num_tokens": 24654432.0, "reward": 0.40625, "reward_std": 0.32514238357543945, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 2784.9375, "completions/mean_terminated_length": 1840.5909423828125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.19428571428571428, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1060919538140297, "kl": 0.0005359649658203125, "learning_rate": 9.873824502603459e-08, "loss": 0.0599, "num_tokens": 24938262.0, "reward": 0.53125, "reward_std": 0.26059263944625854, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 2932.885498046875, "completions/mean_terminated_length": 1567.6451416015625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.19657142857142856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14066024124622345, "kl": 0.0005710919698079427, "learning_rate": 9.866330768241983e-08, "loss": 0.0289, "num_tokens": 25235827.0, "reward": 0.375, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 3057.604248046875, "completions/mean_terminated_length": 1640.3846435546875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.19885714285714284, "frac_reward_zero_std": 0.625, "grad_norm": 0.11923669278621674, "kl": 0.0005593299865722656, "learning_rate": 9.85862422507884e-08, "loss": 0.0618, "num_tokens": 25545647.0, "reward": 0.2604166865348816, "reward_std": 0.18688982725143433, "rewards/format_reward/mean": 0.2604166567325592, "rewards/format_reward/std": 0.4411657154560089, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 2599.48974609375, "completions/mean_terminated_length": 1730.803955078125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.20114285714285715, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10778623074293137, "kl": 0.0005170504252115885, "learning_rate": 9.850705248720067e-08, "loss": 0.0493, "num_tokens": 25811650.0, "reward": 0.5729166865348816, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2505.041748046875, "completions/mean_terminated_length": 1470.1224365234375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.20342857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.13670764863491058, "kl": 0.0005366007486979166, "learning_rate": 9.842574225125401e-08, "loss": 0.0698, "num_tokens": 26068160.0, "reward": 0.5520833730697632, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 2809.84375, "completions/mean_terminated_length": 1726.0250244140625, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.2057142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.0981605127453804, "kl": 0.0005105336507161459, "learning_rate": 9.834231550589461e-08, "loss": 0.0202, "num_tokens": 26354153.0, "reward": 0.4479166865348816, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 2839.354248046875, "completions/mean_terminated_length": 1278.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.208, "frac_reward_zero_std": 0.625, "grad_norm": 0.09133773297071457, "kl": 0.0005725224812825521, "learning_rate": 9.825677631722435e-08, "loss": 0.0475, "num_tokens": 26642895.0, "reward": 0.3333333432674408, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 2715.947998046875, "completions/mean_terminated_length": 1391.0263671875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.2102857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11354603618383408, "kl": 0.0005235671997070312, "learning_rate": 9.816912885430258e-08, "loss": 0.049, "num_tokens": 26920108.0, "reward": 0.4166666865348816, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45833333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 3379.90625, "completions/mean_terminated_length": 2076.84619140625, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.21257142857142858, "frac_reward_zero_std": 0.5, "grad_norm": 0.09983745962381363, "kl": 0.0006497701009114584, "learning_rate": 9.807937738894302e-08, "loss": 0.0502, "num_tokens": 27260623.0, "reward": 0.1666666716337204, "reward_std": 0.2350771278142929, "rewards/format_reward/mean": 0.1666666716337204, "rewards/format_reward/std": 0.374634325504303, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 3095.135498046875, "completions/mean_terminated_length": 1706.760009765625, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.21485714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.11047376692295074, "kl": 0.0006453196207682291, "learning_rate": 9.798752629550545e-08, "loss": 0.0327, "num_tokens": 27574148.0, "reward": 0.3125, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2959.28125, "completions/mean_terminated_length": 1584.9000244140625, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.21714285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08645107597112656, "kl": 0.0006707509358723959, "learning_rate": 9.789358005068261e-08, "loss": 0.0238, "num_tokens": 27874355.0, "reward": 0.3645833432674408, "reward_std": 0.13629475235939026, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3333333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3454.0, "completions/mean_length": 2239.510498046875, "completions/mean_terminated_length": 1279.1607666015625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.21942857142857142, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11768729984760284, "kl": 0.0005092620849609375, "learning_rate": 9.779754323328191e-08, "loss": 0.0247, "num_tokens": 28104798.0, "reward": 0.6041667461395264, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.4915960133075714, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 2639.822998046875, "completions/mean_terminated_length": 1198.7105712890625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.22171428571428572, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09902984648942947, "kl": 0.0006221135457356771, "learning_rate": 9.769942052400234e-08, "loss": 0.0299, "num_tokens": 28374433.0, "reward": 0.4270833432674408, "reward_std": 0.15461495518684387, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 2780.39599609375, "completions/mean_terminated_length": 1498.9730224609375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.224, "frac_reward_zero_std": 0.375, "grad_norm": 0.13889165222644806, "kl": 0.0006262461344401041, "learning_rate": 9.759921670520633e-08, "loss": 0.0847, "num_tokens": 28656483.0, "reward": 0.40625, "reward_std": 0.26191452145576477, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 3180.83349609375, "completions/mean_terminated_length": 2035.8399658203125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.22628571428571428, "frac_reward_zero_std": 0.375, "grad_norm": 0.11569508910179138, "kl": 0.0006573994954427084, "learning_rate": 9.749693666068663e-08, "loss": 0.0812, "num_tokens": 28977947.0, "reward": 0.3020833432674408, "reward_std": 0.3015846312046051, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 3071.260498046875, "completions/mean_terminated_length": 1690.8077392578125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.22857142857142856, "frac_reward_zero_std": 0.375, "grad_norm": 0.11107376217842102, "kl": 0.0005807876586914062, "learning_rate": 9.739258537542834e-08, "loss": 0.0475, "num_tokens": 29288514.0, "reward": 0.3020833432674408, "reward_std": 0.27063167095184326, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 2619.375, "completions/mean_terminated_length": 1613.7020263671875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.23085714285714284, "frac_reward_zero_std": 0.5, "grad_norm": 0.11716129630804062, "kl": 0.0005361239115397135, "learning_rate": 9.728616793536587e-08, "loss": 0.059, "num_tokens": 29556156.0, "reward": 0.5520833730697632, "reward_std": 0.24183645844459534, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.49989035725593567, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 2519.625, "completions/mean_terminated_length": 1029.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.23314285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.12329067289829254, "kl": 0.0005308787027994791, "learning_rate": 9.717768952713512e-08, "loss": 0.0208, "num_tokens": 29812920.0, "reward": 0.4270833432674408, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 2608.0, "completions/mean_terminated_length": 1632.0, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.23542857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.10539143532514572, "kl": 0.0005432764689127604, "learning_rate": 9.706715543782063e-08, "loss": 0.0549, "num_tokens": 30078750.0, "reward": 0.5208333730697632, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 2404.65625, "completions/mean_terminated_length": 1447.8302001953125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2377142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.111219622194767, "kl": 0.0005749066670735677, "learning_rate": 9.695457105469804e-08, "loss": 0.0424, "num_tokens": 30325539.0, "reward": 0.5625, "reward_std": 0.1343369334936142, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4986824691295624, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 3327.0625, "completions/mean_terminated_length": 2133.058837890625, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.24, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11259549856185913, "kl": 0.000629425048828125, "learning_rate": 9.683994186497132e-08, "loss": 0.0331, "num_tokens": 30662145.0, "reward": 0.1979166716337204, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.1979166716337204, "rewards/format_reward/std": 0.4005205035209656, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 2797.90625, "completions/mean_terminated_length": 1427.857177734375, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.2422857142857143, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13242016732692719, "kl": 0.0005540847778320312, "learning_rate": 9.672327345550542e-08, "loss": 0.0805, "num_tokens": 30947184.0, "reward": 0.4166666865348816, "reward_std": 0.31838300824165344, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 2729.875, "completions/mean_terminated_length": 1306.3333740234375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.24457142857142858, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08758668601512909, "kl": 0.0005545616149902344, "learning_rate": 9.660457151255409e-08, "loss": 0.0443, "num_tokens": 31225488.0, "reward": 0.375, "reward_std": 0.1430540680885315, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 2560.166748046875, "completions/mean_terminated_length": 1447.3043212890625, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "epoch": 0.24685714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.1443079113960266, "kl": 0.000594933827718099, "learning_rate": 9.648384182148251e-08, "loss": 0.0789, "num_tokens": 31486792.0, "reward": 0.5208333730697632, "reward_std": 0.24859580397605896, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 3014.83349609375, "completions/mean_terminated_length": 1632.571533203125, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.24914285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.08680311590433121, "kl": 0.0006758371988932291, "learning_rate": 9.636109026648555e-08, "loss": 0.0102, "num_tokens": 31792914.0, "reward": 0.3229166865348816, "reward_std": 0.19080543518066406, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 2426.104248046875, "completions/mean_terminated_length": 1167.521728515625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.25142857142857145, "frac_reward_zero_std": 0.625, "grad_norm": 0.11593201011419296, "kl": 0.0005362828572591146, "learning_rate": 9.623632283030077e-08, "loss": 0.0289, "num_tokens": 32041420.0, "reward": 0.5, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3523.0, "completions/mean_length": 2867.08349609375, "completions/mean_terminated_length": 1034.9630126953125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.2537142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09556243568658829, "kl": 0.0005575815836588541, "learning_rate": 9.610954559391703e-08, "loss": 0.0358, "num_tokens": 32332386.0, "reward": 0.3125, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4166666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2368.791748046875, "completions/mean_terminated_length": 1572.6207275390625, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.256, "frac_reward_zero_std": 0.5, "grad_norm": 0.10844600200653076, "kl": 0.0005437533060709635, "learning_rate": 9.598076473627796e-08, "loss": 0.0255, "num_tokens": 32575294.0, "reward": 0.65625, "reward_std": 0.2379208505153656, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4774521291255951, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2600.166748046875, "completions/mean_terminated_length": 1574.468017578125, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.2582857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13194780051708221, "kl": 0.0005145072937011719, "learning_rate": 9.584998653398089e-08, "loss": 0.0817, "num_tokens": 32840822.0, "reward": 0.5416666865348816, "reward_std": 0.26735198497772217, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 3054.05224609375, "completions/mean_terminated_length": 1767.0357666015625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.26057142857142856, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10078351199626923, "kl": 0.0005081494649251302, "learning_rate": 9.571721736097087e-08, "loss": 0.0219, "num_tokens": 33149899.0, "reward": 0.3229166865348816, "reward_std": 0.20564600825309753, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 2923.77099609375, "completions/mean_terminated_length": 1719.823486328125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.26285714285714284, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15436427295207977, "kl": 0.0006771087646484375, "learning_rate": 9.558246368823013e-08, "loss": 0.0573, "num_tokens": 33447255.0, "reward": 0.3958333432674408, "reward_std": 0.2586348056793213, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2840.75, "completions/mean_terminated_length": 1800.2000732421875, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.2651428571428571, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1236443817615509, "kl": 0.0005249977111816406, "learning_rate": 9.544573208346251e-08, "loss": 0.0751, "num_tokens": 33735843.0, "reward": 0.4375, "reward_std": 0.30966588854789734, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2401.322998046875, "completions/mean_terminated_length": 1400.59619140625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.2674285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.1455051302909851, "kl": 0.0005470911661783854, "learning_rate": 9.530702921077357e-08, "loss": 0.0647, "num_tokens": 33982030.0, "reward": 0.5729166865348816, "reward_std": 0.22831778228282928, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3544.0, "completions/mean_length": 3229.52099609375, "completions/mean_terminated_length": 2275.154052734375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.26971428571428574, "frac_reward_zero_std": 0.375, "grad_norm": 0.11932012438774109, "kl": 0.0005947748819986979, "learning_rate": 9.516636183034564e-08, "loss": 0.0577, "num_tokens": 34308324.0, "reward": 0.2916666865348816, "reward_std": 0.28219255805015564, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.45691564679145813, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2713.9375, "completions/mean_terminated_length": 1641.534912109375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.272, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10066317766904831, "kl": 0.0005396207173665365, "learning_rate": 9.502373679810839e-08, "loss": 0.0423, "num_tokens": 34585440.0, "reward": 0.46875, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 2630.80224609375, "completions/mean_terminated_length": 1637.04248046875, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.2742857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13681873679161072, "kl": 0.0005944569905598959, "learning_rate": 9.487916106540466e-08, "loss": 0.0358, "num_tokens": 34853531.0, "reward": 0.5416666865348816, "reward_std": 0.19408510625362396, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 3081.822998046875, "completions/mean_terminated_length": 2123.121337890625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.2765714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10625406354665756, "kl": 0.0005485216776529948, "learning_rate": 9.473264167865172e-08, "loss": 0.0282, "num_tokens": 35165340.0, "reward": 0.40625, "reward_std": 0.2780269384384155, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 2615.510498046875, "completions/mean_terminated_length": 1137.2894287109375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.27885714285714286, "frac_reward_zero_std": 0.875, "grad_norm": 0.04336674138903618, "kl": 0.0005785624186197916, "learning_rate": 9.458418577899774e-08, "loss": 0.0144, "num_tokens": 35432413.0, "reward": 0.40625, "reward_std": 0.05779037997126579, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2774.20849609375, "completions/mean_terminated_length": 1590.666748046875, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.28114285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.07755483686923981, "kl": 0.000576178232828776, "learning_rate": 9.443380060197386e-08, "loss": 0.0074, "num_tokens": 35714451.0, "reward": 0.4270833432674408, "reward_std": 0.12234010547399521, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3146.0, "completions/mean_length": 3162.625, "completions/mean_terminated_length": 1825.2174072265625, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.2834285714285714, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11886311322450638, "kl": 0.0006281534830729166, "learning_rate": 9.428149347714143e-08, "loss": 0.0249, "num_tokens": 36034413.0, "reward": 0.25, "reward_std": 0.1988866776227951, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.435285747051239, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3392.0, "completions/mean_length": 2739.041748046875, "completions/mean_terminated_length": 1605.5609130859375, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.09605032950639725, "kl": 0.00054168701171875, "learning_rate": 9.412727182773486e-08, "loss": 0.0456, "num_tokens": 36313561.0, "reward": 0.4479166865348816, "reward_std": 0.12234010547399521, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 2860.0, "completions/mean_terminated_length": 1477.8182373046875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.288, "frac_reward_zero_std": 0.625, "grad_norm": 0.10251232981681824, "kl": 0.0005782445271809896, "learning_rate": 9.397114317029974e-08, "loss": 0.0653, "num_tokens": 36605005.0, "reward": 0.3541666865348816, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3371.0, "completions/mean_length": 2641.39599609375, "completions/mean_terminated_length": 1376.9267578125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.29028571428571426, "frac_reward_zero_std": 0.5, "grad_norm": 0.11962489783763885, "kl": 0.0005617141723632812, "learning_rate": 9.381311511432658e-08, "loss": 0.0987, "num_tokens": 36874623.0, "reward": 0.4583333432674408, "reward_std": 0.2525114417076111, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 2454.010498046875, "completions/mean_terminated_length": 1061.2325439453125, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.2925714285714286, "frac_reward_zero_std": 0.75, "grad_norm": 0.10726732760667801, "kl": 0.0005575815836588541, "learning_rate": 9.36531953618799e-08, "loss": 0.0395, "num_tokens": 37126294.0, "reward": 0.4583333432674408, "reward_std": 0.13301503658294678, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 2805.885498046875, "completions/mean_terminated_length": 1249.65625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.2948571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.12535634636878967, "kl": 0.0006647109985351562, "learning_rate": 9.34913917072228e-08, "loss": 0.0999, "num_tokens": 37410617.0, "reward": 0.3541666865348816, "reward_std": 0.24468021094799042, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 2837.875, "completions/mean_terminated_length": 1537.4857177734375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.29714285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.08393247425556183, "kl": 0.0005332628885904948, "learning_rate": 9.332771203643714e-08, "loss": 0.0075, "num_tokens": 37698221.0, "reward": 0.40625, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 2980.27099609375, "completions/mean_terminated_length": 1585.4482421875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.29942857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.09301748126745224, "kl": 0.0005849202473958334, "learning_rate": 9.316216432703917e-08, "loss": 0.0428, "num_tokens": 38001259.0, "reward": 0.3333333432674408, "reward_std": 0.22635996341705322, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 2670.9375, "completions/mean_terminated_length": 1678.478271484375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.3017142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.13140074908733368, "kl": 0.000514984130859375, "learning_rate": 9.299475664759068e-08, "loss": 0.0757, "num_tokens": 38272585.0, "reward": 0.5208333730697632, "reward_std": 0.24468019604682922, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 3229.95849609375, "completions/mean_terminated_length": 1695.77783203125, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.304, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07494261860847473, "kl": 0.00063323974609375, "learning_rate": 9.282549715730579e-08, "loss": 0.0298, "num_tokens": 38599491.0, "reward": 0.1979166716337204, "reward_std": 0.09878238290548325, "rewards/format_reward/mean": 0.1979166716337204, "rewards/format_reward/std": 0.4005205035209656, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2853.416748046875, "completions/mean_terminated_length": 1392.25, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.3062857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.07386979460716248, "kl": 0.0005450248718261719, "learning_rate": 9.265439410565328e-08, "loss": 0.0372, "num_tokens": 38889769.0, "reward": 0.375, "reward_std": 0.15177121758460999, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3189.0, "completions/mean_length": 2473.64599609375, "completions/mean_terminated_length": 1105.06982421875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.30857142857142855, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11156300455331802, "kl": 0.0006051063537597656, "learning_rate": 9.248145583195447e-08, "loss": 0.0609, "num_tokens": 39141885.0, "reward": 0.4895833432674408, "reward_std": 0.20956158638000488, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 2794.260498046875, "completions/mean_terminated_length": 1478.02783203125, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 0.31085714285714283, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12031218409538269, "kl": 0.0005391438802083334, "learning_rate": 9.230669076497686e-08, "loss": 0.0584, "num_tokens": 39426376.0, "reward": 0.3854166865348816, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3392.0, "completions/mean_length": 2761.979248046875, "completions/mean_terminated_length": 1451.189208984375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.31314285714285717, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0906476154923439, "kl": 0.0005747477213541666, "learning_rate": 9.213010742252327e-08, "loss": 0.064, "num_tokens": 39708008.0, "reward": 0.40625, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3162.0, "completions/mean_length": 2545.83349609375, "completions/mean_terminated_length": 1463.4892578125, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.31542857142857145, "frac_reward_zero_std": 0.625, "grad_norm": 0.08902504295110703, "kl": 0.0005307197570800781, "learning_rate": 9.195171441101667e-08, "loss": 0.0554, "num_tokens": 39968590.0, "reward": 0.5208333730697632, "reward_std": 0.18404607474803925, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2885.4375, "completions/mean_terminated_length": 1488.3125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3177142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11393805593252182, "kl": 0.0006208419799804688, "learning_rate": 9.177152042508077e-08, "loss": 0.0793, "num_tokens": 40262128.0, "reward": 0.3854166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3460.0, "completions/mean_length": 2637.36474609375, "completions/mean_terminated_length": 1564.5111083984375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.32, "frac_reward_zero_std": 0.5, "grad_norm": 0.11296574771404266, "kl": 0.0005234082539876302, "learning_rate": 9.158953424711624e-08, "loss": 0.0365, "num_tokens": 40531449.0, "reward": 0.53125, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 2686.83349609375, "completions/mean_terminated_length": 1317.4737548828125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.3222857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11850091814994812, "kl": 0.0007069905598958334, "learning_rate": 9.140576474687262e-08, "loss": 0.0538, "num_tokens": 40804793.0, "reward": 0.4166666865348816, "reward_std": 0.2263599932193756, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2979.229248046875, "completions/mean_terminated_length": 1648.7333984375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.32457142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11879272013902664, "kl": 0.0006297429402669271, "learning_rate": 9.122022088101613e-08, "loss": 0.0812, "num_tokens": 41106555.0, "reward": 0.3333333432674408, "reward_std": 0.21151940524578094, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2892.875, "completions/mean_terminated_length": 1296.137939453125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.32685714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.09709058701992035, "kl": 0.0006171862284342448, "learning_rate": 9.1032911692693e-08, "loss": 0.0669, "num_tokens": 41399667.0, "reward": 0.3229166865348816, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 2782.64599609375, "completions/mean_terminated_length": 1321.3529052734375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3291428571428571, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11509748548269272, "kl": 0.0006793340047200521, "learning_rate": 9.084384631108882e-08, "loss": 0.0885, "num_tokens": 41684021.0, "reward": 0.3645833432674408, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 2931.479248046875, "completions/mean_terminated_length": 1563.290283203125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.3314285714285714, "frac_reward_zero_std": 0.375, "grad_norm": 0.1290571242570877, "kl": 0.0006090799967447916, "learning_rate": 9.065303395098358e-08, "loss": 0.0442, "num_tokens": 41981907.0, "reward": 0.34375, "reward_std": 0.28895190358161926, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2853.947998046875, "completions/mean_terminated_length": 1393.84375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.33371428571428574, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14284007251262665, "kl": 0.0005645751953125, "learning_rate": 9.046048391230248e-08, "loss": 0.0364, "num_tokens": 42271942.0, "reward": 0.34375, "reward_std": 0.24315834045410156, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3333333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3524.0, "completions/mean_length": 2461.30224609375, "completions/mean_terminated_length": 1659.3751220703125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.336, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10502398759126663, "kl": 0.0005308787027994791, "learning_rate": 9.026620557966279e-08, "loss": 0.0276, "num_tokens": 42523395.0, "reward": 0.6145833730697632, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.6145833134651184, "rewards/format_reward/std": 0.4892484247684479, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 3048.791748046875, "completions/mean_terminated_length": 1978.375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.3382857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.12605418264865875, "kl": 0.0006653467814127604, "learning_rate": 9.007020842191633e-08, "loss": 0.0445, "num_tokens": 42832849.0, "reward": 0.3541666865348816, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 2798.90625, "completions/mean_terminated_length": 1228.71875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.3405714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.0965123325586319, "kl": 0.0006402333577473959, "learning_rate": 8.987250199168807e-08, "loss": 0.0422, "num_tokens": 43117594.0, "reward": 0.3645833432674408, "reward_std": 0.15985246002674103, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 2891.291748046875, "completions/mean_terminated_length": 1684.0, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.34285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.11779191344976425, "kl": 0.0006155967712402344, "learning_rate": 8.967309592491052e-08, "loss": 0.049, "num_tokens": 43411262.0, "reward": 0.3958333432674408, "reward_std": 0.23899273574352264, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3296.0, "completions/mean_length": 2597.447998046875, "completions/mean_terminated_length": 1381.465087890625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.34514285714285714, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09505832195281982, "kl": 0.0005811055501302084, "learning_rate": 8.9471999940354e-08, "loss": 0.0627, "num_tokens": 43676181.0, "reward": 0.4791666865348816, "reward_std": 0.2163209617137909, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 2815.61474609375, "completions/mean_terminated_length": 1590.3514404296875, "completions/min_length": 527.0, "completions/min_terminated_length": 527.0, "epoch": 0.3474285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0899856686592102, "kl": 0.0006135304768880209, "learning_rate": 8.926922383915315e-08, "loss": 0.0299, "num_tokens": 43962416.0, "reward": 0.40625, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3415.0, "completions/mean_length": 2734.86474609375, "completions/mean_terminated_length": 1438.8157958984375, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.3497142857142857, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16459882259368896, "kl": 0.0006030400594075521, "learning_rate": 8.906477750432904e-08, "loss": 0.1225, "num_tokens": 44240029.0, "reward": 0.40625, "reward_std": 0.30770808458328247, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 2596.89599609375, "completions/mean_terminated_length": 1272.731689453125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.352, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08611822873353958, "kl": 0.0005370775858561198, "learning_rate": 8.88586709003076e-08, "loss": 0.0104, "num_tokens": 44505213.0, "reward": 0.4479166865348816, "reward_std": 0.13629473745822906, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 3185.135498046875, "completions/mean_terminated_length": 1988.541748046875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.35428571428571426, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08149396628141403, "kl": 0.0005811055501302084, "learning_rate": 8.865091407243394e-08, "loss": 0.0194, "num_tokens": 44826592.0, "reward": 0.2708333432674408, "reward_std": 0.18536798655986786, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2321.385498046875, "completions/mean_terminated_length": 1207.313720703125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.3565714285714286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09393462538719177, "kl": 0.0006497701009114584, "learning_rate": 8.844151714648274e-08, "loss": 0.0155, "num_tokens": 45064829.0, "reward": 0.59375, "reward_std": 0.16333210468292236, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 2997.09375, "completions/mean_terminated_length": 2101.28955078125, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.3588571428571429, "frac_reward_zero_std": 0.375, "grad_norm": 0.10948645323514938, "kl": 0.0005507469177246094, "learning_rate": 8.823049032816478e-08, "loss": 0.03, "num_tokens": 45369080.0, "reward": 0.4375, "reward_std": 0.29962682723999023, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 3103.791748046875, "completions/mean_terminated_length": 1937.571533203125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.36114285714285715, "frac_reward_zero_std": 0.625, "grad_norm": 0.09392761439085007, "kl": 0.0006020863850911459, "learning_rate": 8.801784390262942e-08, "loss": 0.0215, "num_tokens": 45683952.0, "reward": 0.354166716337204, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4807705879211426, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3529.0, "completions/mean_length": 2809.36474609375, "completions/mean_terminated_length": 1459.2857666015625, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.36342857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15056172013282776, "kl": 0.0006561279296875, "learning_rate": 8.780358823396351e-08, "loss": 0.0551, "num_tokens": 45969137.0, "reward": 0.3645833432674408, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 3053.90625, "completions/mean_terminated_length": 1699.2222900390625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.3657142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10764044523239136, "kl": 0.0006933212280273438, "learning_rate": 8.758773376468605e-08, "loss": 0.0445, "num_tokens": 46278746.0, "reward": 0.28125, "reward_std": 0.20084446668624878, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 2655.23974609375, "completions/mean_terminated_length": 1174.2432861328125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.368, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08771516382694244, "kl": 0.0005706151326497396, "learning_rate": 8.737029101523929e-08, "loss": 0.0274, "num_tokens": 46549429.0, "reward": 0.40625, "reward_std": 0.14501188695430756, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 2715.33349609375, "completions/mean_terminated_length": 1598.4761962890625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "epoch": 0.3702857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11459244787693024, "kl": 0.0006163914998372396, "learning_rate": 8.715127058347614e-08, "loss": 0.0725, "num_tokens": 46825299.0, "reward": 0.4791666865348816, "reward_std": 0.22635996341705322, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 2602.96875, "completions/mean_terminated_length": 1491.1334228515625, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.37257142857142855, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13686417043209076, "kl": 0.0006634394327799479, "learning_rate": 8.693068314414344e-08, "loss": 0.0501, "num_tokens": 47091750.0, "reward": 0.5104166865348816, "reward_std": 0.2741113305091858, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3161.0, "completions/mean_length": 2683.92724609375, "completions/mean_terminated_length": 1310.131591796875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.37485714285714283, "frac_reward_zero_std": 0.5, "grad_norm": 0.13948817551136017, "kl": 0.0006837844848632812, "learning_rate": 8.670853944836176e-08, "loss": 0.1137, "num_tokens": 47364659.0, "reward": 0.4062500596046448, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 3005.6875, "completions/mean_terminated_length": 1448.6923828125, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.37714285714285717, "frac_reward_zero_std": 0.5, "grad_norm": 0.12864314019680023, "kl": 0.0006459554036458334, "learning_rate": 8.648485032310144e-08, "loss": 0.102, "num_tokens": 47669495.0, "reward": 0.2812500298023224, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 3095.53125, "completions/mean_terminated_length": 1545.1739501953125, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.37942857142857145, "frac_reward_zero_std": 0.6875, "grad_norm": 0.07900777459144592, "kl": 0.0006132125854492188, "learning_rate": 8.625962667065488e-08, "loss": 0.0139, "num_tokens": 47983370.0, "reward": 0.3020833432674408, "reward_std": 0.13629472255706787, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 2459.322998046875, "completions/mean_terminated_length": 1507.673095703125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.38171428571428573, "frac_reward_zero_std": 0.5, "grad_norm": 0.11869540065526962, "kl": 0.0005857149759928385, "learning_rate": 8.603287946810513e-08, "loss": 0.0159, "num_tokens": 48235479.0, "reward": 0.5625, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4986824691295624, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04166666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 3008.48974609375, "completions/mean_terminated_length": 1181.86962890625, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.384, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08864536881446838, "kl": 0.0005278587341308594, "learning_rate": 8.580461976679099e-08, "loss": 0.0191, "num_tokens": 48540476.0, "reward": 0.28125, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2916666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2370.20849609375, "completions/mean_terminated_length": 1465.3817138671875, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.3862857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11368736624717712, "kl": 0.000568230946858724, "learning_rate": 8.557485869176825e-08, "loss": 0.0425, "num_tokens": 48783358.0, "reward": 0.59375, "reward_std": 0.219600647687912, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3333333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 2376.916748046875, "completions/mean_terminated_length": 1514.71435546875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.38857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.15671178698539734, "kl": 0.0005776087443033854, "learning_rate": 8.534360744126753e-08, "loss": 0.0616, "num_tokens": 49027166.0, "reward": 0.5937500596046448, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3260.0, "completions/mean_length": 2920.5625, "completions/mean_terminated_length": 1309.357177734375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.39085714285714285, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07653896510601044, "kl": 0.0005582173665364584, "learning_rate": 8.511087728614862e-08, "loss": 0.022, "num_tokens": 49323590.0, "reward": 0.3125, "reward_std": 0.09202303737401962, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2526.291748046875, "completions/mean_terminated_length": 1631.3077392578125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.3931428571428571, "frac_reward_zero_std": 0.375, "grad_norm": 0.14437232911586761, "kl": 0.0005766550699869791, "learning_rate": 8.487667956935088e-08, "loss": 0.0502, "num_tokens": 49581444.0, "reward": 0.59375, "reward_std": 0.28806596994400024, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 3077.947998046875, "completions/mean_terminated_length": 2155.14697265625, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 0.3954285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.08480256795883179, "kl": 0.0005811055501302084, "learning_rate": 8.464102570534061e-08, "loss": 0.0511, "num_tokens": 49893529.0, "reward": 0.3645833432674408, "reward_std": 0.18208825588226318, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2870.09375, "completions/mean_terminated_length": 1680.25, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.3977142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.14392878115177155, "kl": 0.0005621910095214844, "learning_rate": 8.440392717955475e-08, "loss": 0.0238, "num_tokens": 50185636.0, "reward": 0.4479166865348816, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 2903.229248046875, "completions/mean_terminated_length": 969.8399658203125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.4, "frac_reward_zero_std": 0.5, "grad_norm": 0.1288723200559616, "kl": 0.0005869865417480469, "learning_rate": 8.416539554784089e-08, "loss": 0.0268, "num_tokens": 50480072.0, "reward": 0.2708333432674408, "reward_std": 0.21764282882213593, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 2545.979248046875, "completions/mean_terminated_length": 1211.3809814453125, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.4022857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09528925269842148, "kl": 0.0005660057067871094, "learning_rate": 8.392544243589427e-08, "loss": 0.0357, "num_tokens": 50740494.0, "reward": 0.46875, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 2988.48974609375, "completions/mean_terminated_length": 1851.6060791015625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.4045714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11670532822608948, "kl": 0.0006411870320638021, "learning_rate": 8.368407953869103e-08, "loss": 0.0658, "num_tokens": 51043175.0, "reward": 0.3541666865348816, "reward_std": 0.26735198497772217, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 3167.71875, "completions/mean_terminated_length": 2156.75, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.40685714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0889914259314537, "kl": 0.0005296071370442709, "learning_rate": 8.344131861991828e-08, "loss": 0.0184, "num_tokens": 51363368.0, "reward": 0.3854166865348816, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3315.0, "completions/mean_length": 2948.70849609375, "completions/mean_terminated_length": 1551.0667724609375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.40914285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08148004114627838, "kl": 0.0005030632019042969, "learning_rate": 8.319717151140072e-08, "loss": 0.0195, "num_tokens": 51662044.0, "reward": 0.3645833432674408, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 2793.03125, "completions/mean_terminated_length": 1637.0, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.4114285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12045343965291977, "kl": 0.0006186167399088541, "learning_rate": 8.295165011252396e-08, "loss": 0.0617, "num_tokens": 51945883.0, "reward": 0.4375, "reward_std": 0.23639902472496033, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3406.0, "completions/mean_length": 2654.479248046875, "completions/mean_terminated_length": 1407.5609130859375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.4137142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09716926515102386, "kl": 0.0005747477213541666, "learning_rate": 8.270476638965461e-08, "loss": 0.0472, "num_tokens": 52216547.0, "reward": 0.46875, "reward_std": 0.21827873587608337, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 3025.53125, "completions/mean_terminated_length": 1521.9615478515625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.416, "frac_reward_zero_std": 0.625, "grad_norm": 0.09838991612195969, "kl": 0.000541528065999349, "learning_rate": 8.245653237555705e-08, "loss": 0.0178, "num_tokens": 52522814.0, "reward": 0.2916666865348816, "reward_std": 0.1888476312160492, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569157063961029, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 2912.39599609375, "completions/mean_terminated_length": 1687.7059326171875, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.41828571428571426, "frac_reward_zero_std": 0.625, "grad_norm": 0.10540112853050232, "kl": 0.0005413691202799479, "learning_rate": 8.220696016880687e-08, "loss": 0.0488, "num_tokens": 52818964.0, "reward": 0.3854166865348816, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2859.30224609375, "completions/mean_terminated_length": 1651.4722900390625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "epoch": 0.4205714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10342766344547272, "kl": 0.0004857381184895833, "learning_rate": 8.195606193320136e-08, "loss": 0.0414, "num_tokens": 53109933.0, "reward": 0.4270833432674408, "reward_std": 0.20564600825309753, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3420.0, "completions/mean_length": 3069.3125, "completions/mean_terminated_length": 2086.727294921875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.4228571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.12011417001485825, "kl": 0.0006421407063802084, "learning_rate": 8.170384989716656e-08, "loss": 0.0514, "num_tokens": 53420805.0, "reward": 0.3854166865348816, "reward_std": 0.22440218925476074, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 2728.21875, "completions/mean_terminated_length": 1627.9285888671875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.42514285714285716, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12889675796031952, "kl": 0.0006151199340820312, "learning_rate": 8.145033635316129e-08, "loss": 0.078, "num_tokens": 53699148.0, "reward": 0.5, "reward_std": 0.24511615931987762, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3010.0, "completions/mean_length": 2672.041748046875, "completions/mean_terminated_length": 1395.300048828125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 0.42742857142857144, "frac_reward_zero_std": 0.75, "grad_norm": 0.09514724463224411, "kl": 0.0006281534830729166, "learning_rate": 8.119553365707802e-08, "loss": 0.0136, "num_tokens": 53971480.0, "reward": 0.4270833432674408, "reward_std": 0.1223401129245758, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 3115.510498046875, "completions/mean_terminated_length": 1539.681884765625, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.4297142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08934146165847778, "kl": 0.0006567637125651041, "learning_rate": 8.093945422764069e-08, "loss": 0.016, "num_tokens": 54287393.0, "reward": 0.2708333432674408, "reward_std": 0.14305409789085388, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 3076.89599609375, "completions/mean_terminated_length": 1555.5833740234375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.432, "frac_reward_zero_std": 0.5, "grad_norm": 0.10708770155906677, "kl": 0.0006875991821289062, "learning_rate": 8.068211054579943e-08, "loss": 0.008, "num_tokens": 54598999.0, "reward": 0.3125, "reward_std": 0.24859580397605896, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2818.23974609375, "completions/mean_terminated_length": 1212.6129150390625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4342857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.08781887590885162, "kl": 0.0005850791931152344, "learning_rate": 8.04235151541222e-08, "loss": 0.0144, "num_tokens": 54885888.0, "reward": 0.3541666865348816, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2877.25, "completions/mean_terminated_length": 2042.0, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "epoch": 0.43657142857142855, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11006169021129608, "kl": 0.0006113052368164062, "learning_rate": 8.016368065618359e-08, "loss": 0.0358, "num_tokens": 55178484.0, "reward": 0.5208333730697632, "reward_std": 0.26735198497772217, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 2934.20849609375, "completions/mean_terminated_length": 1898.0540771484375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.43885714285714283, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10060418397188187, "kl": 0.0006313323974609375, "learning_rate": 7.990261971595047e-08, "loss": 0.043, "num_tokens": 55476494.0, "reward": 0.4166666865348816, "reward_std": 0.18536798655986786, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 2798.729248046875, "completions/mean_terminated_length": 1908.755615234375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.44114285714285717, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11005173623561859, "kl": 0.000522454579671224, "learning_rate": 7.964034505716476e-08, "loss": 0.0196, "num_tokens": 55760394.0, "reward": 0.5625, "reward_std": 0.26343637704849243, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4986824691295624, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 3145.041748046875, "completions/mean_terminated_length": 2130.896484375, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.44342857142857145, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11194142699241638, "kl": 0.0006106694539388021, "learning_rate": 7.93768694627233e-08, "loss": 0.0607, "num_tokens": 56078308.0, "reward": 0.3333333432674408, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2681.30224609375, "completions/mean_terminated_length": 1176.8055419921875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.44571428571428573, "frac_reward_zero_std": 0.8125, "grad_norm": 0.09316386282444, "kl": 0.0005966822306315104, "learning_rate": 7.911220577405484e-08, "loss": 0.009, "num_tokens": 56352117.0, "reward": 0.40625, "reward_std": 0.0765465646982193, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 3059.58349609375, "completions/mean_terminated_length": 1719.407470703125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.448, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0975174605846405, "kl": 0.0005820592244466146, "learning_rate": 7.884636689049422e-08, "loss": 0.0282, "num_tokens": 56662079.0, "reward": 0.3020833730697632, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2882.52099609375, "completions/mean_terminated_length": 1763.946044921875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.4502857142857143, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13356514275074005, "kl": 0.0005550384521484375, "learning_rate": 7.857936576865357e-08, "loss": 0.0199, "num_tokens": 56953957.0, "reward": 0.46875, "reward_std": 0.3116236925125122, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 3071.135498046875, "completions/mean_terminated_length": 1690.34619140625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.45257142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.10992588847875595, "kl": 0.0006163914998372396, "learning_rate": 7.831121542179086e-08, "loss": 0.0085, "num_tokens": 57264686.0, "reward": 0.3229166865348816, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2945.75, "completions/mean_terminated_length": 1607.4837646484375, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.45485714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10106715559959412, "kl": 0.0005825360616048177, "learning_rate": 7.804192891917572e-08, "loss": 0.0392, "num_tokens": 57562844.0, "reward": 0.3333333432674408, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3381.0, "completions/mean_length": 3020.75, "completions/mean_terminated_length": 1421.1199951171875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.45714285714285713, "frac_reward_zero_std": 0.375, "grad_norm": 0.13515648245811462, "kl": 0.0005766550699869791, "learning_rate": 7.777151938545235e-08, "loss": 0.0411, "num_tokens": 57869174.0, "reward": 0.3125, "reward_std": 0.2861081659793854, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3466.0, "completions/mean_length": 3027.4375, "completions/mean_terminated_length": 1914.3125, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "epoch": 0.4594285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12819920480251312, "kl": 0.0006488164265950521, "learning_rate": 7.75e-08, "loss": -0.0065, "num_tokens": 58175660.0, "reward": 0.40625, "reward_std": 0.27019572257995605, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2730.55224609375, "completions/mean_terminated_length": 1535.7249755859375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.4617142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.12962771952152252, "kl": 0.0005675951639811198, "learning_rate": 7.722738399629039e-08, "loss": 0.0167, "num_tokens": 58454473.0, "reward": 0.4270833432674408, "reward_std": 0.22831778228282928, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3376.0, "completions/mean_length": 2718.95849609375, "completions/mean_terminated_length": 1454.6666259765625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.464, "frac_reward_zero_std": 0.5, "grad_norm": 0.11856161057949066, "kl": 0.0005909601847330729, "learning_rate": 7.695368466124297e-08, "loss": 0.0439, "num_tokens": 58730943.0, "reward": 0.4479166865348816, "reward_std": 0.23703491687774658, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3486.0, "completions/mean_length": 2858.854248046875, "completions/mean_terminated_length": 1650.27783203125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.4662857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09397979080677032, "kl": 0.0005482037862141927, "learning_rate": 7.667891533457717e-08, "loss": 0.0432, "num_tokens": 59021467.0, "reward": 0.4270833432674408, "reward_std": 0.19080542027950287, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2655.635498046875, "completions/mean_terminated_length": 1238.657958984375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.4685714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13924960792064667, "kl": 0.0005423227945963541, "learning_rate": 7.640308940816239e-08, "loss": 0.0616, "num_tokens": 59293232.0, "reward": 0.4270833432674408, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3398.0, "completions/mean_length": 2803.885498046875, "completions/mean_terminated_length": 1503.6944580078125, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.47085714285714286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09617812931537628, "kl": 0.0006184577941894531, "learning_rate": 7.612622032536508e-08, "loss": 0.0661, "num_tokens": 59577705.0, "reward": 0.3958333432674408, "reward_std": 0.1430540829896927, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 2478.39599609375, "completions/mean_terminated_length": 1417.9183349609375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.47314285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.12245610356330872, "kl": 0.0006500879923502604, "learning_rate": 7.584832158039378e-08, "loss": 0.068, "num_tokens": 59831855.0, "reward": 0.5729167461395264, "reward_std": 0.25055360794067383, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2997.36474609375, "completions/mean_terminated_length": 2139.974365234375, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.4754285714285714, "frac_reward_zero_std": 0.3125, "grad_norm": 0.12236214429140091, "kl": 0.0006504058837890625, "learning_rate": 7.556940671764124e-08, "loss": 0.0698, "num_tokens": 60136132.0, "reward": 0.5, "reward_std": 0.3406188488006592, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3427.0, "completions/mean_length": 3120.71875, "completions/mean_terminated_length": 1995.607177734375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.4777142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.09329062700271606, "kl": 0.0006885528564453125, "learning_rate": 7.528948933102439e-08, "loss": 0.0105, "num_tokens": 60451075.0, "reward": 0.3125000298023224, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3435.0, "completions/mean_length": 3035.822998046875, "completions/mean_terminated_length": 1391.291748046875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.48, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11040231585502625, "kl": 0.0006093978881835938, "learning_rate": 7.500858306332172e-08, "loss": 0.0659, "num_tokens": 60758588.0, "reward": 0.3125000298023224, "reward_std": 0.2721535265445709, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 3452.354248046875, "completions/mean_terminated_length": 2179.77783203125, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 0.48228571428571426, "frac_reward_zero_std": 0.625, "grad_norm": 0.08212711662054062, "kl": 0.0005893707275390625, "learning_rate": 7.472670160550848e-08, "loss": 0.0362, "num_tokens": 61106274.0, "reward": 0.1458333432674408, "reward_std": 0.19276320934295654, "rewards/format_reward/mean": 0.1458333283662796, "rewards/format_reward/std": 0.3547917604446411, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 2831.416748046875, "completions/mean_terminated_length": 1459.058837890625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.4845714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11112417280673981, "kl": 0.0006930033365885416, "learning_rate": 7.444385869608922e-08, "loss": 0.0581, "num_tokens": 61393804.0, "reward": 0.3854166865348816, "reward_std": 0.22308027744293213, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2847.4375, "completions/mean_terminated_length": 1441.2728271484375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.4868571428571429, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10535288602113724, "kl": 0.0006008148193359375, "learning_rate": 7.416006812042826e-08, "loss": 0.0266, "num_tokens": 61682806.0, "reward": 0.375, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 2808.90625, "completions/mean_terminated_length": 1676.076904296875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.48914285714285716, "frac_reward_zero_std": 0.375, "grad_norm": 0.13144823908805847, "kl": 0.0006440480550130209, "learning_rate": 7.387534371007797e-08, "loss": 0.0766, "num_tokens": 61968043.0, "reward": 0.4270833432674408, "reward_std": 0.27934882044792175, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0833333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 2482.05224609375, "completions/mean_terminated_length": 1468.260009765625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.49142857142857144, "frac_reward_zero_std": 0.5, "grad_norm": 0.14610359072685242, "kl": 0.0006596247355143229, "learning_rate": 7.358969934210438e-08, "loss": 0.0498, "num_tokens": 62221914.0, "reward": 0.5520833730697632, "reward_std": 0.2196006178855896, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3544.0, "completions/mean_length": 2841.61474609375, "completions/mean_terminated_length": 1708.5, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.4937142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.07923561334609985, "kl": 0.0006917317708333334, "learning_rate": 7.3303148938411e-08, "loss": 0.0332, "num_tokens": 62511071.0, "reward": 0.4166666865348816, "reward_std": 0.11558075994253159, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2866.89599609375, "completions/mean_terminated_length": 1559.2353515625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.496, "frac_reward_zero_std": 0.375, "grad_norm": 0.12923845648765564, "kl": 0.0005966822306315104, "learning_rate": 7.301570646506028e-08, "loss": 0.0594, "num_tokens": 62802661.0, "reward": 0.4270833432674408, "reward_std": 0.29766902327537537, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3530.0, "completions/mean_length": 2852.635498046875, "completions/mean_terminated_length": 1871.5364990234375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.4982857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10057669878005981, "kl": 0.0007403691609700521, "learning_rate": 7.27273859315928e-08, "loss": 0.0616, "num_tokens": 63092162.0, "reward": 0.4583333432674408, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0833333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 2374.510498046875, "completions/mean_terminated_length": 1261.780029296875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.5005714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.1514943689107895, "kl": 0.0006774266560872396, "learning_rate": 7.243820139034463e-08, "loss": 0.0621, "num_tokens": 63335175.0, "reward": 0.53125, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 2696.041748046875, "completions/mean_terminated_length": 1452.9000244140625, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.5028571428571429, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09723657369613647, "kl": 0.0006256103515625, "learning_rate": 7.214816693576235e-08, "loss": 0.0303, "num_tokens": 63610441.0, "reward": 0.4791666865348816, "reward_std": 0.15657275915145874, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 2935.53125, "completions/mean_terminated_length": 1360.6785888671875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.5051428571428571, "frac_reward_zero_std": 0.8125, "grad_norm": 0.06203387677669525, "kl": 0.0005817413330078125, "learning_rate": 7.185729670371604e-08, "loss": 0.0081, "num_tokens": 63908824.0, "reward": 0.3229166865348816, "reward_std": 0.09006524085998535, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3261.0, "completions/mean_length": 2861.479248046875, "completions/mean_terminated_length": 1346.51611328125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.5074285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.12795600295066833, "kl": 0.0006310145060221354, "learning_rate": 7.156560487081051e-08, "loss": 0.0535, "num_tokens": 64200392.0, "reward": 0.34375, "reward_std": 0.24183645844459534, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3317.0, "completions/mean_length": 2594.78125, "completions/mean_terminated_length": 1267.7803955078125, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.5097142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.09798948466777802, "kl": 0.000594933827718099, "learning_rate": 7.127310565369415e-08, "loss": 0.0282, "num_tokens": 64465505.0, "reward": 0.4479166865348816, "reward_std": 0.12234010547399521, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 3218.697998046875, "completions/mean_terminated_length": 2181.239990234375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.512, "frac_reward_zero_std": 0.25, "grad_norm": 0.11903919279575348, "kl": 0.0006157557169596354, "learning_rate": 7.097981330836617e-08, "loss": 0.0783, "num_tokens": 64790640.0, "reward": 0.3333333432674408, "reward_std": 0.3506578803062439, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 2455.5, "completions/mean_terminated_length": 941.6585083007812, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.5142857142857142, "frac_reward_zero_std": 0.75, "grad_norm": 0.07495436072349548, "kl": 0.0005944569905598959, "learning_rate": 7.068574212948168e-08, "loss": -0.0009, "num_tokens": 65042346.0, "reward": 0.4375, "reward_std": 0.11077922582626343, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3180.0, "completions/mean_length": 3073.77099609375, "completions/mean_terminated_length": 1834.6429443359375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.5165714285714286, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1402408331632614, "kl": 0.0006856918334960938, "learning_rate": 7.039090644965509e-08, "loss": 0.055, "num_tokens": 65353040.0, "reward": 0.3645833432674408, "reward_std": 0.3116236627101898, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 2995.33349609375, "completions/mean_terminated_length": 1761.0322265625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.5188571428571429, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0822618380188942, "kl": 0.0005601247151692709, "learning_rate": 7.009532063876148e-08, "loss": 0.043, "num_tokens": 65656558.0, "reward": 0.4270833432674408, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 2981.666748046875, "completions/mean_terminated_length": 1883.2940673828125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.5211428571428571, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12317512929439545, "kl": 0.0006227493286132812, "learning_rate": 6.979899910323624e-08, "loss": 0.0284, "num_tokens": 65959184.0, "reward": 0.3541666865348816, "reward_std": 0.27606913447380066, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2456.197998046875, "completions/mean_terminated_length": 1461.0784912109375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.5234285714285715, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10438801348209381, "kl": 0.0005610783894856771, "learning_rate": 6.950195628537299e-08, "loss": 0.0672, "num_tokens": 66209973.0, "reward": 0.5520833730697632, "reward_std": 0.22308027744293213, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 2736.041748046875, "completions/mean_terminated_length": 1322.77783203125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.5257142857142857, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07311544567346573, "kl": 0.0006850560506184896, "learning_rate": 6.920420666261962e-08, "loss": 0.0119, "num_tokens": 66488395.0, "reward": 0.4166666865348816, "reward_std": 0.08330589532852173, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 2743.08349609375, "completions/mean_terminated_length": 1341.5555419921875, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.528, "frac_reward_zero_std": 0.5, "grad_norm": 0.1263621300458908, "kl": 0.0006767908732096354, "learning_rate": 6.890576474687262e-08, "loss": 0.0659, "num_tokens": 66767253.0, "reward": 0.4166666865348816, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 2718.635498046875, "completions/mean_terminated_length": 1453.871826171875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5302857142857142, "frac_reward_zero_std": 0.375, "grad_norm": 0.11518718302249908, "kl": 0.0006265640258789062, "learning_rate": 6.860664508377e-08, "loss": 0.043, "num_tokens": 67044628.0, "reward": 0.4375, "reward_std": 0.29090970754623413, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.7083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2087.52099609375, "completions/mean_terminated_length": 1373.8154296875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5325714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12469115853309631, "kl": 0.0006173451741536459, "learning_rate": 6.83068622519821e-08, "loss": 0.04, "num_tokens": 67260312.0, "reward": 0.7083333730697632, "reward_std": 0.22112248837947845, "rewards/format_reward/mean": 0.7083333134651184, "rewards/format_reward/std": 0.4569157063961029, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 2580.197998046875, "completions/mean_terminated_length": 1113.1025390625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5348571428571428, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07785586267709732, "kl": 0.0006580352783203125, "learning_rate": 6.800643086250122e-08, "loss": -0.0065, "num_tokens": 67522945.0, "reward": 0.4270833432674408, "reward_std": 0.085263691842556, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 2848.385498046875, "completions/mean_terminated_length": 1818.5250244140625, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.5371428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.10632475465536118, "kl": 0.0005680720011393229, "learning_rate": 6.770536555792944e-08, "loss": 0.0955, "num_tokens": 67812518.0, "reward": 0.4270833432674408, "reward_std": 0.24183645844459534, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2449.1875, "completions/mean_terminated_length": 1050.465087890625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5394285714285715, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13193044066429138, "kl": 0.0006238619486490885, "learning_rate": 6.740368101176495e-08, "loss": 0.035, "num_tokens": 68063360.0, "reward": 0.5104166865348816, "reward_std": 0.20564600825309753, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 2447.40625, "completions/mean_terminated_length": 1262.44677734375, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.5417142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.11480724811553955, "kl": 0.0006078084309895834, "learning_rate": 6.710139192768695e-08, "loss": 0.0312, "num_tokens": 68314673.0, "reward": 0.53125, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3333333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 2243.0625, "completions/mean_terminated_length": 1285.25, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.544, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07083650678396225, "kl": 0.0006602605183919271, "learning_rate": 6.67985130388389e-08, "loss": 0.0415, "num_tokens": 68545403.0, "reward": 0.6145833730697632, "reward_std": 0.09878238290548325, "rewards/format_reward/mean": 0.6145833134651184, "rewards/format_reward/std": 0.4892484247684479, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3442.0, "completions/mean_length": 2674.08349609375, "completions/mean_terminated_length": 1344.2052001953125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.5462857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.12556946277618408, "kl": 0.0006933212280273438, "learning_rate": 6.649505910711058e-08, "loss": 0.0903, "num_tokens": 68817835.0, "reward": 0.4270833432674408, "reward_std": 0.29766905307769775, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 2733.510498046875, "completions/mean_terminated_length": 1182.61767578125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.5485714285714286, "frac_reward_zero_std": 0.8125, "grad_norm": 0.07574421167373657, "kl": 0.0006259282430013021, "learning_rate": 6.619104492241847e-08, "loss": 0.0495, "num_tokens": 69096842.0, "reward": 0.3541666865348816, "reward_std": 0.10074018687009811, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 3001.28125, "completions/mean_terminated_length": 1586.107177734375, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.5508571428571428, "frac_reward_zero_std": 0.3125, "grad_norm": 0.13523098826408386, "kl": 0.0006052652994791666, "learning_rate": 6.588648530198504e-08, "loss": 0.0916, "num_tokens": 69401945.0, "reward": 0.3333333432674408, "reward_std": 0.2961471974849701, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2763.11474609375, "completions/mean_terminated_length": 1454.1351318359375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.5531428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.12402091920375824, "kl": 0.000682830810546875, "learning_rate": 6.558139508961654e-08, "loss": 0.0864, "num_tokens": 69682702.0, "reward": 0.4166666865348816, "reward_std": 0.24859580397605896, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 2568.71875, "completions/mean_terminated_length": 799.2285766601562, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.5554285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09158474206924438, "kl": 0.0006755193074544271, "learning_rate": 6.527578915497951e-08, "loss": 0.0157, "num_tokens": 69945055.0, "reward": 0.3958333432674408, "reward_std": 0.1343369483947754, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3262.0, "completions/mean_length": 2285.42724609375, "completions/mean_terminated_length": 1186.6346435546875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.5577142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09291835129261017, "kl": 0.0005892117818196615, "learning_rate": 6.496968239287604e-08, "loss": 0.028, "num_tokens": 70179798.0, "reward": 0.5729166865348816, "reward_std": 0.14501188695430756, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972511827945709, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 2913.15625, "completions/mean_terminated_length": 1689.8529052734375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.56, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0874626636505127, "kl": 0.0006361007690429688, "learning_rate": 6.466308972251785e-08, "loss": 0.0119, "num_tokens": 70476069.0, "reward": 0.3645833432674408, "reward_std": 0.15461497008800507, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3309.0, "completions/mean_length": 2953.375, "completions/mean_terminated_length": 1803.4117431640625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.5622857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.1325761079788208, "kl": 0.0006734530131022135, "learning_rate": 6.435602608679917e-08, "loss": 0.0452, "num_tokens": 70775853.0, "reward": 0.3750000596046448, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 2759.03125, "completions/mean_terminated_length": 1184.0909423828125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.5645714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.10333006083965302, "kl": 0.0006189346313476562, "learning_rate": 6.40485064515684e-08, "loss": 0.036, "num_tokens": 71056518.0, "reward": 0.375, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3492.0, "completions/mean_length": 2932.635498046875, "completions/mean_terminated_length": 1082.760009765625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5668571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 0.08525275439023972, "kl": 0.0007171630859375, "learning_rate": 6.374054580489874e-08, "loss": 0.0231, "num_tokens": 71354473.0, "reward": 0.2708333432674408, "reward_std": 0.11558075994253159, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3256.0, "completions/mean_length": 2645.21875, "completions/mean_terminated_length": 1273.15380859375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.5691428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.12027940899133682, "kl": 0.0006109873453776041, "learning_rate": 6.343215915635761e-08, "loss": 0.0539, "num_tokens": 71624548.0, "reward": 0.4270833432674408, "reward_std": 0.219600647687912, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2841.36474609375, "completions/mean_terminated_length": 1657.1622314453125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12890154123306274, "kl": 0.0006311734517415365, "learning_rate": 6.31233615362752e-08, "loss": 0.0543, "num_tokens": 71913285.0, "reward": 0.4479166865348816, "reward_std": 0.28763002157211304, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 2801.885498046875, "completions/mean_terminated_length": 994.9310302734375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.5737142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.09506624937057495, "kl": 0.0006351470947265625, "learning_rate": 6.281416799501187e-08, "loss": 0.0062, "num_tokens": 72198220.0, "reward": 0.3541666865348816, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4807705879211426, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 2869.21875, "completions/mean_terminated_length": 1677.9166259765625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.576, "frac_reward_zero_std": 0.5, "grad_norm": 0.11403991281986237, "kl": 0.0006430943806966146, "learning_rate": 6.250459360222459e-08, "loss": 0.0382, "num_tokens": 72489517.0, "reward": 0.3854166865348816, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2824.0625, "completions/mean_terminated_length": 1438.2940673828125, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.5782857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09145499765872955, "kl": 0.0006418228149414062, "learning_rate": 6.219465344613258e-08, "loss": 0.0355, "num_tokens": 72776083.0, "reward": 0.4270833432674408, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3336.0, "completions/mean_length": 2542.729248046875, "completions/mean_terminated_length": 1259.3023681640625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.5805714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.10661627352237701, "kl": 0.0005591710408528646, "learning_rate": 6.188436263278172e-08, "loss": 0.0466, "num_tokens": 73036325.0, "reward": 0.5, "reward_std": 0.2350771129131317, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 3043.23974609375, "completions/mean_terminated_length": 1420.9583740234375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.5828571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.08879940211772919, "kl": 0.0006265640258789062, "learning_rate": 6.157373628530852e-08, "loss": 0.0172, "num_tokens": 73343968.0, "reward": 0.2916666865348816, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 2902.73974609375, "completions/mean_terminated_length": 1540.21875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.5851428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10758217424154282, "kl": 0.0006440480550130209, "learning_rate": 6.126278954320294e-08, "loss": 0.0365, "num_tokens": 73640103.0, "reward": 0.3958333432674408, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2624.229248046875, "completions/mean_terminated_length": 1703.632568359375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.5874285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.12144730240106583, "kl": 0.0005919138590494791, "learning_rate": 6.095153756157051e-08, "loss": 0.0444, "num_tokens": 73907665.0, "reward": 0.5312500596046448, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 2751.89599609375, "completions/mean_terminated_length": 1087.6875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.5897142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09886621683835983, "kl": 0.0006812413533528646, "learning_rate": 6.063999551039369e-08, "loss": 0.0392, "num_tokens": 74187315.0, "reward": 0.3541666865348816, "reward_std": 0.1343369334936142, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4166666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3496.0, "completions/mean_length": 2210.229248046875, "completions/mean_terminated_length": 1310.17236328125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.592, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11603335291147232, "kl": 0.0007600784301757812, "learning_rate": 6.032817857379255e-08, "loss": 0.0531, "num_tokens": 74414479.0, "reward": 0.625, "reward_std": 0.19408512115478516, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.4866642653942108, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3373.0, "completions/mean_length": 2494.25, "completions/mean_terminated_length": 1259.2000732421875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5942857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11546017974615097, "kl": 0.0006551742553710938, "learning_rate": 6.001610194928464e-08, "loss": 0.0249, "num_tokens": 74670565.0, "reward": 0.5208333730697632, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 3008.58349609375, "completions/mean_terminated_length": 2049.5556640625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.5965714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11220080405473709, "kl": 0.0006383260091145834, "learning_rate": 5.970378084704441e-08, "loss": 0.0344, "num_tokens": 74975733.0, "reward": 0.3854166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3460.0, "completions/mean_length": 2530.25, "completions/mean_terminated_length": 1431.6595458984375, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.5988571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.12949800491333008, "kl": 0.0006148020426432291, "learning_rate": 5.9391230489161725e-08, "loss": -0.0257, "num_tokens": 75233955.0, "reward": 0.53125, "reward_std": 0.2379208654165268, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2763.8125, "completions/mean_terminated_length": 1663.5609130859375, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.6011428571428571, "frac_reward_zero_std": 0.625, "grad_norm": 0.09129273146390915, "kl": 0.0005540847778320312, "learning_rate": 5.907846610890012e-08, "loss": 0.0452, "num_tokens": 75515427.0, "reward": 0.4479166865348816, "reward_std": 0.16856959462165833, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3337.0, "completions/mean_length": 2752.916748046875, "completions/mean_terminated_length": 1304.4571533203125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6034285714285714, "frac_reward_zero_std": 0.375, "grad_norm": 0.14278699457645416, "kl": 0.0006055831909179688, "learning_rate": 5.8765502949954205e-08, "loss": 0.0164, "num_tokens": 75795661.0, "reward": 0.40625, "reward_std": 0.2928674817085266, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 2579.5, "completions/mean_terminated_length": 1487.6522216796875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.6057142857142858, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12508751451969147, "kl": 0.0006745656331380209, "learning_rate": 5.845235626570684e-08, "loss": 0.0209, "num_tokens": 76059787.0, "reward": 0.5104166865348816, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025156140327454, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2894.8125, "completions/mean_terminated_length": 1970.2926025390625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.608, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11293845623731613, "kl": 0.0006990432739257812, "learning_rate": 5.813904131848564e-08, "loss": 0.05, "num_tokens": 76353439.0, "reward": 0.4791666865348816, "reward_std": 0.2721535563468933, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 2822.21875, "completions/mean_terminated_length": 1552.5833740234375, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.6102857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.09845055639743805, "kl": 0.000606536865234375, "learning_rate": 5.7825573378819105e-08, "loss": 0.0232, "num_tokens": 76640764.0, "reward": 0.40625, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 2686.6875, "completions/mean_terminated_length": 1430.4500732421875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6125714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.10997117310762405, "kl": 0.0006361007690429688, "learning_rate": 5.7511967724692364e-08, "loss": 0.0231, "num_tokens": 76914838.0, "reward": 0.4479166865348816, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3470.0, "completions/mean_length": 2866.010498046875, "completions/mean_terminated_length": 1721.108154296875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.6148571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.09718002378940582, "kl": 0.0006148020426432291, "learning_rate": 5.719823964080261e-08, "loss": 0.0337, "num_tokens": 77206871.0, "reward": 0.40625, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 2799.197998046875, "completions/mean_terminated_length": 1431.4000244140625, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.6171428571428571, "frac_reward_zero_std": 0.25, "grad_norm": 0.15031415224075317, "kl": 0.0007104873657226562, "learning_rate": 5.688440441781398e-08, "loss": 0.065, "num_tokens": 77490990.0, "reward": 0.4166666865348816, "reward_std": 0.3506578803062439, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2725.28125, "completions/mean_terminated_length": 1573.3414306640625, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.6194285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.10987356305122375, "kl": 0.0006017684936523438, "learning_rate": 5.6570477351612554e-08, "loss": 0.0556, "num_tokens": 77768895.0, "reward": 0.4791666865348816, "reward_std": 0.22635996341705322, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 2799.166748046875, "completions/mean_terminated_length": 1652.1025390625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6217142857142857, "frac_reward_zero_std": 0.375, "grad_norm": 0.12976664304733276, "kl": 0.0006278355916341146, "learning_rate": 5.6256473742560605e-08, "loss": 0.0381, "num_tokens": 78053101.0, "reward": 0.46875, "reward_std": 0.30638620257377625, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 2288.42724609375, "completions/mean_terminated_length": 1237.3018798828125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.624, "frac_reward_zero_std": 0.5, "grad_norm": 0.1335023194551468, "kl": 0.0007028579711914062, "learning_rate": 5.594240889475106e-08, "loss": 0.0614, "num_tokens": 78288264.0, "reward": 0.5729166865348816, "reward_std": 0.2640722990036011, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512423992157, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 2633.885498046875, "completions/mean_terminated_length": 1511.0228271484375, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.6262857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.13292840123176575, "kl": 0.0007330576578776041, "learning_rate": 5.562829811526154e-08, "loss": 0.0492, "num_tokens": 78556615.0, "reward": 0.5208333730697632, "reward_std": 0.24468021094799042, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 2690.84375, "completions/mean_terminated_length": 1202.25, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.6285714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.11393062770366669, "kl": 0.0006831487019856771, "learning_rate": 5.5314156713408264e-08, "loss": 0.0937, "num_tokens": 78830620.0, "reward": 0.40625, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2429.05224609375, "completions/mean_terminated_length": 1274.104248046875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.6308571428571429, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11953698098659515, "kl": 0.0007349650065104166, "learning_rate": 5.5e-08, "loss": 0.049, "num_tokens": 79079043.0, "reward": 0.5520833730697632, "reward_std": 0.20956158638000488, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 2592.02099609375, "completions/mean_terminated_length": 1369.348876953125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.6331428571428571, "frac_reward_zero_std": 0.75, "grad_norm": 0.09889426827430725, "kl": 0.0006306966145833334, "learning_rate": 5.468584328659173e-08, "loss": -0.0167, "num_tokens": 79344689.0, "reward": 0.4895833432674408, "reward_std": 0.12234010547399521, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 2784.625, "completions/mean_terminated_length": 1756.857177734375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.6354285714285715, "frac_reward_zero_std": 0.375, "grad_norm": 0.13919898867607117, "kl": 0.00070953369140625, "learning_rate": 5.4371701884738466e-08, "loss": 0.0669, "num_tokens": 79628219.0, "reward": 0.5104166865348816, "reward_std": 0.30638617277145386, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3101.0, "completions/mean_length": 2894.71875, "completions/mean_terminated_length": 1220.75, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.6377142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08552590757608414, "kl": 0.0006707509358723959, "learning_rate": 5.405759110524893e-08, "loss": 0.0469, "num_tokens": 79921580.0, "reward": 0.3229166865348816, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 2573.40625, "completions/mean_terminated_length": 1327.7906494140625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.64, "frac_reward_zero_std": 0.5, "grad_norm": 0.1325606405735016, "kl": 0.0006453196207682291, "learning_rate": 5.374352625743941e-08, "loss": 0.0791, "num_tokens": 80183969.0, "reward": 0.46875, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 2715.625, "completions/mean_terminated_length": 1645.3023681640625, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.6422857142857142, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12638314068317413, "kl": 0.0006529490152994791, "learning_rate": 5.342952264838747e-08, "loss": 0.0544, "num_tokens": 80461163.0, "reward": 0.4895833432674408, "reward_std": 0.26059263944625854, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 2836.510498046875, "completions/mean_terminated_length": 1833.7803955078125, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.6445714285714286, "frac_reward_zero_std": 0.375, "grad_norm": 0.13566777110099792, "kl": 0.0007076263427734375, "learning_rate": 5.3115595582186024e-08, "loss": 0.0592, "num_tokens": 80749686.0, "reward": 0.4583333432674408, "reward_std": 0.28219255805015564, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 2805.625, "completions/mean_terminated_length": 1761.46337890625, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.6468571428571429, "frac_reward_zero_std": 0.375, "grad_norm": 0.11968239396810532, "kl": 0.0007022221883138021, "learning_rate": 5.28017603591974e-08, "loss": 0.0728, "num_tokens": 81034146.0, "reward": 0.4583333432674408, "reward_std": 0.29962682723999023, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 3168.6875, "completions/mean_terminated_length": 1771.727294921875, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.6491428571428571, "frac_reward_zero_std": 0.6875, "grad_norm": 0.06619197875261307, "kl": 0.0005556742350260416, "learning_rate": 5.248803227530763e-08, "loss": 0.0483, "num_tokens": 81355140.0, "reward": 0.25, "reward_std": 0.15657275915145874, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.435285747051239, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3318.0, "completions/mean_length": 2547.0625, "completions/mean_terminated_length": 1371.86669921875, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.6514285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.11932481825351715, "kl": 0.0006405512491861979, "learning_rate": 5.21744266211809e-08, "loss": 0.0696, "num_tokens": 81615654.0, "reward": 0.4791666865348816, "reward_std": 0.24468021094799042, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2983.0625, "completions/mean_terminated_length": 1594.689697265625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.6537142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11437805742025375, "kl": 0.0006144841512044271, "learning_rate": 5.1860958681514355e-08, "loss": -0.0055, "num_tokens": 81917802.0, "reward": 0.3229166865348816, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 2701.80224609375, "completions/mean_terminated_length": 1412.4359130859375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.656, "frac_reward_zero_std": 0.625, "grad_norm": 0.10379407554864883, "kl": 0.000617822011311849, "learning_rate": 5.1547643734293155e-08, "loss": 0.0689, "num_tokens": 82193189.0, "reward": 0.4270833432674408, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 2696.09375, "completions/mean_terminated_length": 1216.25, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.6582857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.11394447088241577, "kl": 0.0006869633992513021, "learning_rate": 5.1234497050045814e-08, "loss": 0.0097, "num_tokens": 82468148.0, "reward": 0.3958333432674408, "reward_std": 0.1530931293964386, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 2272.510498046875, "completions/mean_terminated_length": 1252.4630126953125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6605714285714286, "frac_reward_zero_std": 0.375, "grad_norm": 0.16180695593357086, "kl": 0.0006192525227864584, "learning_rate": 5.09215338910999e-08, "loss": 0.0779, "num_tokens": 82701429.0, "reward": 0.5833333730697632, "reward_std": 0.3044283986091614, "rewards/format_reward/mean": 0.5833333134651184, "rewards/format_reward/std": 0.4955945909023285, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 2717.0625, "completions/mean_terminated_length": 1450.0, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.6628571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08857004344463348, "kl": 0.0007041295369466146, "learning_rate": 5.060876951083828e-08, "loss": 0.0593, "num_tokens": 82978797.0, "reward": 0.4479166865348816, "reward_std": 0.21916469931602478, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3273.0, "completions/mean_length": 2966.260498046875, "completions/mean_terminated_length": 1539.0689697265625, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.6651428571428571, "frac_reward_zero_std": 0.8125, "grad_norm": 0.06730318069458008, "kl": 0.0006262461344401041, "learning_rate": 5.02962191529556e-08, "loss": 0.0011, "num_tokens": 83279800.0, "reward": 0.34375, "reward_std": 0.0852636992931366, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3522.0, "completions/mean_length": 2521.010498046875, "completions/mean_terminated_length": 1412.7872314453125, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.6674285714285715, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10721820592880249, "kl": 0.0006542205810546875, "learning_rate": 4.9983898050715357e-08, "loss": 0.0605, "num_tokens": 83538101.0, "reward": 0.53125, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2342.34375, "completions/mean_terminated_length": 1246.7647705078125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.6697142857142857, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16255320608615875, "kl": 0.0007025400797526041, "learning_rate": 4.9671821426207447e-08, "loss": 0.0837, "num_tokens": 83779070.0, "reward": 0.53125, "reward_std": 0.3164252042770386, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 3046.42724609375, "completions/mean_terminated_length": 1863.7667236328125, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.672, "frac_reward_zero_std": 0.625, "grad_norm": 0.09268605709075928, "kl": 0.0006469090779622396, "learning_rate": 4.93600044896063e-08, "loss": 0.0276, "num_tokens": 84088195.0, "reward": 0.34375, "reward_std": 0.18208830058574677, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 2709.229248046875, "completions/mean_terminated_length": 1584.5238037109375, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.6742857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.08126060664653778, "kl": 0.0006958643595377604, "learning_rate": 4.9048462438429486e-08, "loss": 0.0407, "num_tokens": 84364775.0, "reward": 0.4583333432674408, "reward_std": 0.11558075994253159, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3373.0, "completions/mean_length": 2988.854248046875, "completions/mean_terminated_length": 1467.9259033203125, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.6765714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.14968274533748627, "kl": 0.0007352828979492188, "learning_rate": 4.873721045679706e-08, "loss": 0.043, "num_tokens": 84667329.0, "reward": 0.3229166865348816, "reward_std": 0.21827873587608337, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 2834.760498046875, "completions/mean_terminated_length": 1468.5, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.6788571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11504609137773514, "kl": 0.0006415049235026041, "learning_rate": 4.842626371469148e-08, "loss": 0.0437, "num_tokens": 84956068.0, "reward": 0.4270833432674408, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3535.0, "completions/mean_length": 2677.36474609375, "completions/mean_terminated_length": 1352.2821044921875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.6811428571428572, "frac_reward_zero_std": 0.375, "grad_norm": 0.14034387469291687, "kl": 0.0006526311238606771, "learning_rate": 4.811563736721829e-08, "loss": 0.0091, "num_tokens": 85229037.0, "reward": 0.4375, "reward_std": 0.2773910164833069, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 2894.11474609375, "completions/mean_terminated_length": 1131.0740966796875, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.6834285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.06885220855474472, "kl": 0.0006777445475260416, "learning_rate": 4.7805346553867434e-08, "loss": 0.0079, "num_tokens": 85523816.0, "reward": 0.2916666567325592, "reward_std": 0.10206207633018494, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3319.0, "completions/mean_length": 2698.15625, "completions/mean_terminated_length": 1403.4615478515625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.6857142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.12070653587579727, "kl": 0.0006907780965169271, "learning_rate": 4.7495406397775394e-08, "loss": 0.0504, "num_tokens": 85798883.0, "reward": 0.4270833432674408, "reward_std": 0.16856960952281952, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3530.0, "completions/mean_length": 3193.9375, "completions/mean_terminated_length": 1800.857177734375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.688, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10006493330001831, "kl": 0.0006546974182128906, "learning_rate": 4.718583200498813e-08, "loss": 0.0494, "num_tokens": 86121749.0, "reward": 0.2708333432674408, "reward_std": 0.2856721878051758, "rewards/format_reward/mean": 0.2708333432674408, "rewards/format_reward/std": 0.44672295451164246, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 2248.5625, "completions/mean_terminated_length": 1070.2353515625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.6902857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1160050481557846, "kl": 0.0007260640462239584, "learning_rate": 4.6876638463724804e-08, "loss": 0.0177, "num_tokens": 86353043.0, "reward": 0.6354166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.6354166865348816, "rewards/format_reward/std": 0.4838397204875946, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 2885.58349609375, "completions/mean_terminated_length": 1552.242431640625, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.6925714285714286, "frac_reward_zero_std": 0.625, "grad_norm": 0.10667046159505844, "kl": 0.0007079442342122396, "learning_rate": 4.656784084364238e-08, "loss": 0.0059, "num_tokens": 86646205.0, "reward": 0.3645833432674408, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2266.83349609375, "completions/mean_terminated_length": 1365.614013671875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6948571428571428, "frac_reward_zero_std": 0.875, "grad_norm": 0.062273427844047546, "kl": 0.0006469090779622396, "learning_rate": 4.6259454195101264e-08, "loss": 0.0107, "num_tokens": 86879223.0, "reward": 0.6145833730697632, "reward_std": 0.05779037997126579, "rewards/format_reward/mean": 0.6145833134651184, "rewards/format_reward/std": 0.4892484247684479, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 2927.39599609375, "completions/mean_terminated_length": 1159.615478515625, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.6971428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08911117911338806, "kl": 0.0006799697875976562, "learning_rate": 4.59514935484316e-08, "loss": 0.0798, "num_tokens": 87177995.0, "reward": 0.28125, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 2868.885498046875, "completions/mean_terminated_length": 1438.65625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.6994285714285714, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09344348311424255, "kl": 0.000652313232421875, "learning_rate": 4.564397391320084e-08, "loss": 0.0681, "num_tokens": 87469650.0, "reward": 0.34375, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3469.0, "completions/mean_length": 3219.0, "completions/mean_terminated_length": 2182.39990234375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "epoch": 0.7017142857142857, "frac_reward_zero_std": 0.375, "grad_norm": 0.11024210602045059, "kl": 0.0006062189737955729, "learning_rate": 4.533691027748215e-08, "loss": 0.0365, "num_tokens": 87795084.0, "reward": 0.2916666865348816, "reward_std": 0.2957112491130829, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3483.0, "completions/mean_length": 2657.55224609375, "completions/mean_terminated_length": 1243.5, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.704, "frac_reward_zero_std": 0.375, "grad_norm": 0.145328551530838, "kl": 0.0007899602254231771, "learning_rate": 4.5030317607123966e-08, "loss": 0.0615, "num_tokens": 88065407.0, "reward": 0.4270833432674408, "reward_std": 0.2841503620147705, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2607.572998046875, "completions/mean_terminated_length": 1404.0697021484375, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.7062857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11172831803560257, "kl": 0.0006608963012695312, "learning_rate": 4.4724210845020494e-08, "loss": 0.0783, "num_tokens": 88331868.0, "reward": 0.4583333432674408, "reward_std": 0.22112248837947845, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3572.0, "completions/mean_length": 2582.479248046875, "completions/mean_terminated_length": 1238.9755859375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.7085714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12015148997306824, "kl": 0.0005992253621419271, "learning_rate": 4.441860491038345e-08, "loss": 0.032, "num_tokens": 88595230.0, "reward": 0.4895833432674408, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3057.0, "completions/mean_length": 2851.479248046875, "completions/mean_terminated_length": 1315.54833984375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.7108571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.0867365375161171, "kl": 0.0006038347880045573, "learning_rate": 4.4113514698014955e-08, "loss": 0.0452, "num_tokens": 88885868.0, "reward": 0.34375, "reward_std": 0.15985244512557983, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 2841.166748046875, "completions/mean_terminated_length": 1355.5, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.7131428571428572, "frac_reward_zero_std": 0.375, "grad_norm": 0.1219867542386055, "kl": 0.0006688435872395834, "learning_rate": 4.380895507758154e-08, "loss": 0.059, "num_tokens": 89174574.0, "reward": 0.375, "reward_std": 0.28219255805015564, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 2503.61474609375, "completions/mean_terminated_length": 1171.9766845703125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7154285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.11462200433015823, "kl": 0.0006494522094726562, "learning_rate": 4.350494089288943e-08, "loss": 0.0293, "num_tokens": 89431583.0, "reward": 0.5104167461395264, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3104.0, "completions/mean_length": 2577.260498046875, "completions/mean_terminated_length": 1226.756103515625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7177142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.09503225237131119, "kl": 0.0007216135660807291, "learning_rate": 4.3201486961161095e-08, "loss": 0.0493, "num_tokens": 89694846.0, "reward": 0.46875, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2438.635498046875, "completions/mean_terminated_length": 1140.5555419921875, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.72, "frac_reward_zero_std": 0.5, "grad_norm": 0.12994083762168884, "kl": 0.0006510416666666666, "learning_rate": 4.289860807231305e-08, "loss": 0.0514, "num_tokens": 89945743.0, "reward": 0.5208333730697632, "reward_std": 0.22635996341705322, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3217.0, "completions/mean_length": 2738.52099609375, "completions/mean_terminated_length": 1604.3414306640625, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.7222857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.10149212926626205, "kl": 0.0007270177205403646, "learning_rate": 4.2596318988235035e-08, "loss": 0.0241, "num_tokens": 90225123.0, "reward": 0.4583333432674408, "reward_std": 0.1801304817199707, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2962.86474609375, "completions/mean_terminated_length": 1596.36669921875, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.7245714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10409843921661377, "kl": 0.0006618499755859375, "learning_rate": 4.2294634442070557e-08, "loss": 0.0227, "num_tokens": 90526292.0, "reward": 0.3645833432674408, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 2703.625, "completions/mean_terminated_length": 1571.71435546875, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.7268571428571429, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11185863614082336, "kl": 0.0006682078043619791, "learning_rate": 4.1993569137498776e-08, "loss": 0.0496, "num_tokens": 90802334.0, "reward": 0.5, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 3009.260498046875, "completions/mean_terminated_length": 1540.4814453125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.7291428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.12368905544281006, "kl": 0.0007559458414713541, "learning_rate": 4.1693137748017915e-08, "loss": 0.0715, "num_tokens": 91107879.0, "reward": 0.3020833432674408, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2733.260498046875, "completions/mean_terminated_length": 1181.9117431640625, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.7314285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1384938657283783, "kl": 0.0007680257161458334, "learning_rate": 4.1393354916230006e-08, "loss": 0.0818, "num_tokens": 91386046.0, "reward": 0.4375, "reward_std": 0.24511614441871643, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 2418.979248046875, "completions/mean_terminated_length": 1301.5101318359375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.7337142857142858, "frac_reward_zero_std": 0.375, "grad_norm": 0.14306552708148956, "kl": 0.0007654825846354166, "learning_rate": 4.1094235253127375e-08, "loss": 0.0949, "num_tokens": 91633496.0, "reward": 0.53125, "reward_std": 0.3111877143383026, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3522.0, "completions/mean_length": 2838.3125, "completions/mean_terminated_length": 1414.727294921875, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.736, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13922828435897827, "kl": 0.0007216135660807291, "learning_rate": 4.079579333738039e-08, "loss": 0.0487, "num_tokens": 91922564.0, "reward": 0.3645833432674408, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 2946.822998046875, "completions/mean_terminated_length": 1610.806396484375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.7382857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12766923010349274, "kl": 0.0007079442342122396, "learning_rate": 4.049804371462701e-08, "loss": 0.0639, "num_tokens": 92221317.0, "reward": 0.3645833432674408, "reward_std": 0.26059263944625854, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3496.0, "completions/mean_length": 2670.479248046875, "completions/mean_terminated_length": 1495.952392578125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.7405714285714285, "frac_reward_zero_std": 0.5, "grad_norm": 0.11286967992782593, "kl": 0.0006717046101888021, "learning_rate": 4.020100089676376e-08, "loss": 0.046, "num_tokens": 92493745.0, "reward": 0.4895833432674408, "reward_std": 0.2466380000114441, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 1620.2083740234375, "completions/mean_terminated_length": 727.5758056640625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7428571428571429, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12034157663583755, "kl": 0.0006987253824869791, "learning_rate": 3.990467936123853e-08, "loss": 0.073, "num_tokens": 92664135.0, "reward": 0.6875, "reward_std": 0.18536798655986786, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.4659455418586731, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 2718.0, "completions/mean_length": 2402.6875, "completions/mean_terminated_length": 1171.1063232421875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.7451428571428571, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10850421339273453, "kl": 0.0007120768229166666, "learning_rate": 3.960909355034491e-08, "loss": 0.0525, "num_tokens": 92910705.0, "reward": 0.5104167461395264, "reward_std": 0.1921273171901703, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 3023.33349609375, "completions/mean_terminated_length": 1431.0399169921875, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.7474285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.1040889248251915, "kl": 0.0006821950276692709, "learning_rate": 3.931425787051832e-08, "loss": 0.0867, "num_tokens": 93217169.0, "reward": 0.3020833730697632, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 2443.197998046875, "completions/mean_terminated_length": 1203.1956787109375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7497142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0960441380739212, "kl": 0.0006745656331380209, "learning_rate": 3.9020186691633835e-08, "loss": 0.0155, "num_tokens": 93467232.0, "reward": 0.46875, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 2851.48974609375, "completions/mean_terminated_length": 1515.7353515625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.752, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12683966755867004, "kl": 0.0006411870320638021, "learning_rate": 3.8726894346305846e-08, "loss": 0.0939, "num_tokens": 93757343.0, "reward": 0.375, "reward_std": 0.26735198497772217, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 2779.979248046875, "completions/mean_terminated_length": 1439.9444580078125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.7542857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10126888751983643, "kl": 0.0005882581075032552, "learning_rate": 3.843439512918949e-08, "loss": 0.0151, "num_tokens": 94039881.0, "reward": 0.3854166865348816, "reward_std": 0.20084446668624878, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.48924845457077026, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2738.9375, "completions/mean_terminated_length": 1697.348876953125, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.7565714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.12018129974603653, "kl": 0.0006844202677408854, "learning_rate": 3.814270329628395e-08, "loss": 0.0451, "num_tokens": 94318815.0, "reward": 0.5416666865348816, "reward_std": 0.25731295347213745, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3470.0, "completions/mean_length": 2548.42724609375, "completions/mean_terminated_length": 1272.023193359375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.7588571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.1146484985947609, "kl": 0.0006783803304036459, "learning_rate": 3.785183306423767e-08, "loss": 0.0466, "num_tokens": 94579376.0, "reward": 0.4479166865348816, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 2462.11474609375, "completions/mean_terminated_length": 891.4750366210938, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.7611428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.1057872623205185, "kl": 0.0007009506225585938, "learning_rate": 3.756179860965537e-08, "loss": 0.0786, "num_tokens": 94830823.0, "reward": 0.4479166865348816, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 2491.541748046875, "completions/mean_terminated_length": 1304.0870361328125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.7634285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.1354176104068756, "kl": 0.0006554921468098959, "learning_rate": 3.72726140684072e-08, "loss": 0.0624, "num_tokens": 95085887.0, "reward": 0.4895833432674408, "reward_std": 0.23703491687774658, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3452.0, "completions/mean_length": 2790.34375, "completions/mean_terminated_length": 1467.5833740234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.7657142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.10556024312973022, "kl": 0.0007130304972330729, "learning_rate": 3.698429353493974e-08, "loss": 0.041, "num_tokens": 95369774.0, "reward": 0.4375, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 2687.52099609375, "completions/mean_terminated_length": 1628.0455322265625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.768, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1153315082192421, "kl": 0.0007044474283854166, "learning_rate": 3.669685106158899e-08, "loss": 0.0709, "num_tokens": 95644972.0, "reward": 0.4895833432674408, "reward_std": 0.2566770315170288, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2809.30224609375, "completions/mean_terminated_length": 1459.1142578125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.7702857142857142, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11351940035820007, "kl": 0.0006767908732096354, "learning_rate": 3.641030065789562e-08, "loss": 0.0491, "num_tokens": 95931273.0, "reward": 0.40625, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 2894.135498046875, "completions/mean_terminated_length": 1514.40625, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.7725714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1154508963227272, "kl": 0.0006427764892578125, "learning_rate": 3.612465628992203e-08, "loss": 0.0637, "num_tokens": 96225760.0, "reward": 0.3958333432674408, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4583333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 2150.77099609375, "completions/mean_terminated_length": 1251.966064453125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7748571428571429, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12287288159132004, "kl": 0.0007944107055664062, "learning_rate": 3.583993187957173e-08, "loss": 0.093, "num_tokens": 96448392.0, "reward": 0.6354166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.6354166865348816, "rewards/format_reward/std": 0.4838397204875946, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 2877.80224609375, "completions/mean_terminated_length": 1647.0, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.7771428571428571, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12801849842071533, "kl": 0.0006701151529947916, "learning_rate": 3.555614130391079e-08, "loss": 0.1074, "num_tokens": 96740495.0, "reward": 0.3958333432674408, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 2869.46875, "completions/mean_terminated_length": 1371.258056640625, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.7794285714285715, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08208385109901428, "kl": 0.0006910959879557291, "learning_rate": 3.527329839449151e-08, "loss": 0.0053, "num_tokens": 97031972.0, "reward": 0.3854166865348816, "reward_std": 0.1498134285211563, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 2997.42724609375, "completions/mean_terminated_length": 1418.1923828125, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.7817142857142857, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10631714016199112, "kl": 0.0007766087849934896, "learning_rate": 3.4991416936678275e-08, "loss": 0.072, "num_tokens": 97336189.0, "reward": 0.3125000298023224, "reward_std": 0.26735198497772217, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3217.0, "completions/mean_length": 2630.229248046875, "completions/mean_terminated_length": 1350.7803955078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.784, "frac_reward_zero_std": 0.5, "grad_norm": 0.11815189570188522, "kl": 0.0007492701212565104, "learning_rate": 3.4710510668975625e-08, "loss": 0.0648, "num_tokens": 97604297.0, "reward": 0.46875, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3423.0, "completions/mean_length": 2484.42724609375, "completions/mean_terminated_length": 1629.2037353515625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.7862857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.14397567510604858, "kl": 0.0006866455078125, "learning_rate": 3.4430593282358775e-08, "loss": 0.0537, "num_tokens": 97858234.0, "reward": 0.59375, "reward_std": 0.292867511510849, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 3289.385498046875, "completions/mean_terminated_length": 2237.1904296875, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.7885714285714286, "frac_reward_zero_std": 0.375, "grad_norm": 0.11778136342763901, "kl": 0.0006767908732096354, "learning_rate": 3.4151678419606236e-08, "loss": 0.0647, "num_tokens": 98190275.0, "reward": 0.2916666865348816, "reward_std": 0.29090970754623413, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 2899.73974609375, "completions/mean_terminated_length": 1808.6217041015625, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.7908571428571428, "frac_reward_zero_std": 0.375, "grad_norm": 0.1161077618598938, "kl": 0.0007422765096028646, "learning_rate": 3.387377967463493e-08, "loss": 0.0585, "num_tokens": 98484130.0, "reward": 0.4166666865348816, "reward_std": 0.29962682723999023, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.4955946207046509, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 2883.55224609375, "completions/mean_terminated_length": 1606.2647705078125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.7931428571428571, "frac_reward_zero_std": 0.375, "grad_norm": 0.12117592990398407, "kl": 0.0007146199544270834, "learning_rate": 3.359691059183761e-08, "loss": 0.072, "num_tokens": 98777283.0, "reward": 0.3854166865348816, "reward_std": 0.30638617277145386, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 2407.635498046875, "completions/mean_terminated_length": 1074.4222412109375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7954285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09884366393089294, "kl": 0.0006717046101888021, "learning_rate": 3.332108466542281e-08, "loss": 0.0584, "num_tokens": 99023944.0, "reward": 0.4687500596046448, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 3037.947998046875, "completions/mean_terminated_length": 1567.8077392578125, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.7977142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09411304444074631, "kl": 0.0007225672403971354, "learning_rate": 3.3046315338757025e-08, "loss": 0.0547, "num_tokens": 99332093.0, "reward": 0.2916666865348816, "reward_std": 0.2163209170103073, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.45691564679145813, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 2614.635498046875, "completions/mean_terminated_length": 1068.8919677734375, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.8, "frac_reward_zero_std": 0.8125, "grad_norm": 0.05556942895054817, "kl": 0.0007216135660807291, "learning_rate": 3.2772616003709616e-08, "loss": 0.0156, "num_tokens": 99598110.0, "reward": 0.3958333432674408, "reward_std": 0.08330589532852173, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 3584.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 3123.5, "completions/mean_terminated_length": 1373.5999755859375, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.8022857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.10989365726709366, "kl": 0.0006758371988932291, "learning_rate": 3.250000000000001e-08, "loss": 0.0664, "num_tokens": 99914568.0, "reward": 0.2291666716337204, "reward_std": 0.23116151988506317, "rewards/format_reward/mean": 0.2291666716337204, "rewards/format_reward/std": 0.42250296473503113, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 2720.541748046875, "completions/mean_terminated_length": 910.0645141601562, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.8045714285714286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09264727681875229, "kl": 0.0007136662801106771, "learning_rate": 3.222848061454764e-08, "loss": 0.0383, "num_tokens": 100191934.0, "reward": 0.3541666865348816, "reward_std": 0.1343369334936142, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 2522.14599609375, "completions/mean_terminated_length": 1097.707275390625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.8068571428571428, "frac_reward_zero_std": 0.625, "grad_norm": 0.1318623125553131, "kl": 0.0012060801188151042, "learning_rate": 3.195807108082429e-08, "loss": 0.0476, "num_tokens": 100449906.0, "reward": 0.4583333432674408, "reward_std": 0.18404607474803925, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3136.0, "completions/mean_length": 2691.5, "completions/mean_terminated_length": 1329.26318359375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.8091428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.10259856283664703, "kl": 0.000599066416422526, "learning_rate": 3.168878457820915e-08, "loss": 0.0466, "num_tokens": 100723950.0, "reward": 0.4270833432674408, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 2866.635498046875, "completions/mean_terminated_length": 1771.7105712890625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.8114285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12641818821430206, "kl": 0.0007648468017578125, "learning_rate": 3.1420634231346446e-08, "loss": 0.056, "num_tokens": 101015539.0, "reward": 0.4270833432674408, "reward_std": 0.2828284502029419, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3431.0, "completions/mean_length": 2200.28125, "completions/mean_terminated_length": 1406.34423828125, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.8137142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.13699595630168915, "kl": 0.0007009506225585938, "learning_rate": 3.1153633109505784e-08, "loss": 0.0562, "num_tokens": 101242588.0, "reward": 0.65625, "reward_std": 0.2466380000114441, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4774521291255951, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2807.86474609375, "completions/mean_terminated_length": 1851.2325439453125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.816, "frac_reward_zero_std": 0.375, "grad_norm": 0.13702301681041718, "kl": 0.0007762908935546875, "learning_rate": 3.088779422594514e-08, "loss": 0.0514, "num_tokens": 101528007.0, "reward": 0.4791666865348816, "reward_std": 0.29962682723999023, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2373.041748046875, "completions/mean_terminated_length": 1304.549072265625, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.8182857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12015900760889053, "kl": 0.0007022221883138021, "learning_rate": 3.062313053727671e-08, "loss": 0.0235, "num_tokens": 101771329.0, "reward": 0.5729166865348816, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972511827945709, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 2700.229248046875, "completions/mean_terminated_length": 1351.3157958984375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8205714285714286, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12044034153223038, "kl": 0.0006585121154785156, "learning_rate": 3.035965494283524e-08, "loss": 0.0362, "num_tokens": 102045719.0, "reward": 0.4895833432674408, "reward_std": 0.20956158638000488, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2882.760498046875, "completions/mean_terminated_length": 1714.02783203125, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.8228571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.11738189309835434, "kl": 0.0007073084513346354, "learning_rate": 3.0097380284049524e-08, "loss": 0.0712, "num_tokens": 102339192.0, "reward": 0.40625, "reward_std": 0.23311933875083923, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 3011.21875, "completions/mean_terminated_length": 1469.115478515625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.8251428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 0.08504962176084518, "kl": 0.0006786982218424479, "learning_rate": 2.983631934381639e-08, "loss": 0.0216, "num_tokens": 102644433.0, "reward": 0.3020833432674408, "reward_std": 0.16856959462165833, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3492.0, "completions/mean_length": 2842.625, "completions/mean_terminated_length": 1711.0526123046875, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.8274285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.10463716834783554, "kl": 0.0006786982218424479, "learning_rate": 2.957648484587779e-08, "loss": 0.0259, "num_tokens": 102933069.0, "reward": 0.4479166865348816, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 2837.0, "completions/mean_length": 2513.48974609375, "completions/mean_terminated_length": 1014.7750244140625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.8297142857142857, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10111965984106064, "kl": 0.000690460205078125, "learning_rate": 2.931788945420058e-08, "loss": 0.056, "num_tokens": 103190318.0, "reward": 0.4375, "reward_std": 0.1430540829896927, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 2549.52099609375, "completions/mean_terminated_length": 1274.465087890625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.832, "frac_reward_zero_std": 0.5, "grad_norm": 0.12226539850234985, "kl": 0.0006707509358723959, "learning_rate": 2.906054577235931e-08, "loss": 0.036, "num_tokens": 103451272.0, "reward": 0.4895833432674408, "reward_std": 0.24183647334575653, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0833333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 2429.125, "completions/mean_terminated_length": 1366.6400146484375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8342857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.16473808884620667, "kl": 0.0008948644002278646, "learning_rate": 2.8804466342921984e-08, "loss": 0.0599, "num_tokens": 103700398.0, "reward": 0.5416666865348816, "reward_std": 0.2773910164833069, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3532.0, "completions/mean_length": 2656.1875, "completions/mean_terminated_length": 1357.25, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.8365714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1141432523727417, "kl": 0.0007975896199544271, "learning_rate": 2.8549663646838718e-08, "loss": 0.0871, "num_tokens": 103971184.0, "reward": 0.4375, "reward_std": 0.26343637704849243, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2758.75, "completions/mean_terminated_length": 1651.707275390625, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.8388571428571429, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12193860113620758, "kl": 0.0007162094116210938, "learning_rate": 2.8296150102833438e-08, "loss": 0.0744, "num_tokens": 104252638.0, "reward": 0.4479166865348816, "reward_std": 0.29634714126586914, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 2476.61474609375, "completions/mean_terminated_length": 1221.5777587890625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8411428571428572, "frac_reward_zero_std": 0.4375, "grad_norm": 0.15184471011161804, "kl": 0.0006783803304036459, "learning_rate": 2.8043938066798646e-08, "loss": 0.0577, "num_tokens": 104505573.0, "reward": 0.5, "reward_std": 0.279984712600708, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 2976.09375, "completions/mean_terminated_length": 1571.6207275390625, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.8434285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.09829284250736237, "kl": 0.0007152557373046875, "learning_rate": 2.7793039831193132e-08, "loss": 0.051, "num_tokens": 104807070.0, "reward": 0.3020833432674408, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 2551.53125, "completions/mean_terminated_length": 1106.0750732421875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.8457142857142858, "frac_reward_zero_std": 0.625, "grad_norm": 0.11349817365407944, "kl": 0.0007206598917643229, "learning_rate": 2.7543467624442956e-08, "loss": 0.032, "num_tokens": 105067743.0, "reward": 0.4479166865348816, "reward_std": 0.16856960952281952, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 2872.08349609375, "completions/mean_terminated_length": 1736.8648681640625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.848, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09595135599374771, "kl": 0.0007193883260091146, "learning_rate": 2.729523361034538e-08, "loss": 0.0229, "num_tokens": 105359933.0, "reward": 0.4791666865348816, "reward_std": 0.22112248837947845, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 2185.729248046875, "completions/mean_terminated_length": 1051.2830810546875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8502857142857143, "frac_reward_zero_std": 0.8125, "grad_norm": 0.10922639071941376, "kl": 0.0007880528767903646, "learning_rate": 2.7048349887476035e-08, "loss": 0.0458, "num_tokens": 105585075.0, "reward": 0.5625, "reward_std": 0.09202304482460022, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4986824691295624, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3152.0, "completions/mean_length": 2533.77099609375, "completions/mean_terminated_length": 1239.3023681640625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8525714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.11121516674757004, "kl": 0.0008538564046223959, "learning_rate": 2.6802828488599292e-08, "loss": 0.0657, "num_tokens": 105843623.0, "reward": 0.4583333432674408, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 3584.0, "completions/max_terminated_length": 3351.0, "completions/mean_length": 3105.53125, "completions/mean_terminated_length": 1496.1363525390625, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.8548571428571429, "frac_reward_zero_std": 0.6875, "grad_norm": 0.08799436688423157, "kl": 0.000644683837890625, "learning_rate": 2.655868138008171e-08, "loss": 0.0094, "num_tokens": 106158698.0, "reward": 0.2604166865348816, "reward_std": 0.13629473745822906, "rewards/format_reward/mean": 0.2604166567325592, "rewards/format_reward/std": 0.4411657154560089, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3215.0, "completions/mean_length": 2966.28125, "completions/mean_terminated_length": 1607.300048828125, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.8571428571428571, "frac_reward_zero_std": 0.375, "grad_norm": 0.12971101701259613, "kl": 0.0007556279500325521, "learning_rate": 2.631592046130896e-08, "loss": 0.0489, "num_tokens": 106459577.0, "reward": 0.3645833432674408, "reward_std": 0.2841503620147705, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3452.0, "completions/mean_length": 2502.979248046875, "completions/mean_terminated_length": 1375.9573974609375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.8594285714285714, "frac_reward_zero_std": 0.375, "grad_norm": 0.13539092242717743, "kl": 0.0008398691813151041, "learning_rate": 2.6074557564105726e-08, "loss": 0.0468, "num_tokens": 106715913.0, "reward": 0.53125, "reward_std": 0.2841503620147705, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2986.354248046875, "completions/mean_terminated_length": 1990.27783203125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.8617142857142858, "frac_reward_zero_std": 0.3125, "grad_norm": 0.11568745225667953, "kl": 0.0006910959879557291, "learning_rate": 2.583460445215911e-08, "loss": 0.0814, "num_tokens": 107019031.0, "reward": 0.40625, "reward_std": 0.2941893935203552, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 2766.322998046875, "completions/mean_terminated_length": 1051.838623046875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.864, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0754118263721466, "kl": 0.00070953369140625, "learning_rate": 2.559607282044525e-08, "loss": -0.0142, "num_tokens": 107300384.0, "reward": 0.3541666865348816, "reward_std": 0.08330589532852173, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4807705879211426, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 2744.45849609375, "completions/mean_terminated_length": 1618.243896484375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.8662857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.11120415478944778, "kl": 0.0007082621256510416, "learning_rate": 2.5358974294659374e-08, "loss": 0.0461, "num_tokens": 107579446.0, "reward": 0.4791666865348816, "reward_std": 0.2350771278142929, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3386.0, "completions/mean_length": 2754.791748046875, "completions/mean_terminated_length": 1542.871826171875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8685714285714285, "frac_reward_zero_std": 0.6875, "grad_norm": 0.07520893216133118, "kl": 0.0006279945373535156, "learning_rate": 2.5123320430649132e-08, "loss": 0.0355, "num_tokens": 107860820.0, "reward": 0.4166666865348816, "reward_std": 0.1343369334936142, "rewards/format_reward/mean": 0.4166666567325592, "rewards/format_reward/std": 0.49559465050697327, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2954.354248046875, "completions/mean_terminated_length": 1695.0625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8708571428571429, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12509968876838684, "kl": 0.0007979075113932291, "learning_rate": 2.4889122713851394e-08, "loss": 0.0302, "num_tokens": 108159756.0, "reward": 0.3854166865348816, "reward_std": 0.2780269384384155, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 2414.58349609375, "completions/mean_terminated_length": 1292.89794921875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.8731428571428571, "frac_reward_zero_std": 0.5, "grad_norm": 0.14563868939876556, "kl": 0.0008230209350585938, "learning_rate": 2.465639255873246e-08, "loss": 0.1217, "num_tokens": 108407714.0, "reward": 0.5208333730697632, "reward_std": 0.2437942624092102, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 3013.78125, "completions/mean_terminated_length": 1873.34375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.8754285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.08111817389726639, "kl": 0.0006319681803385416, "learning_rate": 2.4425141308231766e-08, "loss": 0.0282, "num_tokens": 108712769.0, "reward": 0.3645833432674408, "reward_std": 0.16856959462165833, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 2961.375, "completions/mean_terminated_length": 1285.0770263671875, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.8777142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.0816965252161026, "kl": 0.0006729761759440104, "learning_rate": 2.4195380233209007e-08, "loss": 0.0401, "num_tokens": 109013531.0, "reward": 0.3229166865348816, "reward_std": 0.17337113618850708, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0833333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 2472.34375, "completions/mean_terminated_length": 1449.6199951171875, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.88, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1090082898736, "kl": 0.0006866455078125, "learning_rate": 2.396712053189486e-08, "loss": 0.0471, "num_tokens": 109266878.0, "reward": 0.5520833730697632, "reward_std": 0.20564600825309753, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2502.010498046875, "completions/mean_terminated_length": 1373.9786376953125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.8822857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13692055642604828, "kl": 0.0008602142333984375, "learning_rate": 2.3740373329345117e-08, "loss": 0.034, "num_tokens": 109522185.0, "reward": 0.53125, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 2516.27099609375, "completions/mean_terminated_length": 1306.1778564453125, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.8845714285714286, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10094159841537476, "kl": 0.0007845560709635416, "learning_rate": 2.3515149676898552e-08, "loss": 0.0066, "num_tokens": 109779251.0, "reward": 0.4895833432674408, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2394.791748046875, "completions/mean_terminated_length": 1205.5833740234375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.8868571428571429, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13009528815746307, "kl": 0.000751495361328125, "learning_rate": 2.3291460551638238e-08, "loss": 0.0254, "num_tokens": 110024655.0, "reward": 0.5208333730697632, "reward_std": 0.24511615931987762, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3308.0, "completions/mean_length": 2202.5625, "completions/mean_terminated_length": 1257.368408203125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8891428571428571, "frac_reward_zero_std": 0.625, "grad_norm": 0.12453688681125641, "kl": 0.0007718404134114584, "learning_rate": 2.306931685585657e-08, "loss": 0.0292, "num_tokens": 110250729.0, "reward": 0.6041666865348816, "reward_std": 0.1888476312160492, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.4915960133075714, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 3042.70849609375, "completions/mean_terminated_length": 1505.43994140625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.8914285714285715, "frac_reward_zero_std": 0.75, "grad_norm": 0.06971344351768494, "kl": 0.0007470448811848959, "learning_rate": 2.284872941652386e-08, "loss": -0.0, "num_tokens": 110558957.0, "reward": 0.3125, "reward_std": 0.11558075994253159, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3572.0, "completions/mean_length": 2606.4375, "completions/mean_terminated_length": 1237.8499755859375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.8937142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.0966411605477333, "kl": 0.0006777445475260416, "learning_rate": 2.2629708984760707e-08, "loss": 0.0423, "num_tokens": 110824961.0, "reward": 0.4375, "reward_std": 0.18404607474803925, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 2658.072998046875, "completions/mean_terminated_length": 890.3939819335938, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.896, "frac_reward_zero_std": 0.75, "grad_norm": 0.06006186455488205, "kl": 0.0006895065307617188, "learning_rate": 2.2412266235313973e-08, "loss": 0.0232, "num_tokens": 111095826.0, "reward": 0.3541666865348816, "reward_std": 0.10206206887960434, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 2778.03125, "completions/mean_terminated_length": 1239.3636474609375, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.8982857142857142, "frac_reward_zero_std": 0.625, "grad_norm": 0.13048560917377472, "kl": 0.0007575352986653646, "learning_rate": 2.2196411766036488e-08, "loss": -0.0169, "num_tokens": 111378747.0, "reward": 0.3958333432674408, "reward_std": 0.18404607474803925, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3331.0, "completions/mean_length": 2708.5625, "completions/mean_terminated_length": 1182.800048828125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.9005714285714286, "frac_reward_zero_std": 0.8125, "grad_norm": 0.09630251675844193, "kl": 0.0006806055704752604, "learning_rate": 2.1982156097370558e-08, "loss": 0.0195, "num_tokens": 111653961.0, "reward": 0.40625, "reward_std": 0.085263691842556, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 2641.98974609375, "completions/mean_terminated_length": 1323.175048828125, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.9028571428571428, "frac_reward_zero_std": 0.625, "grad_norm": 0.10200249403715134, "kl": 0.0007108052571614584, "learning_rate": 2.1769509671835225e-08, "loss": 0.0547, "num_tokens": 111923726.0, "reward": 0.4375, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 2701.8125, "completions/mean_terminated_length": 1231.5, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.9051428571428571, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10444901883602142, "kl": 0.000721136728922526, "learning_rate": 2.1558482853517254e-08, "loss": 0.0883, "num_tokens": 112199078.0, "reward": 0.3958333432674408, "reward_std": 0.21151940524578094, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 3058.40625, "completions/mean_terminated_length": 1715.2222900390625, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.9074285714285715, "frac_reward_zero_std": 0.5, "grad_norm": 0.13178245723247528, "kl": 0.0007457733154296875, "learning_rate": 2.1349085927566073e-08, "loss": 0.0902, "num_tokens": 112508141.0, "reward": 0.2916666865348816, "reward_std": 0.21764282882213593, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.45691564679145813, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 2329.71875, "completions/mean_terminated_length": 1268.4039306640625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.9097142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.12231790274381638, "kl": 0.0008589426676432291, "learning_rate": 2.1141329099692406e-08, "loss": 0.0171, "num_tokens": 112747184.0, "reward": 0.5625, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4986824691295624, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 2524.78125, "completions/mean_terminated_length": 1272.977294921875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.912, "frac_reward_zero_std": 0.6875, "grad_norm": 0.12483935058116913, "kl": 0.0007489522298177084, "learning_rate": 2.093522249567097e-08, "loss": 0.0385, "num_tokens": 113004785.0, "reward": 0.4895833432674408, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 2349.260498046875, "completions/mean_terminated_length": 1164.9183349609375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.9142857142857143, "frac_reward_zero_std": 0.625, "grad_norm": 0.12561240792274475, "kl": 0.0006993611653645834, "learning_rate": 2.0730776160846852e-08, "loss": 0.0626, "num_tokens": 113245656.0, "reward": 0.5104166865348816, "reward_std": 0.16856959462165833, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2966.760498046875, "completions/mean_terminated_length": 1732.28125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.9165714285714286, "frac_reward_zero_std": 0.3125, "grad_norm": 0.12740644812583923, "kl": 0.000766754150390625, "learning_rate": 2.0528000059645996e-08, "loss": 0.0233, "num_tokens": 113547043.0, "reward": 0.3854166865348816, "reward_std": 0.3203408420085907, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 2363.92724609375, "completions/mean_terminated_length": 1091.9361572265625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.9188571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11828096956014633, "kl": 0.0008023579915364584, "learning_rate": 2.032690407508949e-08, "loss": 0.0352, "num_tokens": 113790360.0, "reward": 0.5416666865348816, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 2552.23974609375, "completions/mean_terminated_length": 1562.591796875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9211428571428572, "frac_reward_zero_std": 0.5, "grad_norm": 0.11285211890935898, "kl": 0.0007266998291015625, "learning_rate": 2.0127498008311923e-08, "loss": 0.0397, "num_tokens": 114051569.0, "reward": 0.5208333730697632, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3406.0, "completions/mean_length": 2857.42724609375, "completions/mean_terminated_length": 1333.9676513671875, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.9234285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.10632804781198502, "kl": 0.0008452733357747396, "learning_rate": 1.9929791578083655e-08, "loss": 0.064, "num_tokens": 114341740.0, "reward": 0.3541666865348816, "reward_std": 0.22635996341705322, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.48077061772346497, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 3149.375, "completions/mean_terminated_length": 2038.6666259765625, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.9257142857142857, "frac_reward_zero_std": 0.3125, "grad_norm": 0.11690174788236618, "kl": 0.0006685256958007812, "learning_rate": 1.9733794420337212e-08, "loss": 0.0665, "num_tokens": 114660916.0, "reward": 0.34375, "reward_std": 0.34346264600753784, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4774521291255951, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 2787.822998046875, "completions/mean_terminated_length": 1400.199951171875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.928, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10178104043006897, "kl": 0.0007681846618652344, "learning_rate": 1.9539516087697516e-08, "loss": 0.03, "num_tokens": 114944339.0, "reward": 0.3854166865348816, "reward_std": 0.12757760286331177, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 3340.135498046875, "completions/mean_terminated_length": 2023.2667236328125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.9302857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10888979583978653, "kl": 0.0007346471150716146, "learning_rate": 1.9346966049016424e-08, "loss": 0.064, "num_tokens": 115281648.0, "reward": 0.2083333432674408, "reward_std": 0.24511615931987762, "rewards/format_reward/mean": 0.2083333283662796, "rewards/format_reward/std": 0.40824830532073975, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3355.0, "completions/mean_length": 3007.58349609375, "completions/mean_terminated_length": 1675.862060546875, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.9325714285714286, "frac_reward_zero_std": 0.5, "grad_norm": 0.11224064230918884, "kl": 0.0006457964579264323, "learning_rate": 1.9156153688911168e-08, "loss": 0.0819, "num_tokens": 115586336.0, "reward": 0.3125, "reward_std": 0.243794247508049, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 2656.28125, "completions/mean_terminated_length": 1357.4749755859375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.9348571428571428, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13253681361675262, "kl": 0.0007356007893880209, "learning_rate": 1.8967088307307e-08, "loss": 0.0449, "num_tokens": 115858313.0, "reward": 0.4479166865348816, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 2570.791748046875, "completions/mean_terminated_length": 1598.938720703125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.9371428571428572, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11055982112884521, "kl": 0.0006931622823079427, "learning_rate": 1.877977911898387e-08, "loss": 0.0235, "num_tokens": 116121279.0, "reward": 0.5520833730697632, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3325.0, "completions/mean_length": 2561.8125, "completions/mean_terminated_length": 1450.7391357421875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9394285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.12361273914575577, "kl": 0.0007197062174479166, "learning_rate": 1.8594235253127372e-08, "loss": 0.0484, "num_tokens": 116382873.0, "reward": 0.5416667461395264, "reward_std": 0.25731295347213745, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 2813.83349609375, "completions/mean_terminated_length": 1585.729736328125, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.9417142857142857, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11615951359272003, "kl": 0.000743865966796875, "learning_rate": 1.841046575288376e-08, "loss": 0.0666, "num_tokens": 116669063.0, "reward": 0.4270833432674408, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3359.0, "completions/mean_length": 2618.96875, "completions/mean_terminated_length": 1570.021728515625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.944, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1295614242553711, "kl": 0.0006841023763020834, "learning_rate": 1.822847957491922e-08, "loss": 0.0537, "num_tokens": 116936600.0, "reward": 0.5520833730697632, "reward_std": 0.3299438953399658, "rewards/format_reward/mean": 0.5520833134651184, "rewards/format_reward/std": 0.4998903274536133, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3509.0, "completions/mean_length": 2781.52099609375, "completions/mean_terminated_length": 1501.8919677734375, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.9462857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10412473976612091, "kl": 0.0008223851521809896, "learning_rate": 1.804828558898332e-08, "loss": 0.0331, "num_tokens": 117219448.0, "reward": 0.3958333432674408, "reward_std": 0.1343369334936142, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2655.20849609375, "completions/mean_terminated_length": 1602.5777587890625, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.9485714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12450996041297913, "kl": 0.0008185704549153646, "learning_rate": 1.786989257747672e-08, "loss": 0.0722, "num_tokens": 117490158.0, "reward": 0.4895833432674408, "reward_std": 0.2566770315170288, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 2702.28125, "completions/mean_terminated_length": 1568.6429443359375, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.9508571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.12505842745304108, "kl": 0.0007251103719075521, "learning_rate": 1.7693309235023126e-08, "loss": 0.0789, "num_tokens": 117765849.0, "reward": 0.46875, "reward_std": 0.22440215945243835, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3350.0, "completions/mean_length": 2607.354248046875, "completions/mean_terminated_length": 1351.666748046875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.9531428571428572, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13574348390102386, "kl": 0.0007998148600260416, "learning_rate": 1.7518544168045523e-08, "loss": 0.0835, "num_tokens": 118032127.0, "reward": 0.4479166865348816, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2550.36474609375, "completions/mean_terminated_length": 1221.40478515625, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.9554285714285714, "frac_reward_zero_std": 0.75, "grad_norm": 0.08835748583078384, "kl": 0.0007673899332682291, "learning_rate": 1.7345605894346727e-08, "loss": 0.0245, "num_tokens": 118292700.0, "reward": 0.4583333432674408, "reward_std": 0.11077921837568283, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2624.25, "completions/mean_terminated_length": 1221.5384521484375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.9577142857142857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.15390397608280182, "kl": 0.00080108642578125, "learning_rate": 1.717450284269421e-08, "loss": 0.0616, "num_tokens": 118561356.0, "reward": 0.4375, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3364.0, "completions/mean_length": 2631.77099609375, "completions/mean_terminated_length": 1407.4761962890625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.96, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11759354919195175, "kl": 0.0007991790771484375, "learning_rate": 1.7005243352409332e-08, "loss": 0.0723, "num_tokens": 118830308.0, "reward": 0.4479166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3442.0, "completions/mean_length": 2744.541748046875, "completions/mean_terminated_length": 1405.9459228515625, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.9622857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.12916982173919678, "kl": 0.0007009506225585938, "learning_rate": 1.6837835672960833e-08, "loss": 0.1051, "num_tokens": 119109948.0, "reward": 0.40625, "reward_std": 0.30638620257377625, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2610.416748046875, "completions/mean_terminated_length": 1410.4185791015625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9645714285714285, "frac_reward_zero_std": 0.625, "grad_norm": 0.09457219392061234, "kl": 0.0007375081380208334, "learning_rate": 1.6672287963562854e-08, "loss": 0.0157, "num_tokens": 119376136.0, "reward": 0.5, "reward_std": 0.18404607474803925, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 2588.291748046875, "completions/mean_terminated_length": 1411.5455322265625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.9668571428571429, "frac_reward_zero_std": 0.625, "grad_norm": 0.09154385328292847, "kl": 0.0007031758626302084, "learning_rate": 1.6508608292777203e-08, "loss": 0.0326, "num_tokens": 119640710.0, "reward": 0.4791666865348816, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.16666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 2996.447998046875, "completions/mean_terminated_length": 1569.5357666015625, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "epoch": 0.9691428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08817347139120102, "kl": 0.0007321039835611979, "learning_rate": 1.6346804638120097e-08, "loss": 0.0139, "num_tokens": 119945019.0, "reward": 0.3333333134651184, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 2831.479248046875, "completions/mean_terminated_length": 1821.9998779296875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.9714285714285714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11002353578805923, "kl": 0.0006011327107747396, "learning_rate": 1.6186884885673413e-08, "loss": 0.0259, "num_tokens": 120232825.0, "reward": 0.4479166865348816, "reward_std": 0.26930975914001465, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 2616.33349609375, "completions/mean_terminated_length": 1519.64453125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9737142857142858, "frac_reward_zero_std": 0.375, "grad_norm": 0.12342694401741028, "kl": 0.0007171630859375, "learning_rate": 1.602885682970026e-08, "loss": 0.06, "num_tokens": 120499863.0, "reward": 0.5416666865348816, "reward_std": 0.28219255805015564, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5008764266967773, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 2501.697998046875, "completions/mean_terminated_length": 1167.6976318359375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.976, "frac_reward_zero_std": 0.625, "grad_norm": 0.09393564611673355, "kl": 0.000789642333984375, "learning_rate": 1.5872728172265146e-08, "loss": 0.0455, "num_tokens": 120756130.0, "reward": 0.46875, "reward_std": 0.16856960952281952, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2545.83349609375, "completions/mean_terminated_length": 1369.2445068359375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.9782857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09636654704809189, "kl": 0.0007244745890299479, "learning_rate": 1.571850652285857e-08, "loss": 0.0211, "num_tokens": 121016226.0, "reward": 0.5104166865348816, "reward_std": 0.1585305631160736, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3367.0, "completions/mean_length": 2642.48974609375, "completions/mean_terminated_length": 1575.4444580078125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.9805714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11780349910259247, "kl": 0.0007855097452799479, "learning_rate": 1.5566199398026148e-08, "loss": 0.0286, "num_tokens": 121284857.0, "reward": 0.5, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7916666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 2600.59375, "completions/mean_terminated_length": 1388.4884033203125, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.9828571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.14266683161258698, "kl": 0.000858306884765625, "learning_rate": 1.5415814221002266e-08, "loss": 0.0254, "num_tokens": 121550366.0, "reward": 0.46875, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 3056.322998046875, "completions/mean_terminated_length": 1837.2069091796875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.9851428571428571, "frac_reward_zero_std": 0.375, "grad_norm": 0.13045914471149445, "kl": 0.0007486343383789062, "learning_rate": 1.5267358321348288e-08, "loss": 0.0534, "num_tokens": 121859649.0, "reward": 0.3229166865348816, "reward_std": 0.28806596994400024, "rewards/format_reward/mean": 0.3229166567325592, "rewards/format_reward/std": 0.4700457453727722, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3398.0, "completions/mean_length": 2815.17724609375, "completions/mean_terminated_length": 1347.42431640625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.9874285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.1318049281835556, "kl": 0.0009895960489908855, "learning_rate": 1.5120838934595337e-08, "loss": 0.0314, "num_tokens": 122145320.0, "reward": 0.3541666865348816, "reward_std": 0.17532894015312195, "rewards/format_reward/mean": 0.3541666567325592, "rewards/format_reward/std": 0.4807705879211426, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 2879.77099609375, "completions/mean_terminated_length": 1471.3125, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.9897142857142858, "frac_reward_zero_std": 0.375, "grad_norm": 0.12791548669338226, "kl": 0.0007193883260091146, "learning_rate": 1.4976263201891612e-08, "loss": 0.1038, "num_tokens": 122437462.0, "reward": 0.3854166865348816, "reward_std": 0.28806596994400024, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 2634.229248046875, "completions/mean_terminated_length": 1413.09521484375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.992, "frac_reward_zero_std": 0.375, "grad_norm": 0.13095231354236603, "kl": 0.0008246103922526041, "learning_rate": 1.483363816965435e-08, "loss": 0.0711, "num_tokens": 122705498.0, "reward": 0.5, "reward_std": 0.29962682723999023, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2763.479248046875, "completions/mean_terminated_length": 1511.105224609375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.9942857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10234726965427399, "kl": 0.0006532669067382812, "learning_rate": 1.469297078922642e-08, "loss": 0.0564, "num_tokens": 122986410.0, "reward": 0.4375, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.33333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 2980.33349609375, "completions/mean_terminated_length": 1773.0, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.9965714285714286, "frac_reward_zero_std": 0.375, "grad_norm": 0.11328182369470596, "kl": 0.0007276535034179688, "learning_rate": 1.4554267916537493e-08, "loss": 0.0574, "num_tokens": 123289448.0, "reward": 0.375, "reward_std": 0.3044283986091614, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.4866642653942108, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.4583333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3196.0, "completions/mean_length": 2099.27099609375, "completions/mean_terminated_length": 1168.16943359375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.9988571428571429, "frac_reward_zero_std": 0.5, "grad_norm": 0.130823016166687, "kl": 0.0008134841918945312, "learning_rate": 1.4417536311769885e-08, "loss": 0.0887, "num_tokens": 123506164.0, "reward": 0.6145833730697632, "reward_std": 0.22831778228282928, "rewards/format_reward/mean": 0.6145833134651184, "rewards/format_reward/std": 0.4892484247684479, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2794.90625, "completions/mean_terminated_length": 1479.75, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 1.0022857142857142, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10344143956899643, "kl": 0.000720977783203125, "learning_rate": 1.4282782639029129e-08, "loss": 0.0621, "num_tokens": 123790399.0, "reward": 0.4583333432674408, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2578.5625, "completions/mean_terminated_length": 1439.066650390625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 1.0045714285714287, "frac_reward_zero_std": 0.5, "grad_norm": 0.10817626118659973, "kl": 0.00086212158203125, "learning_rate": 1.4150013466019115e-08, "loss": 0.0487, "num_tokens": 124053823.0, "reward": 0.5104166865348816, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6666666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3413.0, "completions/mean_length": 2653.84375, "completions/mean_terminated_length": 1351.625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "epoch": 1.006857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.14167174696922302, "kl": 0.0007429122924804688, "learning_rate": 1.4019235263722034e-08, "loss": 0.0554, "num_tokens": 124324456.0, "reward": 0.4791666865348816, "reward_std": 0.2996268570423126, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 2666.854248046875, "completions/mean_terminated_length": 1627.4222412109375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 1.0091428571428571, "frac_reward_zero_std": 0.3125, "grad_norm": 0.1454349160194397, "kl": 0.0007111231486002604, "learning_rate": 1.3890454406082957e-08, "loss": 0.0507, "num_tokens": 124595780.0, "reward": 0.5208333730697632, "reward_std": 0.34542036056518555, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.08333333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 3058.17724609375, "completions/mean_terminated_length": 1642.5001220703125, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 1.0114285714285713, "frac_reward_zero_std": 0.625, "grad_norm": 0.0833083987236023, "kl": 0.0007244745890299479, "learning_rate": 1.3763677169699217e-08, "loss": 0.0255, "num_tokens": 124905115.0, "reward": 0.3020833432674408, "reward_std": 0.17728674411773682, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3451.0, "completions/mean_length": 2548.885498046875, "completions/mean_terminated_length": 1160.3170166015625, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 1.0137142857142858, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09871827065944672, "kl": 0.000720977783203125, "learning_rate": 1.3638909733514453e-08, "loss": 0.0578, "num_tokens": 125165906.0, "reward": 0.5, "reward_std": 0.2076037973165512, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 2776.33349609375, "completions/mean_terminated_length": 1737.90478515625, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 1.016, "frac_reward_zero_std": 0.625, "grad_norm": 0.10498413443565369, "kl": 0.0008420944213867188, "learning_rate": 1.3516158178517482e-08, "loss": 0.0018, "num_tokens": 125448592.0, "reward": 0.5104166865348816, "reward_std": 0.18208830058574677, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2626.33349609375, "completions/mean_terminated_length": 1494.5455322265625, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 1.0182857142857142, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09597191214561462, "kl": 0.0006589889526367188, "learning_rate": 1.3395428487445914e-08, "loss": 0.0328, "num_tokens": 125716848.0, "reward": 0.4583333432674408, "reward_std": 0.1613743156194687, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 2971.6875, "completions/mean_terminated_length": 1624.60009765625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 1.0205714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.10853156447410583, "kl": 0.0006860097249348959, "learning_rate": 1.327672654449457e-08, "loss": 0.0547, "num_tokens": 126017958.0, "reward": 0.3854166865348816, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2404.8125, "completions/mean_terminated_length": 1364.35302734375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.022857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.10824251919984818, "kl": 0.0008029937744140625, "learning_rate": 1.316005813502869e-08, "loss": 0.0302, "num_tokens": 126264390.0, "reward": 0.53125, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2996.34375, "completions/mean_terminated_length": 1494.5555419921875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.0251428571428571, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11421850323677063, "kl": 0.0007171630859375, "learning_rate": 1.3045428945301954e-08, "loss": 0.0505, "num_tokens": 126568275.0, "reward": 0.3020833432674408, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.3020833432674408, "rewards/format_reward/std": 0.46157145500183105, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3333333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 2175.30224609375, "completions/mean_terminated_length": 1169.08935546875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 1.0274285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.11491094529628754, "kl": 0.0010004043579101562, "learning_rate": 1.2932844562179351e-08, "loss": 0.0465, "num_tokens": 126793280.0, "reward": 0.6041666865348816, "reward_std": 0.21764282882213593, "rewards/format_reward/mean": 0.6041666865348816, "rewards/format_reward/std": 0.4915960133075714, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2659.11474609375, "completions/mean_terminated_length": 1418.41455078125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 1.0297142857142858, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10888926684856415, "kl": 0.0007184346516927084, "learning_rate": 1.2822310472864883e-08, "loss": 0.0329, "num_tokens": 127064491.0, "reward": 0.46875, "reward_std": 0.20084445178508759, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 2820.33349609375, "completions/mean_terminated_length": 1140.2667236328125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 1.032, "frac_reward_zero_std": 0.625, "grad_norm": 0.09330493211746216, "kl": 0.0008147557576497396, "learning_rate": 1.2713832064634124e-08, "loss": 0.0562, "num_tokens": 127351083.0, "reward": 0.3333333432674408, "reward_std": 0.17532894015312195, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3511.0, "completions/mean_length": 2383.77099609375, "completions/mean_terminated_length": 1232.5306396484375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 1.0342857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12870053946971893, "kl": 0.00078582763671875, "learning_rate": 1.260741462457165e-08, "loss": 0.0507, "num_tokens": 127595621.0, "reward": 0.5729166865348816, "reward_std": 0.2741113305091858, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 2839.89599609375, "completions/mean_terminated_length": 1419.3333740234375, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 1.0365714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.11888577044010162, "kl": 0.000759124755859375, "learning_rate": 1.2503063339313355e-08, "loss": 0.0509, "num_tokens": 127884727.0, "reward": 0.3958333432674408, "reward_std": 0.2712675929069519, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 2449.48974609375, "completions/mean_terminated_length": 1266.7021484375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 1.038857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11443246155977249, "kl": 0.0007734298706054688, "learning_rate": 1.2400783294793667e-08, "loss": 0.0354, "num_tokens": 128135298.0, "reward": 0.5, "reward_std": 0.19408512115478516, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3454.0, "completions/mean_length": 2976.09375, "completions/mean_terminated_length": 1571.6207275390625, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 1.0411428571428571, "frac_reward_zero_std": 0.5625, "grad_norm": 0.099580317735672, "kl": 0.0007212956746419271, "learning_rate": 1.2300579475997656e-08, "loss": 0.0599, "num_tokens": 128437083.0, "reward": 0.3125, "reward_std": 0.19408512115478516, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4659455418586731, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3349.0, "completions/mean_length": 2190.25, "completions/mean_terminated_length": 1010.923095703125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 1.0434285714285714, "frac_reward_zero_std": 0.875, "grad_norm": 0.05977644771337509, "kl": 0.000743865966796875, "learning_rate": 1.2202456766718091e-08, "loss": 0.0024, "num_tokens": 128662839.0, "reward": 0.5729166865348816, "reward_std": 0.06650752574205399, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512423992157, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2611.46875, "completions/mean_terminated_length": 1554.36962890625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.0457142857142858, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09623143821954727, "kl": 0.0007073084513346354, "learning_rate": 1.2106419949317387e-08, "loss": -0.007, "num_tokens": 128929506.0, "reward": 0.5208333730697632, "reward_std": 0.1430540680885315, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3443.0, "completions/mean_length": 2269.17724609375, "completions/mean_terminated_length": 954.3541870117188, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 1.048, "frac_reward_zero_std": 0.75, "grad_norm": 0.10378686338663101, "kl": 0.0007356007893880209, "learning_rate": 1.2012473704494538e-08, "loss": 0.0184, "num_tokens": 129162539.0, "reward": 0.5208333730697632, "reward_std": 0.11558075994253159, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 2979.0, "completions/mean_length": 3132.71875, "completions/mean_terminated_length": 1521.0, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 1.0502857142857143, "frac_reward_zero_std": 0.375, "grad_norm": 0.13919900357723236, "kl": 0.00072479248046875, "learning_rate": 1.1920622611056975e-08, "loss": 0.0613, "num_tokens": 129479390.0, "reward": 0.28125, "reward_std": 0.2841503620147705, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45196935534477234, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 2476.572998046875, "completions/mean_terminated_length": 1414.346923828125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 1.0525714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11369922757148743, "kl": 0.0008138020833333334, "learning_rate": 1.1830871145697411e-08, "loss": 0.0575, "num_tokens": 129733245.0, "reward": 0.5104166865348816, "reward_std": 0.19604292511940002, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 2336.166748046875, "completions/mean_terminated_length": 1088.3333740234375, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 1.054857142857143, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09804533421993256, "kl": 0.0007867813110351562, "learning_rate": 1.174322368277565e-08, "loss": 0.0555, "num_tokens": 129973975.0, "reward": 0.53125, "reward_std": 0.14981341361999512, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3211.0, "completions/mean_length": 2410.416748046875, "completions/mean_terminated_length": 1236.8333740234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 1.0571428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13963499665260315, "kl": 0.0008455912272135416, "learning_rate": 1.1657684494105385e-08, "loss": 0.0665, "num_tokens": 130221215.0, "reward": 0.53125, "reward_std": 0.20956161618232727, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5016420483589172, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.20833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2983.822998046875, "completions/mean_terminated_length": 1597.2069091796875, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 1.0594285714285714, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0957186296582222, "kl": 0.0007654825846354166, "learning_rate": 1.1574257748745986e-08, "loss": 0.0327, "num_tokens": 130525020.0, "reward": 0.3333333432674408, "reward_std": 0.1613743007183075, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 2496.75, "completions/mean_terminated_length": 1038.243896484375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 1.0617142857142856, "frac_reward_zero_std": 0.6875, "grad_norm": 0.09010104835033417, "kl": 0.0008306503295898438, "learning_rate": 1.1492947512799328e-08, "loss": 0.0234, "num_tokens": 130781148.0, "reward": 0.447916716337204, "reward_std": 0.14109627902507782, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3473.0, "completions/mean_length": 2685.0, "completions/mean_terminated_length": 1479.0242919921875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 1.064, "frac_reward_zero_std": 0.5, "grad_norm": 0.10904668271541595, "kl": 0.0007090568542480469, "learning_rate": 1.1413757749211601e-08, "loss": 0.0332, "num_tokens": 131054898.0, "reward": 0.5104166865348816, "reward_std": 0.23703491687774658, "rewards/format_reward/mean": 0.5104166865348816, "rewards/format_reward/std": 0.5025155544281006, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2529.375, "completions/mean_terminated_length": 1709.111083984375, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 1.0662857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1257898211479187, "kl": 0.0007638931274414062, "learning_rate": 1.133669231758016e-08, "loss": 0.058, "num_tokens": 131312892.0, "reward": 0.6354166865348816, "reward_std": 0.3002627491950989, "rewards/format_reward/mean": 0.6354166865348816, "rewards/format_reward/std": 0.4838397204875946, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3306.0, "completions/mean_length": 2662.25, "completions/mean_terminated_length": 1477.1429443359375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 1.0685714285714285, "frac_reward_zero_std": 0.4375, "grad_norm": 0.13364644348621368, "kl": 0.0008055369059244791, "learning_rate": 1.1261754973965421e-08, "loss": 0.1014, "num_tokens": 131584518.0, "reward": 0.4583333432674408, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.04166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 2969.385498046875, "completions/mean_terminated_length": 1223.8800048828125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 1.070857142857143, "frac_reward_zero_std": 0.5, "grad_norm": 0.1145143136382103, "kl": 0.0007276535034179688, "learning_rate": 1.1188949370707786e-08, "loss": 0.0772, "num_tokens": 131885917.0, "reward": 0.2916666865348816, "reward_std": 0.23987865447998047, "rewards/format_reward/mean": 0.2916666567325592, "rewards/format_reward/std": 0.4569156765937805, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3372.0, "completions/mean_length": 2589.39599609375, "completions/mean_terminated_length": 1310.6190185546875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.0731428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11043540388345718, "kl": 0.000732421875, "learning_rate": 1.1118279056249653e-08, "loss": 0.0381, "num_tokens": 132150159.0, "reward": 0.5, "reward_std": 0.2076038122177124, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3374.0, "completions/mean_length": 2956.89599609375, "completions/mean_terminated_length": 1759.697021484375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 1.0754285714285714, "frac_reward_zero_std": 0.625, "grad_norm": 0.09861429780721664, "kl": 0.0007677078247070312, "learning_rate": 1.1049747474962444e-08, "loss": 0.0378, "num_tokens": 132450659.0, "reward": 0.3645833432674408, "reward_std": 0.19080542027950287, "rewards/format_reward/mean": 0.3645833432674408, "rewards/format_reward/std": 0.4838397204875946, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 2792.86474609375, "completions/mean_terminated_length": 1585.3421630859375, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 1.0777142857142856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1034030020236969, "kl": 0.0007429122924804688, "learning_rate": 1.0983357966978745e-08, "loss": 0.0572, "num_tokens": 132734950.0, "reward": 0.4270833432674408, "reward_std": 0.2741113305091858, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 2577.30224609375, "completions/mean_terminated_length": 1436.3778076171875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 1.08, "frac_reward_zero_std": 0.5, "grad_norm": 0.12331518530845642, "kl": 0.0008172988891601562, "learning_rate": 1.0919113768029517e-08, "loss": 0.0361, "num_tokens": 132998463.0, "reward": 0.5, "reward_std": 0.21764282882213593, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.5026246905326843, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 2257.385498046875, "completions/mean_terminated_length": 1134.865478515625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 1.0822857142857143, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14478808641433716, "kl": 0.0008268356323242188, "learning_rate": 1.0857018009286381e-08, "loss": 0.0633, "num_tokens": 133231096.0, "reward": 0.5729166865348816, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 2741.822998046875, "completions/mean_terminated_length": 1134.0303955078125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 1.0845714285714285, "frac_reward_zero_std": 0.3125, "grad_norm": 0.14106322824954987, "kl": 0.0009167989095052084, "learning_rate": 1.0797073717209012e-08, "loss": 0.0717, "num_tokens": 133510343.0, "reward": 0.3854166865348816, "reward_std": 0.2941893935203552, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 2573.041748046875, "completions/mean_terminated_length": 1562.0833740234375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 1.0868571428571427, "frac_reward_zero_std": 0.5625, "grad_norm": 0.13431628048419952, "kl": 0.0008484522501627604, "learning_rate": 1.0739283813397638e-08, "loss": 0.0011, "num_tokens": 133773627.0, "reward": 0.5208333730697632, "reward_std": 0.19408512115478516, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3459.0, "completions/mean_length": 2703.70849609375, "completions/mean_terminated_length": 1236.5555419921875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 1.0891428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 0.09678907692432404, "kl": 0.00064849853515625, "learning_rate": 1.068365111445064e-08, "loss": 0.0249, "num_tokens": 134049515.0, "reward": 0.40625, "reward_std": 0.12234009802341461, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3427.0, "completions/mean_length": 2804.916748046875, "completions/mean_terminated_length": 1506.4444580078125, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 1.0914285714285714, "frac_reward_zero_std": 0.5625, "grad_norm": 0.08967314660549164, "kl": 0.0007128715515136719, "learning_rate": 1.063017833182728e-08, "loss": 0.0376, "num_tokens": 134334663.0, "reward": 0.3958333432674408, "reward_std": 0.2163209319114685, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 2728.822998046875, "completions/mean_terminated_length": 1365.1622314453125, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 1.0937142857142856, "frac_reward_zero_std": 0.5625, "grad_norm": 0.11985599249601364, "kl": 0.0007877349853515625, "learning_rate": 1.0578868071715544e-08, "loss": 0.0121, "num_tokens": 134612590.0, "reward": 0.3958333432674408, "reward_std": 0.20280227065086365, "rewards/format_reward/mean": 0.3958333432674408, "rewards/format_reward/std": 0.4915960133075714, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 2677.479248046875, "completions/mean_terminated_length": 1231.9459228515625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 1.096, "frac_reward_zero_std": 0.5625, "grad_norm": 0.1189078688621521, "kl": 0.0007257461547851562, "learning_rate": 1.0529722834905124e-08, "loss": 0.0703, "num_tokens": 134885042.0, "reward": 0.4479166865348816, "reward_std": 0.20956161618232727, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2686.36474609375, "completions/mean_terminated_length": 1190.3055419921875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 1.0982857142857143, "frac_reward_zero_std": 0.75, "grad_norm": 0.0946437418460846, "kl": 0.0006910959879557291, "learning_rate": 1.0482745016665526e-08, "loss": 0.0257, "num_tokens": 135160339.0, "reward": 0.3854166865348816, "reward_std": 0.12234010547399521, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9583333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3202.0, "completions/mean_length": 2500.46875, "completions/mean_terminated_length": 1370.8297119140625, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 1.1005714285714285, "frac_reward_zero_std": 0.5625, "grad_norm": 0.09965284168720245, "kl": 0.0007975896199544271, "learning_rate": 1.0437936906629333e-08, "loss": 0.0608, "num_tokens": 135415948.0, "reward": 0.5208333730697632, "reward_std": 0.21240532398223877, "rewards/format_reward/mean": 0.5208333134651184, "rewards/format_reward/std": 0.5021882057189941, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3085.0, "completions/mean_length": 2721.40625, "completions/mean_terminated_length": 1404.8157958984375, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 1.1028571428571428, "frac_reward_zero_std": 0.5, "grad_norm": 0.12250243872404099, "kl": 0.0008351008097330729, "learning_rate": 1.0395300688680625e-08, "loss": 0.0288, "num_tokens": 135693367.0, "reward": 0.4270833432674408, "reward_std": 0.23311930894851685, "rewards/format_reward/mean": 0.4270833432674408, "rewards/format_reward/std": 0.4972511827945709, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 2201.854248046875, "completions/mean_terminated_length": 1256.1754150390625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 1.1051428571428572, "frac_reward_zero_std": 0.75, "grad_norm": 0.09815461933612823, "kl": 0.0008169809977213541, "learning_rate": 1.0354838440848502e-08, "loss": 0.0174, "num_tokens": 135919343.0, "reward": 0.6458333730697632, "reward_std": 0.11077921092510223, "rewards/format_reward/mean": 0.6458333134651184, "rewards/format_reward/std": 0.4807705879211426, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0416666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3390.0, "completions/mean_length": 2413.447998046875, "completions/mean_terminated_length": 1290.6734619140625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 1.1074285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.10431455820798874, "kl": 0.0007483164469401041, "learning_rate": 1.0316552135205838e-08, "loss": 0.0223, "num_tokens": 136167132.0, "reward": 0.5729166865348816, "reward_std": 0.2379208505153656, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972511827945709, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2874.67724609375, "completions/mean_terminated_length": 1837.974365234375, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 1.1097142857142857, "frac_reward_zero_std": 0.625, "grad_norm": 0.09176606684923172, "kl": 0.0006389617919921875, "learning_rate": 1.0280443637773164e-08, "loss": 0.0483, "num_tokens": 136460291.0, "reward": 0.4479166865348816, "reward_std": 0.18208828568458557, "rewards/format_reward/mean": 0.4479166567325592, "rewards/format_reward/std": 0.49989035725593567, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2083333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 2256.104248046875, "completions/mean_terminated_length": 1178.7547607421875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 1.112, "frac_reward_zero_std": 0.3125, "grad_norm": 0.16725574433803558, "kl": 0.0009609858194986979, "learning_rate": 1.0246514708427701e-08, "loss": 0.0685, "num_tokens": 136692033.0, "reward": 0.5937500596046448, "reward_std": 0.32122674584388733, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4937104284763336, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.29166666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 3046.80224609375, "completions/mean_terminated_length": 1920.4193115234375, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 1.1142857142857143, "frac_reward_zero_std": 0.25, "grad_norm": 0.14524559676647186, "kl": 0.000804901123046875, "learning_rate": 1.0214767000817596e-08, "loss": 0.0883, "num_tokens": 137000780.0, "reward": 0.3333333432674408, "reward_std": 0.3419407308101654, "rewards/format_reward/mean": 0.3333333432674408, "rewards/format_reward/std": 0.4738790988922119, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0833333333333335, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2507.23974609375, "completions/mean_terminated_length": 1516.6199951171875, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 1.1165714285714285, "frac_reward_zero_std": 0.375, "grad_norm": 0.14165270328521729, "kl": 0.0008672078450520834, "learning_rate": 1.0185202062281335e-08, "loss": 0.0481, "num_tokens": 137256997.0, "reward": 0.5729166865348816, "reward_std": 0.3103017807006836, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972511827945709, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.7083333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 2566.15625, "completions/mean_terminated_length": 1200.756103515625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 1.1188571428571428, "frac_reward_zero_std": 0.75, "grad_norm": 0.10389388352632523, "kl": 0.0008487701416015625, "learning_rate": 1.0157821333772304e-08, "loss": 0.031, "num_tokens": 137518762.0, "reward": 0.4375, "reward_std": 0.10206207633018494, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.45833333333333326, "completions/max_length": 3584.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 2818.52099609375, "completions/mean_terminated_length": 1484.4000244140625, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 1.1211428571428572, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10865319520235062, "kl": 0.0007346471150716146, "learning_rate": 1.0132626149788589e-08, "loss": 0.0265, "num_tokens": 137806122.0, "reward": 0.3854166865348816, "reward_std": 0.21436314284801483, "rewards/format_reward/mean": 0.3854166567325592, "rewards/format_reward/std": 0.4892484247684479, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1666666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 2398.77099609375, "completions/mean_terminated_length": 1395.8846435546875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 1.1234285714285714, "frac_reward_zero_std": 0.5, "grad_norm": 0.1486992985010147, "kl": 0.0010083516438802083, "learning_rate": 1.0109617738307912e-08, "loss": 0.025, "num_tokens": 138051566.0, "reward": 0.5729166865348816, "reward_std": 0.22440217435359955, "rewards/format_reward/mean": 0.5729166865348816, "rewards/format_reward/std": 0.4972512125968933, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.2916666666666665, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 2436.635498046875, "completions/mean_terminated_length": 1581.3272705078125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 1.1257142857142857, "frac_reward_zero_std": 0.4375, "grad_norm": 0.119545117020607, "kl": 0.0006758371988932291, "learning_rate": 1.008879722072778e-08, "loss": 0.0611, "num_tokens": 138301035.0, "reward": 0.625, "reward_std": 0.25863486528396606, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.4866642653942108, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5416666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2884.760498046875, "completions/mean_terminated_length": 1769.7568359375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 1.1280000000000001, "frac_reward_zero_std": 0.4375, "grad_norm": 0.14157073199748993, "kl": 0.0008312861124674479, "learning_rate": 1.0070165611810855e-08, "loss": 0.0592, "num_tokens": 138594076.0, "reward": 0.40625, "reward_std": 0.2653941810131073, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9166666666666667, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2582.59375, "completions/mean_terminated_length": 1494.1087646484375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 1.1302857142857143, "frac_reward_zero_std": 0.5625, "grad_norm": 0.12795785069465637, "kl": 0.0007610321044921875, "learning_rate": 1.0053723819635471e-08, "loss": -0.0032, "num_tokens": 138857653.0, "reward": 0.4791666865348816, "reward_std": 0.1988866627216339, "rewards/format_reward/mean": 0.4791666567325592, "rewards/format_reward/std": 0.5021882057189941, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 2581.46875, "completions/mean_terminated_length": 1396.6591796875, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 1.1325714285714286, "frac_reward_zero_std": 0.4375, "grad_norm": 0.12635618448257446, "kl": 0.0006399154663085938, "learning_rate": 1.0039472645551372e-08, "loss": 0.027, "num_tokens": 139121680.0, "reward": 0.46875, "reward_std": 0.25187548995018005, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5016420483589172, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3185.0, "completions/mean_length": 2634.80224609375, "completions/mean_terminated_length": 1414.40478515625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 1.1348571428571428, "frac_reward_zero_std": 0.5625, "grad_norm": 0.10370950400829315, "kl": 0.0008077621459960938, "learning_rate": 1.002741278414069e-08, "loss": 0.064, "num_tokens": 139390827.0, "reward": 0.4583333432674408, "reward_std": 0.20280227065086365, "rewards/format_reward/mean": 0.4583333432674408, "rewards/format_reward/std": 0.5008764266967773, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5833333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3311.0, "completions/mean_length": 2563.65625, "completions/mean_terminated_length": 1006.2894897460938, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 1.1371428571428572, "frac_reward_zero_std": 0.625, "grad_norm": 0.11594365537166595, "kl": 0.0008608500162760416, "learning_rate": 1.0017544823184054e-08, "loss": 0.0579, "num_tokens": 139652034.0, "reward": 0.4375, "reward_std": 0.16661179065704346, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4986824691295624, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.41666666666666674, "completions/max_length": 3584.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 2895.83349609375, "completions/mean_terminated_length": 1640.941162109375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 1.1394285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.15313191711902618, "kl": 0.0007664362589518229, "learning_rate": 1.0009869243631952e-08, "loss": 0.1271, "num_tokens": 139946336.0, "reward": 0.40625, "reward_std": 0.3622187376022339, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.4937104284763336, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8333333333333333, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2671.0, "completions/mean_terminated_length": 1592.0, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 1.1417142857142857, "frac_reward_zero_std": 0.375, "grad_norm": 0.1362977772951126, "kl": 0.0007994969685872396, "learning_rate": 1.000438641958131e-08, "loss": 0.0793, "num_tokens": 140218496.0, "reward": 0.4895833432674408, "reward_std": 0.29766905307769775, "rewards/format_reward/mean": 0.4895833432674408, "rewards/format_reward/std": 0.5025155544281006, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 2254.0625, "completions/mean_terminated_length": 1219.6666259765625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.144, "frac_reward_zero_std": 0.625, "grad_norm": 0.112995445728302, "kl": 0.0009085337320963541, "learning_rate": 1.0001096618257237e-08, "loss": 0.0594, "num_tokens": 140449730.0, "reward": 0.5833333730697632, "reward_std": 0.17532894015312195, "rewards/format_reward/mean": 0.5833333134651184, "rewards/format_reward/std": 0.4955945909023285, "step": 500 }, { "epoch": 1.144, "step": 500, "total_flos": 0.0, "train_loss": 0.04486549567896873, "train_runtime": 27119.3128, "train_samples_per_second": 1.77, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 140449730, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }