{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100.0, "global_step": 2480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 400.0, "completions/mean_length": 83.025390625, "completions/min_length": 35.0, "epoch": 0.0020161290322580645, "frac_reward_zero_std": 0.03125, "grad_norm": 1.9140625, "kl": 0.0, "learning_rate": 8.064516129032257e-09, "loss": -3.341119736433029e-08, "reward": 1.4725391864776611, "reward_std": 0.30708229541778564, "rewards/FidelityReward/mean": 0.6659104228019714, "rewards/FidelityReward/std": 0.2469281256198883, "rewards/JudgeFidelityReward/mean": 0.6913825273513794, "rewards/JudgeFidelityReward/std": 0.19780534505844116, "rewards/SelfEvolvingFormatReward/mean": 0.921875, "rewards/SelfEvolvingFormatReward/std": 0.26863065361976624, "step": 1 }, { "clip_ratio/high_max": 0.0003955696302000433, "clip_ratio/high_mean": 9.703805699246004e-05, "clip_ratio/low_mean": 3.0476796382572502e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012751486792694777, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/mean_length": 84.046875, "completions/min_length": 35.5, "epoch": 0.010080645161290322, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.0, "kl": 0.000465831661131233, "learning_rate": 4.032258064516129e-08, "loss": -1.5831948985578492e-05, "reward": 1.5170481204986572, "reward_std": 0.29907265305519104, "rewards/FidelityReward/mean": 0.6938515901565552, "rewards/FidelityReward/std": 0.24894677102565765, "rewards/JudgeFidelityReward/mean": 0.7333070635795593, "rewards/JudgeFidelityReward/std": 0.19331620633602142, "rewards/SelfEvolvingFormatReward/mean": 0.9130859375, "rewards/SelfEvolvingFormatReward/std": 0.2819792479276657, "step": 5 }, { "clip_ratio/high_max": 0.0018400220200419427, "clip_ratio/high_mean": 0.0002574840851593763, "clip_ratio/low_mean": 0.0001428135103196837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040029758820310236, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 83.98046875, "completions/min_length": 34.5, "epoch": 0.020161290322580645, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.28125, "kl": 0.0006003510090522469, "learning_rate": 8.064516129032257e-08, "loss": 3.2237876439467072e-06, "reward": 1.5520434379577637, "reward_std": 0.2824975848197937, "rewards/FidelityReward/mean": 0.7159407436847687, "rewards/FidelityReward/std": 0.2423701509833336, "rewards/JudgeFidelityReward/mean": 0.7366586327552795, "rewards/JudgeFidelityReward/std": 0.18773136287927628, "rewards/SelfEvolvingFormatReward/mean": 0.935546875, "rewards/SelfEvolvingFormatReward/std": 0.24566945433616638, "step": 10 }, { "clip_ratio/high_max": 0.0010373346973210572, "clip_ratio/high_mean": 0.0002380473248194903, "clip_ratio/low_mean": 3.705812559928745e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002751054475083947, "completions/clipped_ratio": 0.0, "completions/max_length": 260.6666666666667, "completions/mean_length": 85.25130208333333, "completions/min_length": 33.333333333333336, "epoch": 0.03024193548387097, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.1875, "kl": 0.0006040297914296389, "learning_rate": 1.2096774193548387e-07, "loss": 9.134033462032676e-06, "reward": 1.5618563890457153, "reward_std": 0.2734067539374034, "rewards/FidelityReward/mean": 0.7167681455612183, "rewards/FidelityReward/std": 0.24290741980075836, "rewards/JudgeFidelityReward/mean": 0.759186863899231, "rewards/JudgeFidelityReward/std": 0.17257178823153177, "rewards/SelfEvolvingFormatReward/mean": 0.9309895833333334, "rewards/SelfEvolvingFormatReward/std": 0.2521162231763204, "step": 15 }, { "clip_ratio/high_max": 0.0018639445304870605, "clip_ratio/high_mean": 0.0002940816630143672, "clip_ratio/low_mean": 5.6302115990547466e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003503837855532765, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 84.921875, "completions/min_length": 32.0, "epoch": 0.04032258064516129, "frac_reward_zero_std": 0.015625, "grad_norm": 2.234375, "kl": 0.000614993937779218, "learning_rate": 1.6129032258064515e-07, "loss": -8.611950906924903e-05, "reward": 1.4881038069725037, "reward_std": 0.33177706599235535, "rewards/FidelityReward/mean": 0.6714164614677429, "rewards/FidelityReward/std": 0.2654803842306137, "rewards/JudgeFidelityReward/mean": 0.7339603900909424, "rewards/JudgeFidelityReward/std": 0.19380413740873337, "rewards/SelfEvolvingFormatReward/mean": 0.8994140625, "rewards/SelfEvolvingFormatReward/std": 0.30071787536144257, "step": 20 }, { "clip_ratio/high_max": 0.0009386672172695398, "clip_ratio/high_mean": 0.00014495097566395997, "clip_ratio/low_mean": 6.874765967950225e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021369863534346223, "completions/clipped_ratio": 0.0, "completions/max_length": 269.3333333333333, "completions/mean_length": 84.60807291666667, "completions/min_length": 33.666666666666664, "epoch": 0.05040322580645161, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.0006068679387681187, "learning_rate": 2.0161290322580642e-07, "loss": -4.827802185900509e-05, "reward": 1.523406704266866, "reward_std": 0.31530120968818665, "rewards/FidelityReward/mean": 0.7010689775149027, "rewards/FidelityReward/std": 0.24856622020403543, "rewards/JudgeFidelityReward/mean": 0.7371233701705933, "rewards/JudgeFidelityReward/std": 0.19468070566654205, "rewards/SelfEvolvingFormatReward/mean": 0.9075520833333334, "rewards/SelfEvolvingFormatReward/std": 0.2892843882242839, "step": 25 }, { "clip_ratio/high_max": 0.0011445968644693493, "clip_ratio/high_mean": 0.00019135457696393133, "clip_ratio/low_mean": 0.00010915926250163466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003005138481967151, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 83.111328125, "completions/min_length": 34.0, "epoch": 0.06048387096774194, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.234375, "kl": 0.0006161233293823897, "learning_rate": 2.4193548387096775e-07, "loss": 7.412603008560836e-05, "reward": 1.5558416843414307, "reward_std": 0.2915067821741104, "rewards/FidelityReward/mean": 0.7191564738750458, "rewards/FidelityReward/std": 0.23383090645074844, "rewards/JudgeFidelityReward/mean": 0.7436829209327698, "rewards/JudgeFidelityReward/std": 0.18884624540805817, "rewards/SelfEvolvingFormatReward/mean": 0.9296875, "rewards/SelfEvolvingFormatReward/std": 0.25580864399671555, "step": 30 }, { "clip_ratio/high_max": 0.0008240688359364868, "clip_ratio/high_mean": 9.66818886809051e-05, "clip_ratio/low_mean": 0.000124332751147449, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022101463982835413, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 85.27473958333333, "completions/min_length": 34.333333333333336, "epoch": 0.07056451612903226, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 1.8671875, "kl": 0.0006032256176695227, "learning_rate": 2.8225806451612905e-07, "loss": 0.0001631146064028144, "reward": 1.5063324769337971, "reward_std": 0.2825668454170227, "rewards/FidelityReward/mean": 0.6849972804387411, "rewards/FidelityReward/std": 0.24318399528662363, "rewards/JudgeFidelityReward/mean": 0.714936097462972, "rewards/JudgeFidelityReward/std": 0.19125725825627646, "rewards/SelfEvolvingFormatReward/mean": 0.927734375, "rewards/SelfEvolvingFormatReward/std": 0.2584351698557536, "step": 35 }, { "clip_ratio/high_max": 0.0016912896186113357, "clip_ratio/high_mean": 0.00029051738092675803, "clip_ratio/low_mean": 6.171761779114605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003522349987179041, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 82.69140625, "completions/min_length": 33.5, "epoch": 0.08064516129032258, "frac_reward_zero_std": 0.0078125, "grad_norm": 1.953125, "kl": 0.0005958651192486287, "learning_rate": 3.225806451612903e-07, "loss": 2.32588907238096e-05, "reward": 1.5534016489982605, "reward_std": 0.27970564365386963, "rewards/FidelityReward/mean": 0.7073060274124146, "rewards/FidelityReward/std": 0.2350827232003212, "rewards/JudgeFidelityReward/mean": 0.7556677758693695, "rewards/JudgeFidelityReward/std": 0.18645796179771423, "rewards/SelfEvolvingFormatReward/mean": 0.9365234375, "rewards/SelfEvolvingFormatReward/std": 0.2440483644604683, "step": 40 }, { "clip_ratio/high_max": 0.0010300514288246632, "clip_ratio/high_mean": 0.00019017686718143524, "clip_ratio/low_mean": 4.54500928753987e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023562696296721696, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 84.11328125, "completions/min_length": 34.0, "epoch": 0.0907258064516129, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.015625, "kl": 0.0006185251637361944, "learning_rate": 3.629032258064516e-07, "loss": 8.804664248600602e-05, "reward": 1.5195033152898152, "reward_std": 0.30204079548517865, "rewards/FidelityReward/mean": 0.6910918951034546, "rewards/FidelityReward/std": 0.24656595289707184, "rewards/JudgeFidelityReward/mean": 0.7369010249773661, "rewards/JudgeFidelityReward/std": 0.18724274138609567, "rewards/SelfEvolvingFormatReward/mean": 0.919921875, "rewards/SelfEvolvingFormatReward/std": 0.2708932012319565, "step": 45 }, { "clip_ratio/high_max": 0.0017187905963510276, "clip_ratio/high_mean": 0.0002562408451922238, "clip_ratio/low_mean": 8.681837935000657e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034305922454223035, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 82.962890625, "completions/min_length": 35.0, "epoch": 0.10080645161290322, "frac_reward_zero_std": 0.0078125, "grad_norm": 1.8828125, "kl": 0.0006359495222568512, "learning_rate": 4.0322580645161285e-07, "loss": 1.6868184320628644e-05, "reward": 1.5993143916130066, "reward_std": 0.2793116122484207, "rewards/FidelityReward/mean": 0.7489027380943298, "rewards/FidelityReward/std": 0.23794669657945633, "rewards/JudgeFidelityReward/mean": 0.7711357772350311, "rewards/JudgeFidelityReward/std": 0.18593823909759521, "rewards/SelfEvolvingFormatReward/mean": 0.9296875, "rewards/SelfEvolvingFormatReward/std": 0.25520503520965576, "step": 50 }, { "clip_ratio/high_max": 0.0014902840368449688, "clip_ratio/high_mean": 0.00018177840393036605, "clip_ratio/low_mean": 6.261957169044763e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024439797271043063, "completions/clipped_ratio": 0.0, "completions/max_length": 257.6666666666667, "completions/mean_length": 84.94010416666667, "completions/min_length": 35.333333333333336, "epoch": 0.11088709677419355, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 2.015625, "kl": 0.0006544376374222338, "learning_rate": 4.4354838709677415e-07, "loss": -8.603411260992289e-05, "reward": 1.5739852587382, "reward_std": 0.2881820797920227, "rewards/FidelityReward/mean": 0.7278466622034708, "rewards/FidelityReward/std": 0.2421208620071411, "rewards/JudgeFidelityReward/mean": 0.7625898122787476, "rewards/JudgeFidelityReward/std": 0.18427442014217377, "rewards/SelfEvolvingFormatReward/mean": 0.9296875, "rewards/SelfEvolvingFormatReward/std": 0.2556998133659363, "step": 55 }, { "clip_ratio/high_max": 0.0013187840348109602, "clip_ratio/high_mean": 0.0002486101584509015, "clip_ratio/low_mean": 0.0001100381079595536, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003586482780519873, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 81.806640625, "completions/min_length": 31.5, "epoch": 0.12096774193548387, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.0625, "kl": 0.0006801960989832878, "learning_rate": 4.838709677419355e-07, "loss": -3.423981834203005e-05, "reward": 1.5208874940872192, "reward_std": 0.28224435448646545, "rewards/FidelityReward/mean": 0.6905507445335388, "rewards/FidelityReward/std": 0.24190565943717957, "rewards/JudgeFidelityReward/mean": 0.7202439308166504, "rewards/JudgeFidelityReward/std": 0.20244060456752777, "rewards/SelfEvolvingFormatReward/mean": 0.9404296875, "rewards/SelfEvolvingFormatReward/std": 0.23582115024328232, "step": 60 }, { "clip_ratio/high_max": 0.0008415541145950556, "clip_ratio/high_mean": 0.00012556220754049717, "clip_ratio/low_mean": 6.293882033787667e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018850102787837386, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 84.14127604166667, "completions/min_length": 33.0, "epoch": 0.1310483870967742, "frac_reward_zero_std": 0.015625, "grad_norm": 2.046875, "kl": 0.0007268791203387081, "learning_rate": 5.241935483870967e-07, "loss": 0.00012445036554709077, "reward": 1.5368211666742961, "reward_std": 0.2771211862564087, "rewards/FidelityReward/mean": 0.7057323853174845, "rewards/FidelityReward/std": 0.23341776430606842, "rewards/JudgeFidelityReward/mean": 0.7240265011787415, "rewards/JudgeFidelityReward/std": 0.1970389982064565, "rewards/SelfEvolvingFormatReward/mean": 0.9381510416666666, "rewards/SelfEvolvingFormatReward/std": 0.24083273112773895, "step": 65 }, { "clip_ratio/high_max": 0.0013855044264346362, "clip_ratio/high_mean": 0.000246424728538841, "clip_ratio/low_mean": 6.09443333814852e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003073690633755177, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 82.802734375, "completions/min_length": 30.0, "epoch": 0.14112903225806453, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.171875, "kl": 0.0007644508732482791, "learning_rate": 5.645161290322581e-07, "loss": 8.918824605643749e-05, "reward": 1.5882129669189453, "reward_std": 0.26117023825645447, "rewards/FidelityReward/mean": 0.7358672618865967, "rewards/FidelityReward/std": 0.22414089739322662, "rewards/JudgeFidelityReward/mean": 0.761332094669342, "rewards/JudgeFidelityReward/std": 0.19293152540922165, "rewards/SelfEvolvingFormatReward/mean": 0.943359375, "rewards/SelfEvolvingFormatReward/std": 0.2312259078025818, "step": 70 }, { "clip_ratio/high_max": 0.0008347946684807539, "clip_ratio/high_mean": 0.0001437059952877462, "clip_ratio/low_mean": 3.926618956029415e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018297217902727426, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 82.69075520833333, "completions/min_length": 35.0, "epoch": 0.15120967741935484, "frac_reward_zero_std": 0.005208333333333333, "grad_norm": 2.0, "kl": 0.0008381051477044821, "learning_rate": 6.048387096774193e-07, "loss": 1.0538168135099113e-05, "reward": 1.5497348705927532, "reward_std": 0.27539127071698505, "rewards/FidelityReward/mean": 0.70872696240743, "rewards/FidelityReward/std": 0.2394062578678131, "rewards/JudgeFidelityReward/mean": 0.7419115900993347, "rewards/JudgeFidelityReward/std": 0.19384910662968954, "rewards/SelfEvolvingFormatReward/mean": 0.9401041666666666, "rewards/SelfEvolvingFormatReward/std": 0.23668219645818075, "step": 75 }, { "clip_ratio/high_max": 0.0017386329360306262, "clip_ratio/high_mean": 0.0002428928390145302, "clip_ratio/low_mean": 7.970837541506625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032260119915008547, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/mean_length": 78.9189453125, "completions/min_length": 32.0, "epoch": 0.16129032258064516, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.171875, "kl": 0.0009459469583816826, "learning_rate": 6.451612903225806e-07, "loss": 9.938172297552228e-05, "reward": 1.607564926147461, "reward_std": 0.2438170164823532, "rewards/FidelityReward/mean": 0.748298704624176, "rewards/FidelityReward/std": 0.217288538813591, "rewards/JudgeFidelityReward/mean": 0.7663840353488922, "rewards/JudgeFidelityReward/std": 0.17495376616716385, "rewards/SelfEvolvingFormatReward/mean": 0.9521484375, "rewards/SelfEvolvingFormatReward/std": 0.21335314959287643, "step": 80 }, { "clip_ratio/high_max": 0.0011021152138710022, "clip_ratio/high_mean": 0.00018953101243823767, "clip_ratio/low_mean": 8.801565854810178e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002775466535240412, "completions/clipped_ratio": 0.0, "completions/max_length": 253.33333333333334, "completions/mean_length": 80.13802083333333, "completions/min_length": 31.666666666666668, "epoch": 0.17137096774193547, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 2.0625, "kl": 0.0011025516781955958, "learning_rate": 6.854838709677419e-07, "loss": 4.285447357688099e-06, "reward": 1.543899695078532, "reward_std": 0.24417642255624136, "rewards/FidelityReward/mean": 0.7041803002357483, "rewards/FidelityReward/std": 0.21819900969664255, "rewards/JudgeFidelityReward/mean": 0.7191524108250936, "rewards/JudgeFidelityReward/std": 0.18975775440533957, "rewards/SelfEvolvingFormatReward/mean": 0.9602864583333334, "rewards/SelfEvolvingFormatReward/std": 0.1929956873257955, "step": 85 }, { "clip_ratio/high_max": 0.0014753022463992239, "clip_ratio/high_mean": 0.00023771343985572456, "clip_ratio/low_mean": 7.19962721632328e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003097097156569362, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/mean_length": 79.8466796875, "completions/min_length": 33.5, "epoch": 0.1814516129032258, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.09375, "kl": 0.0012224479112774133, "learning_rate": 7.258064516129032e-07, "loss": 3.96807212382555e-05, "reward": 1.5505490899085999, "reward_std": 0.25176796317100525, "rewards/FidelityReward/mean": 0.701090395450592, "rewards/FidelityReward/std": 0.22741086035966873, "rewards/JudgeFidelityReward/mean": 0.7321203351020813, "rewards/JudgeFidelityReward/std": 0.18953080475330353, "rewards/SelfEvolvingFormatReward/mean": 0.966796875, "rewards/SelfEvolvingFormatReward/std": 0.1792587712407112, "step": 90 }, { "clip_ratio/high_max": 0.0010682762367650867, "clip_ratio/high_mean": 0.00011254341807216406, "clip_ratio/low_mean": 8.812950982246548e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002006729308050126, "completions/clipped_ratio": 0.0, "completions/max_length": 228.33333333333334, "completions/mean_length": 79.626953125, "completions/min_length": 34.333333333333336, "epoch": 0.19153225806451613, "frac_reward_zero_std": 0.015625, "grad_norm": 2.046875, "kl": 0.0012598703848198055, "learning_rate": 7.661290322580645e-07, "loss": 2.3605137539561837e-05, "reward": 1.5882473786671956, "reward_std": 0.23901163041591644, "rewards/FidelityReward/mean": 0.7269989649454752, "rewards/FidelityReward/std": 0.2195963660875956, "rewards/JudgeFidelityReward/mean": 0.7524447441101074, "rewards/JudgeFidelityReward/std": 0.1922989934682846, "rewards/SelfEvolvingFormatReward/mean": 0.9700520833333334, "rewards/SelfEvolvingFormatReward/std": 0.17058913906415304, "step": 95 }, { "clip_ratio/high_max": 0.0015767351258546114, "clip_ratio/high_mean": 0.00018301881791558117, "clip_ratio/low_mean": 0.00011908303131349384, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000302101846318692, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 75.5859375, "completions/min_length": 34.5, "epoch": 0.20161290322580644, "frac_reward_zero_std": 0.015625, "grad_norm": 2.203125, "kl": 0.001484180218540132, "learning_rate": 8.064516129032257e-07, "loss": 5.3167378064244984e-05, "reward": 1.5624274015426636, "reward_std": 0.24988488852977753, "rewards/FidelityReward/mean": 0.7150861620903015, "rewards/FidelityReward/std": 0.22486238926649094, "rewards/JudgeFidelityReward/mean": 0.7356980443000793, "rewards/JudgeFidelityReward/std": 0.18139829486608505, "rewards/SelfEvolvingFormatReward/mean": 0.958984375, "rewards/SelfEvolvingFormatReward/std": 0.19845908880233765, "step": 100 }, { "clip_ratio/high_max": 0.0011270861141383648, "clip_ratio/high_mean": 0.0002605646150186658, "clip_ratio/low_mean": 9.188727708533406e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000352451903745532, "completions/clipped_ratio": 0.0, "completions/max_length": 241.66666666666666, "completions/mean_length": 77.45963541666667, "completions/min_length": 33.333333333333336, "epoch": 0.21169354838709678, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.296875, "kl": 0.0016492645721882583, "learning_rate": 8.467741935483871e-07, "loss": 9.726565913297236e-05, "reward": 1.5855333805084229, "reward_std": 0.22333314021428427, "rewards/FidelityReward/mean": 0.7239622672398885, "rewards/FidelityReward/std": 0.20881072680155435, "rewards/JudgeFidelityReward/mean": 0.7374650637308756, "rewards/JudgeFidelityReward/std": 0.1802077293395996, "rewards/SelfEvolvingFormatReward/mean": 0.9856770833333334, "rewards/SelfEvolvingFormatReward/std": 0.11576238026221593, "step": 105 }, { "clip_ratio/high_max": 0.0014054573839530349, "clip_ratio/high_mean": 0.0002160567615646869, "clip_ratio/low_mean": 0.00012156231387052684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033761907834559677, "completions/clipped_ratio": 0.0, "completions/max_length": 261.5, "completions/mean_length": 79.6416015625, "completions/min_length": 35.5, "epoch": 0.2217741935483871, "frac_reward_zero_std": 0.0234375, "grad_norm": 1.9921875, "kl": 0.0018147001508623361, "learning_rate": 8.870967741935483e-07, "loss": -1.3762910384684801e-06, "reward": 1.6003873348236084, "reward_std": 0.22921056300401688, "rewards/FidelityReward/mean": 0.7335622310638428, "rewards/FidelityReward/std": 0.22698397934436798, "rewards/JudgeFidelityReward/mean": 0.7639235556125641, "rewards/JudgeFidelityReward/std": 0.20205450057983398, "rewards/SelfEvolvingFormatReward/mean": 0.9697265625, "rewards/SelfEvolvingFormatReward/std": 0.17032553255558014, "step": 110 }, { "clip_ratio/high_max": 0.0012337662745267152, "clip_ratio/high_mean": 0.00014613077510148287, "clip_ratio/low_mean": 0.00013706095051020385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002831917372532189, "completions/clipped_ratio": 0.0, "completions/max_length": 270.3333333333333, "completions/mean_length": 77.35286458333333, "completions/min_length": 35.0, "epoch": 0.2318548387096774, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.25, "kl": 0.0018735050922259688, "learning_rate": 9.274193548387096e-07, "loss": 0.00014458682853728532, "reward": 1.5853607654571533, "reward_std": 0.21422413488229117, "rewards/FidelityReward/mean": 0.7166234453519186, "rewards/FidelityReward/std": 0.22866526246070862, "rewards/JudgeFidelityReward/mean": 0.7563547690709432, "rewards/JudgeFidelityReward/std": 0.1760805050532023, "rewards/SelfEvolvingFormatReward/mean": 0.9811197916666666, "rewards/SelfEvolvingFormatReward/std": 0.13570595035950342, "step": 115 }, { "clip_ratio/high_max": 0.0010924964677542447, "clip_ratio/high_mean": 0.00022730201890226455, "clip_ratio/low_mean": 9.093200205825269e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031823401805013417, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/mean_length": 76.9814453125, "completions/min_length": 31.0, "epoch": 0.24193548387096775, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9375, "kl": 0.0020392364356666805, "learning_rate": 9.67741935483871e-07, "loss": 0.00012581683695316314, "reward": 1.5939275622367859, "reward_std": 0.22385435551404953, "rewards/FidelityReward/mean": 0.7348611354827881, "rewards/FidelityReward/std": 0.20509084314107895, "rewards/JudgeFidelityReward/mean": 0.7357109785079956, "rewards/JudgeFidelityReward/std": 0.1850816160440445, "rewards/SelfEvolvingFormatReward/mean": 0.982421875, "rewards/SelfEvolvingFormatReward/std": 0.1313294731080532, "step": 120 }, { "clip_ratio/high_max": 0.0010922894114628434, "clip_ratio/high_mean": 0.00013160479720681907, "clip_ratio/low_mean": 9.251010487787426e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022411488462239503, "completions/clipped_ratio": 0.0, "completions/max_length": 226.66666666666666, "completions/mean_length": 78.02213541666667, "completions/min_length": 33.333333333333336, "epoch": 0.25201612903225806, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0020740612177178263, "learning_rate": 9.999995554822397e-07, "loss": 8.637291612103581e-05, "reward": 1.5850813786188762, "reward_std": 0.22248146931330362, "rewards/FidelityReward/mean": 0.721980094909668, "rewards/FidelityReward/std": 0.21191427608331045, "rewards/JudgeFidelityReward/mean": 0.7379212379455566, "rewards/JudgeFidelityReward/std": 0.190004234512647, "rewards/SelfEvolvingFormatReward/mean": 0.98828125, "rewards/SelfEvolvingFormatReward/std": 0.10746473570664723, "step": 125 }, { "clip_ratio/high_max": 0.0024858191143721344, "clip_ratio/high_mean": 0.000343940791208297, "clip_ratio/low_mean": 0.00011018849472748116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045412929030135273, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/mean_length": 77.3408203125, "completions/min_length": 31.5, "epoch": 0.2620967741935484, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.15625, "kl": 0.002147960872389376, "learning_rate": 9.999839974436198e-07, "loss": 7.243729778565467e-05, "reward": 1.5837594866752625, "reward_std": 0.22494005411863327, "rewards/FidelityReward/mean": 0.7223173975944519, "rewards/FidelityReward/std": 0.20597579330205917, "rewards/JudgeFidelityReward/mean": 0.7394856810569763, "rewards/JudgeFidelityReward/std": 0.19079061597585678, "rewards/SelfEvolvingFormatReward/mean": 0.9833984375, "rewards/SelfEvolvingFormatReward/std": 0.1273781843483448, "step": 130 }, { "clip_ratio/high_max": 0.0009946716018021107, "clip_ratio/high_mean": 0.00016552326851524413, "clip_ratio/low_mean": 0.00011321480851620436, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027873807121068237, "completions/clipped_ratio": 0.0, "completions/max_length": 236.33333333333334, "completions/mean_length": 76.99739583333333, "completions/min_length": 31.0, "epoch": 0.2721774193548387, "frac_reward_zero_std": 0.03125, "grad_norm": 2.09375, "kl": 0.002148947631940246, "learning_rate": 9.999462143073634e-07, "loss": 0.00010601768735796213, "reward": 1.629795988400777, "reward_std": 0.20415509243806204, "rewards/FidelityReward/mean": 0.7573599020640055, "rewards/FidelityReward/std": 0.20649411280949911, "rewards/JudgeFidelityReward/mean": 0.763752301534017, "rewards/JudgeFidelityReward/std": 0.18547001481056213, "rewards/SelfEvolvingFormatReward/mean": 0.9811197916666666, "rewards/SelfEvolvingFormatReward/std": 0.13456753144661585, "step": 135 }, { "clip_ratio/high_max": 0.001659097522497177, "clip_ratio/high_mean": 0.00032043434912338854, "clip_ratio/low_mean": 0.00014474288618657739, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00046517723239958286, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 77.650390625, "completions/min_length": 30.5, "epoch": 0.28225806451612906, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.21875, "kl": 0.0022402712143957614, "learning_rate": 9.998862077529917e-07, "loss": 3.2673205714672805e-05, "reward": 1.6258726716041565, "reward_std": 0.20635931193828583, "rewards/FidelityReward/mean": 0.7515695691108704, "rewards/FidelityReward/std": 0.19817717373371124, "rewards/JudgeFidelityReward/mean": 0.7671607136726379, "rewards/JudgeFidelityReward/std": 0.1727551743388176, "rewards/SelfEvolvingFormatReward/mean": 0.9814453125, "rewards/SelfEvolvingFormatReward/std": 0.13502933084964752, "step": 140 }, { "clip_ratio/high_max": 0.0011442322749644518, "clip_ratio/high_mean": 0.00017806708929128944, "clip_ratio/low_mean": 8.629497606307268e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000264362059533596, "completions/clipped_ratio": 0.0, "completions/max_length": 259.6666666666667, "completions/mean_length": 76.6171875, "completions/min_length": 32.666666666666664, "epoch": 0.2923387096774194, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.015625, "kl": 0.002282397821545601, "learning_rate": 9.998039804478936e-07, "loss": 0.0001557998824864626, "reward": 1.6559818585713704, "reward_std": 0.18729770680268606, "rewards/FidelityReward/mean": 0.7753825386365255, "rewards/FidelityReward/std": 0.21012424925963083, "rewards/JudgeFidelityReward/mean": 0.7729173898696899, "rewards/JudgeFidelityReward/std": 0.20020090540250143, "rewards/SelfEvolvingFormatReward/mean": 0.98828125, "rewards/SelfEvolvingFormatReward/std": 0.10746473570664723, "step": 145 }, { "clip_ratio/high_max": 0.0019019055180251598, "clip_ratio/high_mean": 0.0001906510558910668, "clip_ratio/low_mean": 8.51665114169009e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002758175658527762, "completions/clipped_ratio": 0.0, "completions/max_length": 205.5, "completions/mean_length": 76.4736328125, "completions/min_length": 32.0, "epoch": 0.3024193548387097, "frac_reward_zero_std": 0.03125, "grad_norm": 2.21875, "kl": 0.0021782509051263333, "learning_rate": 9.996995360472057e-07, "loss": 0.0002586953341960907, "reward": 1.5911229848861694, "reward_std": 0.21695364266633987, "rewards/FidelityReward/mean": 0.7255796194076538, "rewards/FidelityReward/std": 0.20079346746206284, "rewards/JudgeFidelityReward/mean": 0.7408521175384521, "rewards/JudgeFidelityReward/std": 0.17834337800741196, "rewards/SelfEvolvingFormatReward/mean": 0.990234375, "rewards/SelfEvolvingFormatReward/std": 0.09792538359761238, "step": 150 }, { "clip_ratio/high_max": 0.0016927999444305898, "clip_ratio/high_mean": 0.0002606124384328723, "clip_ratio/low_mean": 4.153708578087389e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030214954167604444, "completions/clipped_ratio": 0.0, "completions/max_length": 215.66666666666666, "completions/mean_length": 75.49153645833333, "completions/min_length": 33.0, "epoch": 0.3125, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.25, "kl": 0.002211786853149533, "learning_rate": 9.995728791936505e-07, "loss": 0.0001222997554577887, "reward": 1.620366374651591, "reward_std": 0.19788832465807596, "rewards/FidelityReward/mean": 0.7437519629796346, "rewards/FidelityReward/std": 0.20386990904808044, "rewards/JudgeFidelityReward/mean": 0.7597392002741495, "rewards/JudgeFidelityReward/std": 0.18934995929400125, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07908969124158223, "step": 155 }, { "clip_ratio/high_max": 0.001955489651300013, "clip_ratio/high_mean": 0.000348463945556432, "clip_ratio/low_mean": 3.459679428488016e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003830607398413122, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 71.8994140625, "completions/min_length": 33.5, "epoch": 0.3225806451612903, "frac_reward_zero_std": 0.0625, "grad_norm": 2.109375, "kl": 0.0023129272740334272, "learning_rate": 9.994240155173301e-07, "loss": 3.818401601165533e-05, "reward": 1.5973781943321228, "reward_std": 0.20761053264141083, "rewards/FidelityReward/mean": 0.7324697971343994, "rewards/FidelityReward/std": 0.22637249529361725, "rewards/JudgeFidelityReward/mean": 0.7434885799884796, "rewards/JudgeFidelityReward/std": 0.18989573419094086, "rewards/SelfEvolvingFormatReward/mean": 0.986328125, "rewards/SelfEvolvingFormatReward/std": 0.11332328617572784, "step": 160 }, { "clip_ratio/high_max": 0.0012772939633578062, "clip_ratio/high_mean": 0.0001947213662788272, "clip_ratio/low_mean": 7.888405889389105e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002736054244451225, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/mean_length": 75.55338541666667, "completions/min_length": 31.333333333333332, "epoch": 0.3326612903225806, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.296875, "kl": 0.0022107237949967385, "learning_rate": 9.992529516354757e-07, "loss": 3.62871156539768e-05, "reward": 1.603543798128764, "reward_std": 0.20419170459111533, "rewards/FidelityReward/mean": 0.7348321477572123, "rewards/FidelityReward/std": 0.2033290515343348, "rewards/JudgeFidelityReward/mean": 0.7491421302159628, "rewards/JudgeFidelityReward/std": 0.1697502334912618, "rewards/SelfEvolvingFormatReward/mean": 0.98828125, "rewards/SelfEvolvingFormatReward/std": 0.10140986740589142, "step": 165 }, { "clip_ratio/high_max": 0.0020389660960063337, "clip_ratio/high_mean": 0.0001564208505442366, "clip_ratio/low_mean": 6.008845230098814e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021650930284522474, "completions/clipped_ratio": 0.0, "completions/max_length": 249.5, "completions/mean_length": 75.9697265625, "completions/min_length": 33.0, "epoch": 0.34274193548387094, "frac_reward_zero_std": 0.03125, "grad_norm": 2.40625, "kl": 0.0023619862273335456, "learning_rate": 9.990596951521537e-07, "loss": 0.00018519146833568813, "reward": 1.5822868347167969, "reward_std": 0.2138306349515915, "rewards/FidelityReward/mean": 0.7189817428588867, "rewards/FidelityReward/std": 0.21662580221891403, "rewards/JudgeFidelityReward/mean": 0.7353993952274323, "rewards/JudgeFidelityReward/std": 0.19044488668441772, "rewards/SelfEvolvingFormatReward/mean": 0.9912109375, "rewards/SelfEvolvingFormatReward/std": 0.0920594371855259, "step": 170 }, { "clip_ratio/high_max": 0.0014155389741063117, "clip_ratio/high_mean": 0.00019852854311466216, "clip_ratio/low_mean": 2.054879514616914e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021907733753323555, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 76.41861979166667, "completions/min_length": 32.666666666666664, "epoch": 0.3528225806451613, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.09375, "kl": 0.0022019742522388698, "learning_rate": 9.988442546579276e-07, "loss": 2.37074593314901e-05, "reward": 1.6000791788101196, "reward_std": 0.2108451227347056, "rewards/FidelityReward/mean": 0.7290548284848531, "rewards/FidelityReward/std": 0.2090267539024353, "rewards/JudgeFidelityReward/mean": 0.7518143455187479, "rewards/JudgeFidelityReward/std": 0.17476301888624826, "rewards/SelfEvolvingFormatReward/mean": 0.990234375, "rewards/SelfEvolvingFormatReward/std": 0.09809480359156926, "step": 175 }, { "clip_ratio/high_max": 0.002523000305518508, "clip_ratio/high_mean": 0.00030654206057079136, "clip_ratio/low_mean": 0.00010695899254642427, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041350103565491737, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/mean_length": 75.2431640625, "completions/min_length": 35.0, "epoch": 0.3629032258064516, "frac_reward_zero_std": 0.03125, "grad_norm": 1.9453125, "kl": 0.002392694493755698, "learning_rate": 9.986066397294757e-07, "loss": 0.00011702119372785092, "reward": 1.5603572726249695, "reward_std": 0.21784928441047668, "rewards/FidelityReward/mean": 0.7082729637622833, "rewards/FidelityReward/std": 0.21341516077518463, "rewards/JudgeFidelityReward/mean": 0.7139342129230499, "rewards/JudgeFidelityReward/std": 0.19263170659542084, "rewards/SelfEvolvingFormatReward/mean": 0.990234375, "rewards/SelfEvolvingFormatReward/std": 0.09631745889782906, "step": 180 }, { "clip_ratio/high_max": 0.0012794756330549717, "clip_ratio/high_mean": 0.00016021786141209304, "clip_ratio/low_mean": 0.00015596621669828893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031618407228961587, "completions/clipped_ratio": 0.0, "completions/max_length": 220.33333333333334, "completions/mean_length": 77.27734375, "completions/min_length": 33.0, "epoch": 0.37298387096774194, "frac_reward_zero_std": 0.010416666666666666, "grad_norm": 2.15625, "kl": 0.002362056216225028, "learning_rate": 9.983468609291661e-07, "loss": 0.00015197301981970668, "reward": 1.5564955075581868, "reward_std": 0.2202495982249578, "rewards/FidelityReward/mean": 0.7012860774993896, "rewards/FidelityReward/std": 0.22110274930795035, "rewards/JudgeFidelityReward/mean": 0.717580258846283, "rewards/JudgeFidelityReward/std": 0.19788825511932373, "rewards/SelfEvolvingFormatReward/mean": 0.9928385416666666, "rewards/SelfEvolvingFormatReward/std": 0.08374229073524475, "step": 185 }, { "clip_ratio/high_max": 0.0015356113202869893, "clip_ratio/high_mean": 0.0002717108407523483, "clip_ratio/low_mean": 5.298713804222643e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032469796715304253, "completions/clipped_ratio": 0.0, "completions/max_length": 220.5, "completions/mean_length": 75.84375, "completions/min_length": 32.5, "epoch": 0.38306451612903225, "frac_reward_zero_std": 0.015625, "grad_norm": 2.3125, "kl": 0.0023801904171705246, "learning_rate": 9.980649298045868e-07, "loss": 6.338959792628884e-05, "reward": 1.5881436467170715, "reward_std": 0.2061980441212654, "rewards/FidelityReward/mean": 0.7132294774055481, "rewards/FidelityReward/std": 0.2114647999405861, "rewards/JudgeFidelityReward/mean": 0.7527579367160797, "rewards/JudgeFidelityReward/std": 0.16737008839845657, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 190 }, { "clip_ratio/high_max": 0.000967850862070918, "clip_ratio/high_mean": 0.00014582685544155537, "clip_ratio/low_mean": 0.00012316658394411205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026899344520643355, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 74.958984375, "completions/min_length": 34.333333333333336, "epoch": 0.39314516129032256, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.421875, "kl": 0.00242949896492064, "learning_rate": 9.977608588880326e-07, "loss": 0.0001923381700180471, "reward": 1.5717185338338215, "reward_std": 0.21819930771986643, "rewards/FidelityReward/mean": 0.7092628677686056, "rewards/FidelityReward/std": 0.20873962342739105, "rewards/JudgeFidelityReward/mean": 0.7366301218668619, "rewards/JudgeFidelityReward/std": 0.17760172486305237, "rewards/SelfEvolvingFormatReward/mean": 0.98828125, "rewards/SelfEvolvingFormatReward/std": 0.10746473570664723, "step": 195 }, { "clip_ratio/high_max": 0.0017504966584965588, "clip_ratio/high_mean": 0.00016246377199422569, "clip_ratio/low_mean": 0.00015012900694273413, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031259277602657676, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 75.193359375, "completions/min_length": 32.0, "epoch": 0.4032258064516129, "frac_reward_zero_std": 0.046875, "grad_norm": 2.203125, "kl": 0.002421754412353039, "learning_rate": 9.974346616959475e-07, "loss": 0.00026002069935202596, "reward": 1.6198822259902954, "reward_std": 0.20087207108736038, "rewards/FidelityReward/mean": 0.7452955842018127, "rewards/FidelityReward/std": 0.20606467127799988, "rewards/JudgeFidelityReward/mean": 0.7550325989723206, "rewards/JudgeFidelityReward/std": 0.1984081193804741, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07131390832364559, "step": 200 }, { "clip_ratio/high_max": 0.0010749900713562966, "clip_ratio/high_mean": 0.0001397440501023084, "clip_ratio/low_mean": 8.121997234411538e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022096402244642377, "completions/clipped_ratio": 0.0, "completions/max_length": 210.33333333333334, "completions/mean_length": 78.19075520833333, "completions/min_length": 33.333333333333336, "epoch": 0.41330645161290325, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.109375, "kl": 0.0024728372693061827, "learning_rate": 9.970863527283244e-07, "loss": 0.00011326970998197794, "reward": 1.614173452059428, "reward_std": 0.20263735949993134, "rewards/FidelityReward/mean": 0.744748075803121, "rewards/FidelityReward/std": 0.20372106631596884, "rewards/JudgeFidelityReward/mean": 0.7499184608459473, "rewards/JudgeFidelityReward/std": 0.18844708800315857, "rewards/SelfEvolvingFormatReward/mean": 0.9889322916666666, "rewards/SelfEvolvingFormatReward/std": 0.10436853021383286, "step": 205 }, { "clip_ratio/high_max": 0.0013077816227450966, "clip_ratio/high_mean": 0.0002117067517247051, "clip_ratio/low_mean": 0.00013654466019943356, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034825141774490475, "completions/clipped_ratio": 0.0, "completions/max_length": 204.5, "completions/mean_length": 75.2236328125, "completions/min_length": 35.5, "epoch": 0.42338709677419356, "frac_reward_zero_std": 0.03125, "grad_norm": 2.125, "kl": 0.002361476421356201, "learning_rate": 9.967159474680607e-07, "loss": -3.614836314227432e-05, "reward": 1.5922671556472778, "reward_std": 0.20500746369361877, "rewards/FidelityReward/mean": 0.7251898348331451, "rewards/FidelityReward/std": 0.21296337991952896, "rewards/JudgeFidelityReward/mean": 0.7400141656398773, "rewards/JudgeFidelityReward/std": 0.18776675313711166, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 210 }, { "clip_ratio/high_max": 0.0010721836239099502, "clip_ratio/high_mean": 0.00017346800304949284, "clip_ratio/low_mean": 6.203758530318738e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023550558835268022, "completions/clipped_ratio": 0.0, "completions/max_length": 267.6666666666667, "completions/mean_length": 76.88671875, "completions/min_length": 33.666666666666664, "epoch": 0.4334677419354839, "frac_reward_zero_std": 0.046875, "grad_norm": 2.15625, "kl": 0.0024767718743532894, "learning_rate": 9.963234623802694e-07, "loss": 7.07272905856371e-05, "reward": 1.5817376772562664, "reward_std": 0.21123030285040537, "rewards/FidelityReward/mean": 0.7233226497968038, "rewards/FidelityReward/std": 0.212075541416804, "rewards/JudgeFidelityReward/mean": 0.7259446779886881, "rewards/JudgeFidelityReward/std": 0.1930959572394689, "rewards/SelfEvolvingFormatReward/mean": 0.9908854166666666, "rewards/SelfEvolvingFormatReward/std": 0.09499859809875488, "step": 215 }, { "clip_ratio/high_max": 0.0011792399920523167, "clip_ratio/high_mean": 0.0002342647931072861, "clip_ratio/low_mean": 0.00016072849684860556, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003949932986870408, "completions/clipped_ratio": 0.0, "completions/max_length": 238.5, "completions/mean_length": 77.560546875, "completions/min_length": 36.0, "epoch": 0.4435483870967742, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.9765625, "kl": 0.0025764000602066517, "learning_rate": 9.959089149115476e-07, "loss": 0.00010009747929871083, "reward": 1.5912587642669678, "reward_std": 0.20703159272670746, "rewards/FidelityReward/mean": 0.7238824665546417, "rewards/FidelityReward/std": 0.210459902882576, "rewards/JudgeFidelityReward/mean": 0.7445181608200073, "rewards/JudgeFidelityReward/std": 0.1941552236676216, "rewards/SelfEvolvingFormatReward/mean": 0.990234375, "rewards/SelfEvolvingFormatReward/std": 0.09843364357948303, "step": 220 }, { "clip_ratio/high_max": 0.0012873748783022165, "clip_ratio/high_mean": 0.00022110456484369934, "clip_ratio/low_mean": 5.2648858400061727e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027375342324376106, "completions/clipped_ratio": 0.0, "completions/max_length": 251.66666666666666, "completions/mean_length": 76.328125, "completions/min_length": 33.0, "epoch": 0.4536290322580645, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9296875, "kl": 0.0026257896795868875, "learning_rate": 9.95472323489201e-07, "loss": 0.00018595801666378974, "reward": 1.6006362040837605, "reward_std": 0.2060138980547587, "rewards/FidelityReward/mean": 0.7310521006584167, "rewards/FidelityReward/std": 0.20566360652446747, "rewards/JudgeFidelityReward/mean": 0.7463295857111613, "rewards/JudgeFidelityReward/std": 0.19505838056405386, "rewards/SelfEvolvingFormatReward/mean": 0.9928385416666666, "rewards/SelfEvolvingFormatReward/std": 0.08421787619590759, "step": 225 }, { "clip_ratio/high_max": 0.0023675303906202315, "clip_ratio/high_mean": 0.000278000725666061, "clip_ratio/low_mean": 5.903130513615906e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003370320366229862, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 75.7998046875, "completions/min_length": 31.0, "epoch": 0.4637096774193548, "frac_reward_zero_std": 0.046875, "grad_norm": 2.421875, "kl": 0.0024207110516726972, "learning_rate": 9.95013707520425e-07, "loss": 0.00021912972442805767, "reward": 1.5902616381645203, "reward_std": 0.21288231015205383, "rewards/FidelityReward/mean": 0.729463130235672, "rewards/FidelityReward/std": 0.20804648846387863, "rewards/JudgeFidelityReward/mean": 0.7264797389507294, "rewards/JudgeFidelityReward/std": 0.1944076269865036, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.049216821789741516, "step": 230 }, { "clip_ratio/high_max": 0.0013696403708308934, "clip_ratio/high_mean": 0.00020637655397877098, "clip_ratio/low_mean": 8.822972886264324e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029460627119988204, "completions/clipped_ratio": 0.0, "completions/max_length": 217.66666666666666, "completions/mean_length": 74.00520833333333, "completions/min_length": 30.333333333333332, "epoch": 0.4737903225806452, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.15625, "kl": 0.0024490753654390573, "learning_rate": 9.94533087391441e-07, "loss": -9.466469055041671e-05, "reward": 1.6036229928334553, "reward_std": 0.19912827014923096, "rewards/FidelityReward/mean": 0.7331844369570414, "rewards/FidelityReward/std": 0.21116992831230164, "rewards/JudgeFidelityReward/mean": 0.7454344034194946, "rewards/JudgeFidelityReward/std": 0.17944316565990448, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 235 }, { "clip_ratio/high_max": 0.0013158308574929833, "clip_ratio/high_mean": 0.0002175040281144902, "clip_ratio/low_mean": 5.063239950686693e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002681364218005911, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/mean_length": 77.0810546875, "completions/min_length": 32.0, "epoch": 0.4838709677419355, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.015625, "kl": 0.002558813150972128, "learning_rate": 9.940304844665917e-07, "loss": 5.9981830418109894e-05, "reward": 1.6030145287513733, "reward_std": 0.2104048654437065, "rewards/FidelityReward/mean": 0.7354603707790375, "rewards/FidelityReward/std": 0.21362753957509995, "rewards/JudgeFidelityReward/mean": 0.7478034794330597, "rewards/JudgeFidelityReward/std": 0.1844249740242958, "rewards/SelfEvolvingFormatReward/mean": 0.9873046875, "rewards/SelfEvolvingFormatReward/std": 0.11128726229071617, "step": 240 }, { "clip_ratio/high_max": 0.0007778969127684831, "clip_ratio/high_mean": 0.000167425669496879, "clip_ratio/low_mean": 3.293849295005202e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000200364162446931, "completions/clipped_ratio": 0.0, "completions/max_length": 232.33333333333334, "completions/mean_length": 77.56640625, "completions/min_length": 32.333333333333336, "epoch": 0.4939516129032258, "frac_reward_zero_std": 0.03125, "grad_norm": 2.0625, "kl": 0.0023338465485721826, "learning_rate": 9.9350592108739e-07, "loss": 3.5391745041124525e-05, "reward": 1.557969570159912, "reward_std": 0.21174705028533936, "rewards/FidelityReward/mean": 0.7048249244689941, "rewards/FidelityReward/std": 0.23668424785137177, "rewards/JudgeFidelityReward/mean": 0.7147527535756429, "rewards/JudgeFidelityReward/std": 0.21478116512298584, "rewards/SelfEvolvingFormatReward/mean": 0.9915364583333334, "rewards/SelfEvolvingFormatReward/std": 0.09108796715736389, "step": 245 }, { "clip_ratio/high_max": 0.0015630280366167426, "clip_ratio/high_mean": 0.0002168779872590676, "clip_ratio/low_mean": 8.126762986648828e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002981456287670881, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 76.119140625, "completions/min_length": 32.5, "epoch": 0.5040322580645161, "frac_reward_zero_std": 0.03125, "grad_norm": 2.21875, "kl": 0.0024170488119125365, "learning_rate": 9.92959420571527e-07, "loss": 0.00010005596559494733, "reward": 1.6128284335136414, "reward_std": 0.20539288222789764, "rewards/FidelityReward/mean": 0.7480037808418274, "rewards/FidelityReward/std": 0.1974068433046341, "rewards/JudgeFidelityReward/mean": 0.7374616861343384, "rewards/JudgeFidelityReward/std": 0.187340147793293, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.062070440500974655, "step": 250 }, { "clip_ratio/high_max": 0.0010669812094420195, "clip_ratio/high_mean": 0.00011722285416908562, "clip_ratio/low_mean": 5.9622543631121516e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017684539780020713, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 75.45833333333333, "completions/min_length": 32.666666666666664, "epoch": 0.5141129032258065, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 1.9375, "kl": 0.0024374651722609995, "learning_rate": 9.923910072118348e-07, "loss": 5.4010632447898387e-05, "reward": 1.5706866184870403, "reward_std": 0.21231508255004883, "rewards/FidelityReward/mean": 0.7117733359336853, "rewards/FidelityReward/std": 0.2123520920674006, "rewards/JudgeFidelityReward/mean": 0.7243369817733765, "rewards/JudgeFidelityReward/std": 0.18178515136241913, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.0803072452545166, "step": 255 }, { "clip_ratio/high_max": 0.0018735669553279876, "clip_ratio/high_mean": 0.0003490012080874294, "clip_ratio/low_mean": 9.21984828892164e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004411996924318373, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 76.7431640625, "completions/min_length": 33.0, "epoch": 0.5241935483870968, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.1875, "kl": 0.0023197198752313852, "learning_rate": 9.918007062752071e-07, "loss": 0.00016568489372730254, "reward": 1.5972990989685059, "reward_std": 0.20712397992610931, "rewards/FidelityReward/mean": 0.7308632433414459, "rewards/FidelityReward/std": 0.19937240332365036, "rewards/JudgeFidelityReward/mean": 0.7397076487541199, "rewards/JudgeFidelityReward/std": 0.18004238605499268, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.08043622970581055, "step": 260 }, { "clip_ratio/high_max": 0.0010086384601891041, "clip_ratio/high_mean": 0.00017341006896458565, "clip_ratio/low_mean": 6.382748833857476e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002372375631239265, "completions/clipped_ratio": 0.0, "completions/max_length": 205.33333333333334, "completions/mean_length": 76.36263020833333, "completions/min_length": 31.666666666666668, "epoch": 0.5342741935483871, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.140625, "kl": 0.0024096535053104162, "learning_rate": 9.911885440014754e-07, "loss": 0.00013410190586000682, "reward": 1.569998820622762, "reward_std": 0.21390988926092783, "rewards/FidelityReward/mean": 0.7041231989860535, "rewards/FidelityReward/std": 0.21163484454154968, "rewards/JudgeFidelityReward/mean": 0.7382616400718689, "rewards/JudgeFidelityReward/std": 0.1887451559305191, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07956527670224507, "step": 265 }, { "clip_ratio/high_max": 0.0019212122075259685, "clip_ratio/high_mean": 0.00020882351527689024, "clip_ratio/low_mean": 0.00015059332363307477, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035941684618592264, "completions/clipped_ratio": 0.0, "completions/max_length": 274.5, "completions/mean_length": 76.8037109375, "completions/min_length": 31.5, "epoch": 0.5443548387096774, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.21875, "kl": 0.002371253678575158, "learning_rate": 9.905545476022432e-07, "loss": 8.756585302762687e-05, "reward": 1.592892050743103, "reward_std": 0.215424045920372, "rewards/FidelityReward/mean": 0.7185666263103485, "rewards/FidelityReward/std": 0.21928489953279495, "rewards/JudgeFidelityReward/mean": 0.7564631402492523, "rewards/JudgeFidelityReward/std": 0.1761614978313446, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.08812850713729858, "step": 270 }, { "clip_ratio/high_max": 0.0010839900001883506, "clip_ratio/high_mean": 0.0001536751165986061, "clip_ratio/low_mean": 0.00011402067029848695, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026769578689709304, "completions/clipped_ratio": 0.0, "completions/max_length": 229.66666666666666, "completions/mean_length": 76.38606770833333, "completions/min_length": 37.666666666666664, "epoch": 0.5544354838709677, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.1875, "kl": 0.002552008954808116, "learning_rate": 9.898987452596764e-07, "loss": 8.179345168173313e-05, "reward": 1.594842831293742, "reward_std": 0.21073834101359049, "rewards/FidelityReward/mean": 0.7319564620653788, "rewards/FidelityReward/std": 0.20358507335186005, "rewards/JudgeFidelityReward/mean": 0.7348873217900594, "rewards/JudgeFidelityReward/std": 0.18172016243139902, "rewards/SelfEvolvingFormatReward/mean": 0.9908854166666666, "rewards/SelfEvolvingFormatReward/std": 0.09231136987606685, "step": 275 }, { "clip_ratio/high_max": 0.002021536836400628, "clip_ratio/high_mean": 0.00023640940780751408, "clip_ratio/low_mean": 0.00013614971830975265, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003725591115653515, "completions/clipped_ratio": 0.0, "completions/max_length": 221.5, "completions/mean_length": 76.6630859375, "completions/min_length": 33.0, "epoch": 0.5645161290322581, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.046875, "kl": 0.002516914764419198, "learning_rate": 9.892211661252498e-07, "loss": 0.00015246723778545858, "reward": 1.6386302709579468, "reward_std": 0.19769922643899918, "rewards/FidelityReward/mean": 0.7625327110290527, "rewards/FidelityReward/std": 0.19950998574495316, "rewards/JudgeFidelityReward/mean": 0.7580544948577881, "rewards/JudgeFidelityReward/std": 0.18720212578773499, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 280 }, { "clip_ratio/high_max": 0.0009316698415204883, "clip_ratio/high_mean": 0.00015615298470947892, "clip_ratio/low_mean": 4.103812607354484e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019719110568985342, "completions/clipped_ratio": 0.0, "completions/max_length": 225.33333333333334, "completions/mean_length": 76.47916666666667, "completions/min_length": 35.666666666666664, "epoch": 0.5745967741935484, "frac_reward_zero_std": 0.046875, "grad_norm": 2.265625, "kl": 0.0024893193040043116, "learning_rate": 9.885218403184528e-07, "loss": 0.0001531324931420386, "reward": 1.623213569323222, "reward_std": 0.19943425059318542, "rewards/FidelityReward/mean": 0.7421566049257914, "rewards/FidelityReward/std": 0.21137947340806326, "rewards/JudgeFidelityReward/mean": 0.7686242858568827, "rewards/JudgeFidelityReward/std": 0.17625017960866293, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07908969124158223, "step": 285 }, { "clip_ratio/high_max": 0.0013207357143983245, "clip_ratio/high_mean": 0.00020493044576141983, "clip_ratio/low_mean": 6.231365041458048e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026724410126917065, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/mean_length": 75.447265625, "completions/min_length": 33.0, "epoch": 0.5846774193548387, "frac_reward_zero_std": 0.03125, "grad_norm": 2.3125, "kl": 0.0027465680614113808, "learning_rate": 9.878007989254485e-07, "loss": 0.00014806247781962156, "reward": 1.6446102857589722, "reward_std": 0.18939480185508728, "rewards/FidelityReward/mean": 0.7597477734088898, "rewards/FidelityReward/std": 0.19545937329530716, "rewards/JudgeFidelityReward/mean": 0.772654801607132, "rewards/JudgeFidelityReward/std": 0.18514742702245712, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 290 }, { "clip_ratio/high_max": 0.001214950392022729, "clip_ratio/high_mean": 0.00021466886391863227, "clip_ratio/low_mean": 7.678132533328608e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002223470015451312, "completions/clipped_ratio": 0.0, "completions/max_length": 231.33333333333334, "completions/mean_length": 76.16015625, "completions/min_length": 32.666666666666664, "epoch": 0.594758064516129, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.109375, "kl": 0.0027552892919629813, "learning_rate": 9.870580739976935e-07, "loss": 0.00028331163339316844, "reward": 1.6077385346094768, "reward_std": 0.20427908500035605, "rewards/FidelityReward/mean": 0.7386068304379781, "rewards/FidelityReward/std": 0.2099431504805883, "rewards/JudgeFidelityReward/mean": 0.7454249461491903, "rewards/JudgeFidelityReward/std": 0.20450535416603088, "rewards/SelfEvolvingFormatReward/mean": 0.9928385416666666, "rewards/SelfEvolvingFormatReward/std": 0.08421787619590759, "step": 295 }, { "clip_ratio/high_max": 0.0019868471659719943, "clip_ratio/high_mean": 0.0003111242898739874, "clip_ratio/low_mean": 0.00011202927125850693, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042315354803577063, "completions/clipped_ratio": 0.0, "completions/max_length": 227.5, "completions/mean_length": 75.703125, "completions/min_length": 32.0, "epoch": 0.6048387096774194, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.171875, "kl": 0.0027867887169122697, "learning_rate": 9.862936985505128e-07, "loss": 0.00016098381020128726, "reward": 1.6165317296981812, "reward_std": 0.20332063734531403, "rewards/FidelityReward/mean": 0.7412256896495819, "rewards/FidelityReward/std": 0.20807379484176636, "rewards/JudgeFidelityReward/mean": 0.7574479877948761, "rewards/JudgeFidelityReward/std": 0.18963442742824554, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.0822625607252121, "step": 300 }, { "clip_ratio/high_max": 0.0013066187966614962, "clip_ratio/high_mean": 0.00018936135747935623, "clip_ratio/low_mean": 6.43294828478247e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025369084905833004, "completions/clipped_ratio": 0.0, "completions/max_length": 200.66666666666666, "completions/mean_length": 75.49869791666667, "completions/min_length": 33.333333333333336, "epoch": 0.6149193548387096, "frac_reward_zero_std": 0.046875, "grad_norm": 2.3125, "kl": 0.002878782385960221, "learning_rate": 9.855077065616315e-07, "loss": 2.4391603074036538e-05, "reward": 1.6148613293965657, "reward_std": 0.20127060015996298, "rewards/FidelityReward/mean": 0.7432370781898499, "rewards/FidelityReward/std": 0.20375916361808777, "rewards/JudgeFidelityReward/mean": 0.7517120043436686, "rewards/JudgeFidelityReward/std": 0.17630178729693094, "rewards/SelfEvolvingFormatReward/mean": 0.9915364583333334, "rewards/SelfEvolvingFormatReward/std": 0.09074912716945012, "step": 305 }, { "clip_ratio/high_max": 0.002004476776346564, "clip_ratio/high_mean": 0.000206852174596861, "clip_ratio/low_mean": 0.00014293257845565677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003497847588732839, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 77.123046875, "completions/min_length": 31.5, "epoch": 0.625, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.09375, "kl": 0.0029592263977974652, "learning_rate": 9.847001329696652e-07, "loss": 0.0001822957186959684, "reward": 1.6015384793281555, "reward_std": 0.19969259947538376, "rewards/FidelityReward/mean": 0.7312135398387909, "rewards/FidelityReward/std": 0.20555494725704193, "rewards/JudgeFidelityReward/mean": 0.7455326914787292, "rewards/JudgeFidelityReward/std": 0.17770697176456451, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 310 }, { "clip_ratio/high_max": 0.0007401495473459363, "clip_ratio/high_mean": 0.0001148746465332806, "clip_ratio/low_mean": 0.0001188705675303936, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023374521406367422, "completions/clipped_ratio": 0.0, "completions/max_length": 211.66666666666666, "completions/mean_length": 75.63346354166667, "completions/min_length": 31.0, "epoch": 0.6350806451612904, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.34375, "kl": 0.0028862398583441974, "learning_rate": 9.838710136725668e-07, "loss": 0.00012372254859656097, "reward": 1.6144988934199016, "reward_std": 0.19939513007799783, "rewards/FidelityReward/mean": 0.7450321118036906, "rewards/FidelityReward/std": 0.21013451119263968, "rewards/JudgeFidelityReward/mean": 0.7428397933642069, "rewards/JudgeFidelityReward/std": 0.18681908150513968, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 315 }, { "clip_ratio/high_max": 0.002004886604845524, "clip_ratio/high_mean": 0.0002573783160187304, "clip_ratio/low_mean": 0.00010730996436905115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003646882832981646, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/mean_length": 74.044921875, "completions/min_length": 34.0, "epoch": 0.6451612903225806, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.234375, "kl": 0.0029578486457467077, "learning_rate": 9.830203855260304e-07, "loss": 0.00015798918902873993, "reward": 1.591860830783844, "reward_std": 0.2114465832710266, "rewards/FidelityReward/mean": 0.7278772294521332, "rewards/FidelityReward/std": 0.19702628254890442, "rewards/JudgeFidelityReward/mean": 0.7318735718727112, "rewards/JudgeFidelityReward/std": 0.17337269335985184, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 320 }, { "clip_ratio/high_max": 0.0011376409325748681, "clip_ratio/high_mean": 0.00018162710475735366, "clip_ratio/low_mean": 4.1094400512520227e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022272150963544845, "completions/clipped_ratio": 0.0, "completions/max_length": 256.3333333333333, "completions/mean_length": 77.43619791666667, "completions/min_length": 34.0, "epoch": 0.655241935483871, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.5, "kl": 0.0031816278118640185, "learning_rate": 9.821482863418537e-07, "loss": -8.871505269780754e-06, "reward": 1.6273669799168904, "reward_std": 0.19914257526397705, "rewards/FidelityReward/mean": 0.7521573901176453, "rewards/FidelityReward/std": 0.19737226764361063, "rewards/JudgeFidelityReward/mean": 0.7556275526682535, "rewards/JudgeFidelityReward/std": 0.18793260057767233, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07174401481946309, "step": 325 }, { "clip_ratio/high_max": 0.0016741338651627303, "clip_ratio/high_mean": 0.00024272059090435505, "clip_ratio/low_mean": 6.937232974451036e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003120929119177163, "completions/clipped_ratio": 0.0, "completions/max_length": 268.5, "completions/mean_length": 75.9794921875, "completions/min_length": 32.0, "epoch": 0.6653225806451613, "frac_reward_zero_std": 0.0390625, "grad_norm": 1.984375, "kl": 0.003177332133054733, "learning_rate": 9.81254754886256e-07, "loss": 7.332940585911274e-05, "reward": 1.6112900972366333, "reward_std": 0.20259395986795425, "rewards/FidelityReward/mean": 0.7375958561897278, "rewards/FidelityReward/std": 0.19600486010313034, "rewards/JudgeFidelityReward/mean": 0.755200982093811, "rewards/JudgeFidelityReward/std": 0.17830181866884232, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.08812850713729858, "step": 330 }, { "clip_ratio/high_max": 0.0019934539683163165, "clip_ratio/high_mean": 0.0002360528102144599, "clip_ratio/low_mean": 3.907275968231261e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002751255757175386, "completions/clipped_ratio": 0.0, "completions/max_length": 198.66666666666666, "completions/mean_length": 76.21028645833333, "completions/min_length": 33.333333333333336, "epoch": 0.6754032258064516, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.234375, "kl": 0.003376556094735861, "learning_rate": 9.803398308781568e-07, "loss": 0.00017515772487968205, "reward": 1.5759168068567913, "reward_std": 0.211767906943957, "rewards/FidelityReward/mean": 0.7096703251202902, "rewards/FidelityReward/std": 0.20909111201763153, "rewards/JudgeFidelityReward/mean": 0.7370503147443136, "rewards/JudgeFidelityReward/std": 0.18058453500270844, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 335 }, { "clip_ratio/high_max": 0.0021852553822100163, "clip_ratio/high_mean": 0.00022013281704857945, "clip_ratio/low_mean": 5.5564477952430026e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002756972913630307, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 74.244140625, "completions/min_length": 34.5, "epoch": 0.6854838709677419, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.296875, "kl": 0.0031226723920553924, "learning_rate": 9.794035549874078e-07, "loss": 8.044074056670069e-05, "reward": 1.5830724835395813, "reward_std": 0.20681045949459076, "rewards/FidelityReward/mean": 0.7203421294689178, "rewards/FidelityReward/std": 0.20909161865711212, "rewards/JudgeFidelityReward/mean": 0.7264372110366821, "rewards/JudgeFidelityReward/std": 0.19104721397161484, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 340 }, { "clip_ratio/high_max": 0.0012390640564262866, "clip_ratio/high_mean": 0.00017345978994853795, "clip_ratio/low_mean": 3.603078512242064e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002094905823469162, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/mean_length": 77.31510416666667, "completions/min_length": 32.0, "epoch": 0.6955645161290323, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.140625, "kl": 0.0031697167083621024, "learning_rate": 9.784459688329878e-07, "loss": -2.6808877009898424e-05, "reward": 1.5705999930699666, "reward_std": 0.21656072636445364, "rewards/FidelityReward/mean": 0.7131428718566895, "rewards/FidelityReward/std": 0.21626592675844827, "rewards/JudgeFidelityReward/mean": 0.7240288654963175, "rewards/JudgeFidelityReward/std": 0.20347357292970022, "rewards/SelfEvolvingFormatReward/mean": 0.9908854166666666, "rewards/SelfEvolvingFormatReward/std": 0.09418417265017827, "step": 345 }, { "clip_ratio/high_max": 0.002709051128476858, "clip_ratio/high_mean": 0.00023875992046669127, "clip_ratio/low_mean": 8.873899350874126e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003274989197961986, "completions/clipped_ratio": 0.0, "completions/max_length": 191.5, "completions/mean_length": 75.8095703125, "completions/min_length": 33.5, "epoch": 0.7056451612903226, "frac_reward_zero_std": 0.0234375, "grad_norm": 1.9765625, "kl": 0.002995402691885829, "learning_rate": 9.774671149811503e-07, "loss": -1.8651559366844593e-05, "reward": 1.6583386063575745, "reward_std": 0.19070467352867126, "rewards/FidelityReward/mean": 0.7644099593162537, "rewards/FidelityReward/std": 0.20552733540534973, "rewards/JudgeFidelityReward/mean": 0.7917634844779968, "rewards/JudgeFidelityReward/std": 0.18006739765405655, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 350 }, { "clip_ratio/high_max": 0.0010475616902112961, "clip_ratio/high_mean": 0.00014282985939644276, "clip_ratio/low_mean": 8.635825361125171e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002291881013661623, "completions/clipped_ratio": 0.0, "completions/max_length": 202.33333333333334, "completions/mean_length": 74.04622395833333, "completions/min_length": 34.333333333333336, "epoch": 0.7157258064516129, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.15625, "kl": 0.003225324489176273, "learning_rate": 9.76467036943533e-07, "loss": 7.803280605003237e-05, "reward": 1.5934796333312988, "reward_std": 0.2012221614519755, "rewards/FidelityReward/mean": 0.7305273214975992, "rewards/FidelityReward/std": 0.21393844485282898, "rewards/JudgeFidelityReward/mean": 0.7337169845898946, "rewards/JudgeFidelityReward/std": 0.18846604724725088, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.07155731568733852, "step": 355 }, { "clip_ratio/high_max": 0.0018426071852445602, "clip_ratio/high_mean": 0.00026659985305741427, "clip_ratio/low_mean": 0.00010094598255818709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003675458487123251, "completions/clipped_ratio": 0.0, "completions/max_length": 194.5, "completions/mean_length": 75.4501953125, "completions/min_length": 31.5, "epoch": 0.7258064516129032, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.15625, "kl": 0.003368240129202604, "learning_rate": 9.75445779175223e-07, "loss": 0.000122145377099514, "reward": 1.6345046758651733, "reward_std": 0.19110313802957535, "rewards/FidelityReward/mean": 0.760875791311264, "rewards/FidelityReward/std": 0.19404229521751404, "rewards/JudgeFidelityReward/mean": 0.7540935277938843, "rewards/JudgeFidelityReward/std": 0.17889262735843658, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.08043622970581055, "step": 360 }, { "clip_ratio/high_max": 0.0016487493179738522, "clip_ratio/high_mean": 0.00010452818823978305, "clip_ratio/low_mean": 2.8197357460157944e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013272554497234523, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 75.33268229166667, "completions/min_length": 34.333333333333336, "epoch": 0.7358870967741935, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.21875, "kl": 0.003168569877743721, "learning_rate": 9.744033870727797e-07, "loss": 8.364318637177348e-05, "reward": 1.605026404062907, "reward_std": 0.20224803686141968, "rewards/FidelityReward/mean": 0.7333139578501383, "rewards/FidelityReward/std": 0.20622047781944275, "rewards/JudgeFidelityReward/mean": 0.7492843667666117, "rewards/JudgeFidelityReward/std": 0.19165091713269553, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07565464576085408, "step": 365 }, { "clip_ratio/high_max": 0.0012850062223151327, "clip_ratio/high_mean": 0.00013953751476947218, "clip_ratio/low_mean": 0.0001324941811617464, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027203169302083554, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 74.0107421875, "completions/min_length": 32.0, "epoch": 0.7459677419354839, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.125, "kl": 0.0033486932050436737, "learning_rate": 9.73339906972219e-07, "loss": 2.40143621340394e-05, "reward": 1.628901183605194, "reward_std": 0.1950421705842018, "rewards/FidelityReward/mean": 0.7501066625118256, "rewards/FidelityReward/std": 0.1989622637629509, "rewards/JudgeFidelityReward/mean": 0.7644250392913818, "rewards/JudgeFidelityReward/std": 0.1830107644200325, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.07595821656286716, "step": 370 }, { "clip_ratio/high_max": 0.0012857143301516771, "clip_ratio/high_mean": 0.00012203795195091516, "clip_ratio/low_mean": 1.918193229357712e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001412198878824711, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 77.24283854166667, "completions/min_length": 34.666666666666664, "epoch": 0.7560483870967742, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.140625, "kl": 0.00354496268555522, "learning_rate": 9.722553861469523e-07, "loss": 0.00013385291676968337, "reward": 1.5978545745213826, "reward_std": 0.20586919784545898, "rewards/FidelityReward/mean": 0.7309906482696533, "rewards/FidelityReward/std": 0.2212108870347341, "rewards/JudgeFidelityReward/mean": 0.742842415968577, "rewards/JudgeFidelityReward/std": 0.19551816582679749, "rewards/SelfEvolvingFormatReward/mean": 0.9908854166666666, "rewards/SelfEvolvingFormatReward/std": 0.09418417265017827, "step": 375 }, { "clip_ratio/high_max": 0.0019283576402813196, "clip_ratio/high_mean": 0.0002247095515485853, "clip_ratio/low_mean": 8.778491028351709e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003124944632872939, "completions/clipped_ratio": 0.0, "completions/max_length": 270.5, "completions/mean_length": 75.302734375, "completions/min_length": 33.5, "epoch": 0.7661290322580645, "frac_reward_zero_std": 0.0625, "grad_norm": 2.328125, "kl": 0.0034782163333147765, "learning_rate": 9.711498728056846e-07, "loss": 0.0001419226871803403, "reward": 1.567171335220337, "reward_std": 0.20496829599142075, "rewards/FidelityReward/mean": 0.7129653096199036, "rewards/FidelityReward/std": 0.2030917853116989, "rewards/JudgeFidelityReward/mean": 0.7084119021892548, "rewards/JudgeFidelityReward/std": 0.21230367571115494, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.001316293841227889, "clip_ratio/high_mean": 0.00015300113009288907, "clip_ratio/low_mean": 7.771360396873206e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023071472533047198, "completions/clipped_ratio": 0.0, "completions/max_length": 201.33333333333334, "completions/mean_length": 75.00716145833333, "completions/min_length": 31.333333333333332, "epoch": 0.7762096774193549, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.234375, "kl": 0.003263033227995038, "learning_rate": 9.700234160902728e-07, "loss": 0.00015619526384398342, "reward": 1.6431833505630493, "reward_std": 0.19499988853931427, "rewards/FidelityReward/mean": 0.7632731199264526, "rewards/FidelityReward/std": 0.19541709125041962, "rewards/JudgeFidelityReward/mean": 0.763075570265452, "rewards/JudgeFidelityReward/std": 0.1822619984547297, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 385 }, { "clip_ratio/high_max": 0.0014188212575390936, "clip_ratio/high_mean": 0.00019410373643040656, "clip_ratio/low_mean": 0.0001005523488856852, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029465609695762397, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/mean_length": 73.8779296875, "completions/min_length": 34.0, "epoch": 0.7862903225806451, "frac_reward_zero_std": 0.015625, "grad_norm": 2.4375, "kl": 0.003546341881155968, "learning_rate": 9.688760660735402e-07, "loss": 0.00017767713870853186, "reward": 1.5937024354934692, "reward_std": 0.2148924246430397, "rewards/FidelityReward/mean": 0.7258866727352142, "rewards/FidelityReward/std": 0.1970176249742508, "rewards/JudgeFidelityReward/mean": 0.7395377159118652, "rewards/JudgeFidelityReward/std": 0.1850525066256523, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 390 }, { "clip_ratio/high_max": 0.0008142068050801754, "clip_ratio/high_mean": 0.00019781413720920682, "clip_ratio/low_mean": 8.191667729988694e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027973082615062594, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 74.59505208333333, "completions/min_length": 34.0, "epoch": 0.7963709677419355, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.140625, "kl": 0.0035549656488001345, "learning_rate": 9.677078737570517e-07, "loss": 0.00013229053001850843, "reward": 1.6106128295262654, "reward_std": 0.20449099441369376, "rewards/FidelityReward/mean": 0.7390865087509155, "rewards/FidelityReward/std": 0.2107552041610082, "rewards/JudgeFidelityReward/mean": 0.7495629588762919, "rewards/JudgeFidelityReward/std": 0.1877656082312266, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07610434914628665, "step": 395 }, { "clip_ratio/high_max": 0.002507927268743515, "clip_ratio/high_mean": 0.0002324261295143515, "clip_ratio/low_mean": 0.0001427518727723509, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003751780022867024, "completions/clipped_ratio": 0.0, "completions/max_length": 227.5, "completions/mean_length": 75.2880859375, "completions/min_length": 33.5, "epoch": 0.8064516129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9921875, "kl": 0.003182622930034995, "learning_rate": 9.665188910688458e-07, "loss": 0.00012974299024790526, "reward": 1.5767424702644348, "reward_std": 0.20488616824150085, "rewards/FidelityReward/mean": 0.7155653238296509, "rewards/FidelityReward/std": 0.2301565185189247, "rewards/JudgeFidelityReward/mean": 0.7252840101718903, "rewards/JudgeFidelityReward/std": 0.2099102959036827, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 400 }, { "clip_ratio/high_max": 0.0010299540590494871, "clip_ratio/high_mean": 0.00010588050354272127, "clip_ratio/low_mean": 4.514697357080877e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015102747129276396, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 75.12369791666667, "completions/min_length": 32.333333333333336, "epoch": 0.8165322580645161, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.921875, "kl": 0.003255288489162922, "learning_rate": 9.653091708611264e-07, "loss": 6.838762201368808e-05, "reward": 1.6178345282872517, "reward_std": 0.20507678389549255, "rewards/FidelityReward/mean": 0.7448057333628336, "rewards/FidelityReward/std": 0.198949267466863, "rewards/JudgeFidelityReward/mean": 0.750614861647288, "rewards/JudgeFidelityReward/std": 0.18701824049154916, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 405 }, { "clip_ratio/high_max": 0.0016459109261631965, "clip_ratio/high_mean": 0.0002516284061130136, "clip_ratio/low_mean": 8.289423712994903e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003345226694364101, "completions/clipped_ratio": 0.0, "completions/max_length": 190.5, "completions/mean_length": 73.1025390625, "completions/min_length": 32.0, "epoch": 0.8266129032258065, "frac_reward_zero_std": 0.03125, "grad_norm": 1.96875, "kl": 0.0033506360370665787, "learning_rate": 9.640787669079144e-07, "loss": 0.00024944143369793893, "reward": 1.5695459246635437, "reward_std": 0.20494718849658966, "rewards/FidelityReward/mean": 0.7126714587211609, "rewards/FidelityReward/std": 0.227609321475029, "rewards/JudgeFidelityReward/mean": 0.7196083664894104, "rewards/JudgeFidelityReward/std": 0.2193503975868225, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07639661431312561, "step": 410 }, { "clip_ratio/high_max": 0.0010273972526192664, "clip_ratio/high_mean": 0.00012680821819230915, "clip_ratio/low_mean": 3.263264370616526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015944086480885745, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 74.54622395833333, "completions/min_length": 36.333333333333336, "epoch": 0.8366935483870968, "frac_reward_zero_std": 0.03125, "grad_norm": 2.328125, "kl": 0.003400250989943743, "learning_rate": 9.62827733902656e-07, "loss": 0.0001523955608718097, "reward": 1.617352565129598, "reward_std": 0.19548503557840982, "rewards/FidelityReward/mean": 0.7392642498016357, "rewards/FidelityReward/std": 0.20968564848105112, "rewards/JudgeFidelityReward/mean": 0.7620360056559244, "rewards/JudgeFidelityReward/std": 0.1898804008960724, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07300814365347226, "step": 415 }, { "clip_ratio/high_max": 0.002097941841930151, "clip_ratio/high_mean": 0.0002161448122933507, "clip_ratio/low_mean": 0.00010681468993425369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003229594964068383, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 74.416015625, "completions/min_length": 32.0, "epoch": 0.8467741935483871, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.078125, "kl": 0.003471101960167289, "learning_rate": 9.615561274557927e-07, "loss": -4.5389775186777116e-05, "reward": 1.5832498669624329, "reward_std": 0.2154698744416237, "rewards/FidelityReward/mean": 0.7261075973510742, "rewards/FidelityReward/std": 0.2080501765012741, "rewards/JudgeFidelityReward/mean": 0.7230735421180725, "rewards/JudgeFidelityReward/std": 0.19407101720571518, "rewards/SelfEvolvingFormatReward/mean": 0.9912109375, "rewards/SelfEvolvingFormatReward/std": 0.0920594371855259, "step": 420 }, { "clip_ratio/high_max": 0.0010786783881485463, "clip_ratio/high_mean": 0.00016759714926593006, "clip_ratio/low_mean": 0.00012000958377029746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028760674176737667, "completions/clipped_ratio": 0.0, "completions/max_length": 204.66666666666666, "completions/mean_length": 73.55533854166667, "completions/min_length": 33.333333333333336, "epoch": 0.8568548387096774, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.3125, "kl": 0.003646107390522957, "learning_rate": 9.60264004092288e-07, "loss": 0.00017391140572726728, "reward": 1.5755774577458699, "reward_std": 0.21255737046400705, "rewards/FidelityReward/mean": 0.7139163414637247, "rewards/FidelityReward/std": 0.20918900767962137, "rewards/JudgeFidelityReward/mean": 0.727879524230957, "rewards/JudgeFidelityReward/std": 0.18150187532107034, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.05362415313720703, "step": 425 }, { "clip_ratio/high_max": 0.0014242589473724364, "clip_ratio/high_mean": 0.0001677646127063781, "clip_ratio/low_mean": 0.00012402277789078653, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002917873847763985, "completions/clipped_ratio": 0.0, "completions/max_length": 230.5, "completions/mean_length": 74.435546875, "completions/min_length": 32.0, "epoch": 0.8669354838709677, "frac_reward_zero_std": 0.046875, "grad_norm": 2.015625, "kl": 0.0033176443073898556, "learning_rate": 9.589514212491163e-07, "loss": 0.00011181997833773494, "reward": 1.5565544366836548, "reward_std": 0.20641054213047028, "rewards/FidelityReward/mean": 0.6953582465648651, "rewards/FidelityReward/std": 0.21307209134101868, "rewards/JudgeFidelityReward/mean": 0.724345475435257, "rewards/JudgeFidelityReward/std": 0.17908954620361328, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 430 }, { "clip_ratio/high_max": 0.0018275573384016752, "clip_ratio/high_mean": 0.00014923018752597273, "clip_ratio/low_mean": 5.7736966846277936e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020696714636869729, "completions/clipped_ratio": 0.0, "completions/max_length": 164.66666666666666, "completions/mean_length": 73.56184895833333, "completions/min_length": 34.0, "epoch": 0.8770161290322581, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.046875, "kl": 0.003306277794763446, "learning_rate": 9.576184372727088e-07, "loss": 0.00014130959752947092, "reward": 1.5964099168777466, "reward_std": 0.20810621480147043, "rewards/FidelityReward/mean": 0.7343758543332418, "rewards/FidelityReward/std": 0.20108435054620108, "rewards/JudgeFidelityReward/mean": 0.7253701289494833, "rewards/JudgeFidelityReward/std": 0.19842028617858887, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 435 }, { "clip_ratio/high_max": 0.001808472815901041, "clip_ratio/high_mean": 0.00028433128027245403, "clip_ratio/low_mean": 0.00012257650669198484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004069078015163541, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 76.3955078125, "completions/min_length": 31.0, "epoch": 0.8870967741935484, "frac_reward_zero_std": 0.015625, "grad_norm": 2.15625, "kl": 0.0034071278292685746, "learning_rate": 9.562651114163595e-07, "loss": 8.052109042182565e-05, "reward": 1.6543192863464355, "reward_std": 0.18823904544115067, "rewards/FidelityReward/mean": 0.7719813585281372, "rewards/FidelityReward/std": 0.19796733558177948, "rewards/JudgeFidelityReward/mean": 0.7685820460319519, "rewards/JudgeFidelityReward/std": 0.18294009566307068, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 440 }, { "clip_ratio/high_max": 0.0020722313784062862, "clip_ratio/high_mean": 0.00025685186265036466, "clip_ratio/low_mean": 5.37767686182633e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003106286283582449, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 75.0, "completions/min_length": 33.666666666666664, "epoch": 0.8971774193548387, "frac_reward_zero_std": 0.046875, "grad_norm": 2.234375, "kl": 0.0036086780484765766, "learning_rate": 9.548915038375927e-07, "loss": 0.0001725410227663815, "reward": 1.5936236381530762, "reward_std": 0.20693915585676828, "rewards/FidelityReward/mean": 0.7259300549825033, "rewards/FidelityReward/std": 0.2074479560057322, "rewards/JudgeFidelityReward/mean": 0.741246501604716, "rewards/JudgeFidelityReward/std": 0.19192010660966238, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.0734837291141351, "step": 445 }, { "clip_ratio/high_max": 0.0014450440881773829, "clip_ratio/high_mean": 0.00021245286334306, "clip_ratio/low_mean": 9.32169692532625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003056698362343013, "completions/clipped_ratio": 0.0, "completions/max_length": 192.5, "completions/mean_length": 72.9033203125, "completions/min_length": 32.5, "epoch": 0.907258064516129, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.390625, "kl": 0.0035986676812171936, "learning_rate": 9.534976755954878e-07, "loss": 0.00015276784542948008, "reward": 1.6114169359207153, "reward_std": 0.19568924605846405, "rewards/FidelityReward/mean": 0.7427796721458435, "rewards/FidelityReward/std": 0.19222036749124527, "rewards/JudgeFidelityReward/mean": 0.7431339025497437, "rewards/JudgeFidelityReward/std": 0.17747990787029266, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07131390832364559, "step": 450 }, { "clip_ratio/high_max": 0.0012154258554801344, "clip_ratio/high_mean": 0.00017941404366865753, "clip_ratio/low_mean": 4.077233606949449e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022018636809661984, "completions/clipped_ratio": 0.0, "completions/max_length": 232.66666666666666, "completions/mean_length": 74.689453125, "completions/min_length": 34.333333333333336, "epoch": 0.9173387096774194, "frac_reward_zero_std": 0.03125, "grad_norm": 2.21875, "kl": 0.0036416539456695317, "learning_rate": 9.520836886479648e-07, "loss": 0.00013680162373930216, "reward": 1.6225932836532593, "reward_std": 0.2029707282781601, "rewards/FidelityReward/mean": 0.75175940990448, "rewards/FidelityReward/std": 0.19950036704540253, "rewards/JudgeFidelityReward/mean": 0.7455740372339884, "rewards/JudgeFidelityReward/std": 0.1872014751036962, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.04754260554909706, "step": 455 }, { "clip_ratio/high_max": 0.0018150960560888052, "clip_ratio/high_mean": 0.00020029924053233117, "clip_ratio/low_mean": 0.00011600038342294284, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003162996144965291, "completions/clipped_ratio": 0.0, "completions/max_length": 210.5, "completions/mean_length": 76.8134765625, "completions/min_length": 32.0, "epoch": 0.9274193548387096, "frac_reward_zero_std": 0.0390625, "grad_norm": 1.953125, "kl": 0.0036030960734933616, "learning_rate": 9.506496058490318e-07, "loss": 0.00021024607121944427, "reward": 1.5880165100097656, "reward_std": 0.20692020654678345, "rewards/FidelityReward/mean": 0.7282078862190247, "rewards/FidelityReward/std": 0.20159916579723358, "rewards/JudgeFidelityReward/mean": 0.7303594648838043, "rewards/JudgeFidelityReward/std": 0.19510173052549362, "rewards/SelfEvolvingFormatReward/mean": 0.9892578125, "rewards/SelfEvolvingFormatReward/std": 0.09135611914098263, "step": 460 }, { "clip_ratio/high_max": 0.0010834884829819202, "clip_ratio/high_mean": 0.00014392734738066792, "clip_ratio/low_mean": 4.6576625027228144e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019050397677347063, "completions/clipped_ratio": 0.0, "completions/max_length": 229.66666666666666, "completions/mean_length": 76.72005208333333, "completions/min_length": 33.666666666666664, "epoch": 0.9375, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.21875, "kl": 0.003469265252351761, "learning_rate": 9.491954909459894e-07, "loss": -1.4035822823643685e-05, "reward": 1.6168943643569946, "reward_std": 0.2025804320971171, "rewards/FidelityReward/mean": 0.7453358372052511, "rewards/FidelityReward/std": 0.20750045279661813, "rewards/JudgeFidelityReward/mean": 0.7483254671096802, "rewards/JudgeFidelityReward/std": 0.19258654117584229, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06835554415980975, "step": 465 }, { "clip_ratio/high_max": 0.0017863365355879068, "clip_ratio/high_mean": 0.00023616202524863185, "clip_ratio/low_mean": 6.569331235368736e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003018553426954895, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 73.8095703125, "completions/min_length": 36.5, "epoch": 0.9475806451612904, "frac_reward_zero_std": 0.0390625, "grad_norm": 1.9375, "kl": 0.0035008850507438185, "learning_rate": 9.477214085765974e-07, "loss": 7.169916061684489e-05, "reward": 1.6110008358955383, "reward_std": 0.19787679612636566, "rewards/FidelityReward/mean": 0.7399646043777466, "rewards/FidelityReward/std": 0.20494937151670456, "rewards/JudgeFidelityReward/mean": 0.7430489659309387, "rewards/JudgeFidelityReward/std": 0.20595244318246841, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 470 }, { "clip_ratio/high_max": 0.001558528793975711, "clip_ratio/high_mean": 0.00016799596487544478, "clip_ratio/low_mean": 9.08637885004282e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025885975919663905, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 75.10611979166667, "completions/min_length": 33.333333333333336, "epoch": 0.9576612903225806, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.09375, "kl": 0.003716399520635605, "learning_rate": 9.462274242662025e-07, "loss": 0.00022369502112269403, "reward": 1.5878278414408367, "reward_std": 0.20188860595226288, "rewards/FidelityReward/mean": 0.7282669544219971, "rewards/FidelityReward/std": 0.20414317150910696, "rewards/JudgeFidelityReward/mean": 0.7236790657043457, "rewards/JudgeFidelityReward/std": 0.19323292871316275, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06566246723135312, "step": 475 }, { "clip_ratio/high_max": 0.0022500901482999324, "clip_ratio/high_mean": 0.00019611713360063733, "clip_ratio/low_mean": 9.725074050948024e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029336787993088364, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/mean_length": 75.203125, "completions/min_length": 36.0, "epoch": 0.967741935483871, "frac_reward_zero_std": 0.015625, "grad_norm": 2.0, "kl": 0.003613591333851218, "learning_rate": 9.447136044248243e-07, "loss": 0.00011318452889099717, "reward": 1.593558430671692, "reward_std": 0.19220629334449768, "rewards/FidelityReward/mean": 0.7252277731895447, "rewards/FidelityReward/std": 0.23206043988466263, "rewards/JudgeFidelityReward/mean": 0.7386143207550049, "rewards/JudgeFidelityReward/std": 0.21817822754383087, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 480 }, { "clip_ratio/high_max": 0.0011723757954314352, "clip_ratio/high_mean": 0.00016218077507801354, "clip_ratio/low_mean": 0.00010235755471512675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002645383181516081, "completions/clipped_ratio": 0.0, "completions/max_length": 234.33333333333334, "completions/mean_length": 74.73177083333333, "completions/min_length": 32.0, "epoch": 0.9778225806451613, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.140625, "kl": 0.0034535580314695837, "learning_rate": 9.431800163442041e-07, "loss": 0.00014449101872742175, "reward": 1.5814368327458699, "reward_std": 0.20238933463891348, "rewards/FidelityReward/mean": 0.7213548421859741, "rewards/FidelityReward/std": 0.20362207293510437, "rewards/JudgeFidelityReward/mean": 0.7234192490577698, "rewards/JudgeFidelityReward/std": 0.19171919922033945, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 485 }, { "clip_ratio/high_max": 0.001171713531948626, "clip_ratio/high_mean": 0.00015593113494105637, "clip_ratio/low_mean": 9.808331960812212e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025401445454917846, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/mean_length": 75.1474609375, "completions/min_length": 31.0, "epoch": 0.9879032258064516, "frac_reward_zero_std": 0.0625, "grad_norm": 2.15625, "kl": 0.003644200135022402, "learning_rate": 9.416267281948132e-07, "loss": 0.0002147066406905651, "reward": 1.623420536518097, "reward_std": 0.1851251944899559, "rewards/FidelityReward/mean": 0.7496626675128937, "rewards/FidelityReward/std": 0.19945240020751953, "rewards/JudgeFidelityReward/mean": 0.7484922409057617, "rewards/JudgeFidelityReward/std": 0.18619056791067123, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 490 }, { "clip_ratio/high_max": 0.0011841311119496823, "clip_ratio/high_mean": 0.0002235025051049888, "clip_ratio/low_mean": 3.65105108357966e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002600130159407854, "completions/clipped_ratio": 0.0, "completions/max_length": 212.33333333333334, "completions/mean_length": 75.50260416666667, "completions/min_length": 31.0, "epoch": 0.9979838709677419, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.296875, "kl": 0.0037629819475114346, "learning_rate": 9.400538090228233e-07, "loss": 0.00012270202860236167, "reward": 1.598408301671346, "reward_std": 0.1926950067281723, "rewards/FidelityReward/mean": 0.7283265988032023, "rewards/FidelityReward/std": 0.2096781631310781, "rewards/JudgeFidelityReward/mean": 0.7421165506045023, "rewards/JudgeFidelityReward/std": 0.1967570036649704, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 495 }, { "clip_ratio/high_max": 0.0023770046420395373, "clip_ratio/high_mean": 0.0002930303162429482, "clip_ratio/low_mean": 0.00015881494909990578, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004518452798947692, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 74.904296875, "completions/min_length": 33.5, "epoch": 1.0080645161290323, "frac_reward_zero_std": 0.0078125, "grad_norm": 2.125, "kl": 0.003726558154448867, "learning_rate": 9.384613287470362e-07, "loss": 0.00016939626075327397, "reward": 1.5805119276046753, "reward_std": 0.20925211161375046, "rewards/FidelityReward/mean": 0.7181563079357147, "rewards/FidelityReward/std": 0.21119672060012817, "rewards/JudgeFidelityReward/mean": 0.7256878912448883, "rewards/JudgeFidelityReward/std": 0.20211216807365417, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 500 }, { "clip_ratio/high_max": 0.0007584890816360712, "clip_ratio/high_mean": 8.21422814624384e-05, "clip_ratio/low_mean": 6.381955463439227e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001459618390072137, "completions/clipped_ratio": 0.0, "completions/max_length": 219.33333333333334, "completions/mean_length": 74.767578125, "completions/min_length": 33.666666666666664, "epoch": 1.0181451612903225, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.140625, "kl": 0.003499349532648921, "learning_rate": 9.368493581557769e-07, "loss": 0.00020882603712379932, "reward": 1.6462950309117634, "reward_std": 0.1922173649072647, "rewards/FidelityReward/mean": 0.7669726014137268, "rewards/FidelityReward/std": 0.19106647372245789, "rewards/JudgeFidelityReward/mean": 0.7612489859263102, "rewards/JudgeFidelityReward/std": 0.17602909108002981, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 505 }, { "clip_ratio/high_max": 0.0022038863971829414, "clip_ratio/high_mean": 0.0001755740464432165, "clip_ratio/low_mean": 8.095951707218774e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025653355987742543, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/mean_length": 76.04296875, "completions/min_length": 35.0, "epoch": 1.028225806451613, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.0, "kl": 0.0034431429114192722, "learning_rate": 9.352179689037461e-07, "loss": 0.0002549342345446348, "reward": 1.6514723300933838, "reward_std": 0.1780329942703247, "rewards/FidelityReward/mean": 0.7705270051956177, "rewards/FidelityReward/std": 0.2007790356874466, "rewards/JudgeFidelityReward/mean": 0.7657969892024994, "rewards/JudgeFidelityReward/std": 0.1925663724541664, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 510 }, { "clip_ratio/high_max": 0.0007970415987074375, "clip_ratio/high_mean": 9.531349351163954e-05, "clip_ratio/low_mean": 9.94635745882988e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019477707101032138, "completions/clipped_ratio": 0.0, "completions/max_length": 197.66666666666666, "completions/mean_length": 73.75716145833333, "completions/min_length": 34.0, "epoch": 1.0383064516129032, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.109375, "kl": 0.0035792432725429537, "learning_rate": 9.335672335088355e-07, "loss": 0.00022075343877077104, "reward": 1.5881810983022053, "reward_std": 0.19155045847098032, "rewards/FidelityReward/mean": 0.7259675860404968, "rewards/FidelityReward/std": 0.20778378347555795, "rewards/JudgeFidelityReward/mean": 0.7270310322443644, "rewards/JudgeFidelityReward/std": 0.19932294388612112, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 515 }, { "clip_ratio/high_max": 0.0013308865018188953, "clip_ratio/high_mean": 0.00013808992080157622, "clip_ratio/low_mean": 9.305482089985161e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002311447518877685, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 76.63671875, "completions/min_length": 34.0, "epoch": 1.0483870967741935, "frac_reward_zero_std": 0.0625, "grad_norm": 2.09375, "kl": 0.0035867986269295217, "learning_rate": 9.318972253489045e-07, "loss": 0.00013508474221453072, "reward": 1.6485061049461365, "reward_std": 0.1879657655954361, "rewards/FidelityReward/mean": 0.7681467831134796, "rewards/FidelityReward/std": 0.19286666810512543, "rewards/JudgeFidelityReward/mean": 0.7636482417583466, "rewards/JudgeFidelityReward/std": 0.17846491932868958, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 520 }, { "clip_ratio/high_max": 0.000786901987157762, "clip_ratio/high_mean": 0.00011640689917840064, "clip_ratio/low_mean": 4.760270530823618e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016400960739701987, "completions/clipped_ratio": 0.0, "completions/max_length": 188.33333333333334, "completions/mean_length": 73.47721354166667, "completions/min_length": 33.333333333333336, "epoch": 1.0584677419354838, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.328125, "kl": 0.003433499438688159, "learning_rate": 9.302080186585179e-07, "loss": 9.903124300763011e-05, "reward": 1.6026536623636882, "reward_std": 0.20714689791202545, "rewards/FidelityReward/mean": 0.7375296354293823, "rewards/FidelityReward/std": 0.2089687486489614, "rewards/JudgeFidelityReward/mean": 0.7328522602717081, "rewards/JudgeFidelityReward/std": 0.2059452384710312, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 525 }, { "clip_ratio/high_max": 0.0016353435814380646, "clip_ratio/high_mean": 0.00026386984973214567, "clip_ratio/low_mean": 7.49308557715267e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003388006880413741, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 73.3134765625, "completions/min_length": 33.0, "epoch": 1.0685483870967742, "frac_reward_zero_std": 0.0625, "grad_norm": 2.46875, "kl": 0.003476974740624428, "learning_rate": 9.284996885256458e-07, "loss": 0.00021256315521895887, "reward": 1.589996337890625, "reward_std": 0.19888553023338318, "rewards/FidelityReward/mean": 0.7220189571380615, "rewards/FidelityReward/std": 0.20946024358272552, "rewards/JudgeFidelityReward/mean": 0.7388843894004822, "rewards/JudgeFidelityReward/std": 0.19012704491615295, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 530 }, { "clip_ratio/high_max": 0.0009862352628260851, "clip_ratio/high_mean": 0.0001701774133834988, "clip_ratio/low_mean": 9.343967831227928e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026361709460616113, "completions/clipped_ratio": 0.0, "completions/max_length": 189.33333333333334, "completions/mean_length": 75.27669270833333, "completions/min_length": 32.333333333333336, "epoch": 1.0786290322580645, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.234375, "kl": 0.0032292513642460108, "learning_rate": 9.267723108883268e-07, "loss": 0.00018552177352830767, "reward": 1.6207613945007324, "reward_std": 0.19791900118192038, "rewards/FidelityReward/mean": 0.7420472701390585, "rewards/FidelityReward/std": 0.2054333289464315, "rewards/JudgeFidelityReward/mean": 0.7593814929326376, "rewards/JudgeFidelityReward/std": 0.18653305868307749, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 535 }, { "clip_ratio/high_max": 0.0014797952957451344, "clip_ratio/high_mean": 0.00025116748292930424, "clip_ratio/low_mean": 0.0001337924441031646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003849599277600646, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 75.537109375, "completions/min_length": 32.5, "epoch": 1.0887096774193548, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.125, "kl": 0.0033902379684150217, "learning_rate": 9.250259625312916e-07, "loss": 9.6505181863904e-05, "reward": 1.6320975422859192, "reward_std": 0.19210845232009888, "rewards/FidelityReward/mean": 0.7561315298080444, "rewards/FidelityReward/std": 0.20060937106609344, "rewards/JudgeFidelityReward/mean": 0.7548616528511047, "rewards/JudgeFidelityReward/std": 0.1955794394016266, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 540 }, { "clip_ratio/high_max": 0.0015539086889475584, "clip_ratio/high_mean": 0.00018147206865251064, "clip_ratio/low_mean": 8.908923191484064e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002705612918362021, "completions/clipped_ratio": 0.0, "completions/max_length": 196.66666666666666, "completions/mean_length": 73.86783854166667, "completions/min_length": 33.333333333333336, "epoch": 1.0987903225806452, "frac_reward_zero_std": 0.078125, "grad_norm": 1.9765625, "kl": 0.00332570425234735, "learning_rate": 9.2326072108255e-07, "loss": 0.00011084664147347212, "reward": 1.6369022925694783, "reward_std": 0.18409612278143564, "rewards/FidelityReward/mean": 0.7535685300827026, "rewards/FidelityReward/std": 0.19531740248203278, "rewards/JudgeFidelityReward/mean": 0.767969528834025, "rewards/JudgeFidelityReward/std": 0.17570483684539795, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 545 }, { "clip_ratio/high_max": 0.001435672794468701, "clip_ratio/high_mean": 0.00019773704116232693, "clip_ratio/low_mean": 0.00016521114157512783, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003629481769166887, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 74.2265625, "completions/min_length": 32.5, "epoch": 1.1088709677419355, "frac_reward_zero_std": 0.0390625, "grad_norm": 1.9765625, "kl": 0.003335633687674999, "learning_rate": 9.214766650099408e-07, "loss": 8.115480304695665e-05, "reward": 1.620646357536316, "reward_std": 0.19622685760259628, "rewards/FidelityReward/mean": 0.7462309002876282, "rewards/FidelityReward/std": 0.2156522050499916, "rewards/JudgeFidelityReward/mean": 0.7537136375904083, "rewards/JudgeFidelityReward/std": 0.1985473707318306, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 550 }, { "clip_ratio/high_max": 0.0014293357729911805, "clip_ratio/high_mean": 0.00020335575100034476, "clip_ratio/low_mean": 4.598129307851195e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002493370557203889, "completions/clipped_ratio": 0.0, "completions/max_length": 190.33333333333334, "completions/mean_length": 74.6171875, "completions/min_length": 31.0, "epoch": 1.1189516129032258, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.125, "kl": 0.0034847544971853494, "learning_rate": 9.196738736176428e-07, "loss": 0.00015407662140205504, "reward": 1.6073570648829143, "reward_std": 0.19115016361077627, "rewards/FidelityReward/mean": 0.734769880771637, "rewards/FidelityReward/std": 0.20682587226231894, "rewards/JudgeFidelityReward/mean": 0.7484295964241028, "rewards/JudgeFidelityReward/std": 0.18232408662637076, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 555 }, { "clip_ratio/high_max": 0.0020423254929482935, "clip_ratio/high_mean": 0.00037562341894954444, "clip_ratio/low_mean": 9.553042473271489e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004711538553237915, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 76.990234375, "completions/min_length": 36.0, "epoch": 1.129032258064516, "frac_reward_zero_std": 0.015625, "grad_norm": 2.109375, "kl": 0.0033298964146524666, "learning_rate": 9.178524270426502e-07, "loss": 7.172013865783811e-05, "reward": 1.6112347841262817, "reward_std": 0.20654379576444626, "rewards/FidelityReward/mean": 0.7387530505657196, "rewards/FidelityReward/std": 0.20456420630216599, "rewards/JudgeFidelityReward/mean": 0.7478930950164795, "rewards/JudgeFidelityReward/std": 0.19748066365718842, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 560 }, { "clip_ratio/high_max": 0.0012911347672343253, "clip_ratio/high_mean": 0.00013265617890283464, "clip_ratio/low_mean": 8.631791279185563e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021897407714277506, "completions/clipped_ratio": 0.0, "completions/max_length": 217.66666666666666, "completions/mean_length": 74.662109375, "completions/min_length": 33.666666666666664, "epoch": 1.1391129032258065, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.171875, "kl": 0.003310439921915531, "learning_rate": 9.160124062512104e-07, "loss": 0.00010836385190486908, "reward": 1.6411219040552776, "reward_std": 0.1910504251718521, "rewards/FidelityReward/mean": 0.7614614367485046, "rewards/FidelityReward/std": 0.19644617040952048, "rewards/JudgeFidelityReward/mean": 0.7619250416755676, "rewards/JudgeFidelityReward/std": 0.19402121504147848, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 565 }, { "clip_ratio/high_max": 0.0017819267697632313, "clip_ratio/high_mean": 0.00029260744922794403, "clip_ratio/low_mean": 0.000102278619306162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003948860801756382, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 75.388671875, "completions/min_length": 32.0, "epoch": 1.1491935483870968, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.265625, "kl": 0.00348663697950542, "learning_rate": 9.141538930352244e-07, "loss": 0.0001499232603237033, "reward": 1.6079723834991455, "reward_std": 0.19409578293561935, "rewards/FidelityReward/mean": 0.7367365956306458, "rewards/FidelityReward/std": 0.20902155339717865, "rewards/JudgeFidelityReward/mean": 0.7444248199462891, "rewards/JudgeFidelityReward/std": 0.18433324247598648, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 570 }, { "clip_ratio/high_max": 0.0010019007604569197, "clip_ratio/high_mean": 0.00020001147640869021, "clip_ratio/low_mean": 3.9516501419711855e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023952798219397664, "completions/clipped_ratio": 0.0, "completions/max_length": 217.66666666666666, "completions/mean_length": 74.19596354166667, "completions/min_length": 33.0, "epoch": 1.159274193548387, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.015625, "kl": 0.0035289299674332144, "learning_rate": 9.122769700086119e-07, "loss": 2.994682581629604e-05, "reward": 1.6376818418502808, "reward_std": 0.18154818813006082, "rewards/FidelityReward/mean": 0.7610440254211426, "rewards/FidelityReward/std": 0.19991150995095572, "rewards/JudgeFidelityReward/mean": 0.7584839264551798, "rewards/JudgeFidelityReward/std": 0.19898520906766257, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.05875233809153239, "step": 575 }, { "clip_ratio/high_max": 0.0022299102041870356, "clip_ratio/high_mean": 0.00030797640793025496, "clip_ratio/low_mean": 8.424201514571905e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000392218423075974, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 288.5, "completions/mean_length": 74.1904296875, "completions/min_length": 32.0, "epoch": 1.1693548387096775, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.171875, "kl": 0.003221688000485301, "learning_rate": 9.103817206036382e-07, "loss": 0.00019378915894776582, "reward": 1.5749393701553345, "reward_std": 0.21259748935699463, "rewards/FidelityReward/mean": 0.7139135599136353, "rewards/FidelityReward/std": 0.20557010173797607, "rewards/JudgeFidelityReward/mean": 0.7288876473903656, "rewards/JudgeFidelityReward/std": 0.19285614043474197, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.08043622970581055, "step": 580 }, { "clip_ratio/high_max": 0.000998556800186634, "clip_ratio/high_mean": 0.0001292276894673705, "clip_ratio/low_mean": 8.071087067946791e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020993856014683843, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 75.94856770833333, "completions/min_length": 33.0, "epoch": 1.1794354838709677, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.046875, "kl": 0.0030935286078602074, "learning_rate": 9.084682290672054e-07, "loss": 0.00012836636742576956, "reward": 1.612806002298991, "reward_std": 0.19813678661982217, "rewards/FidelityReward/mean": 0.7374562422434489, "rewards/FidelityReward/std": 0.19483970602353415, "rewards/JudgeFidelityReward/mean": 0.7513506015141805, "rewards/JudgeFidelityReward/std": 0.1824225684007009, "rewards/SelfEvolvingFormatReward/mean": 0.9993489583333334, "rewards/SelfEvolvingFormatReward/std": 0.014731391022602717, "step": 585 }, { "clip_ratio/high_max": 0.0019368555396795273, "clip_ratio/high_mean": 0.00021055025281384586, "clip_ratio/low_mean": 0.00011730403202818707, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003278542892076075, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 75.404296875, "completions/min_length": 34.5, "epoch": 1.189516129032258, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.046875, "kl": 0.003326195012778044, "learning_rate": 9.065365804571088e-07, "loss": 1.888846600195393e-05, "reward": 1.6311485767364502, "reward_std": 0.19623849540948868, "rewards/FidelityReward/mean": 0.748270571231842, "rewards/FidelityReward/std": 0.21364907175302505, "rewards/JudgeFidelityReward/mean": 0.7706389725208282, "rewards/JudgeFidelityReward/std": 0.1868349313735962, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 590 }, { "clip_ratio/high_max": 0.0012776535004377365, "clip_ratio/high_mean": 0.00019171849125996233, "clip_ratio/low_mean": 8.85542598553002e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002802727511152625, "completions/clipped_ratio": 0.0, "completions/max_length": 201.33333333333334, "completions/mean_length": 75.845703125, "completions/min_length": 33.333333333333336, "epoch": 1.1995967741935485, "frac_reward_zero_std": 0.046875, "grad_norm": 1.953125, "kl": 0.003206081362441182, "learning_rate": 9.045868606382538e-07, "loss": 0.00021854215301573277, "reward": 1.608991026878357, "reward_std": 0.20920229951540628, "rewards/FidelityReward/mean": 0.7367909153302511, "rewards/FidelityReward/std": 0.20279129842917124, "rewards/JudgeFidelityReward/mean": 0.7489575544993082, "rewards/JudgeFidelityReward/std": 0.18255079289277396, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.05362415313720703, "step": 595 }, { "clip_ratio/high_max": 0.0018868097569793463, "clip_ratio/high_mean": 0.00028395303525030613, "clip_ratio/low_mean": 0.00010094659955939278, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038489961298182607, "completions/clipped_ratio": 0.0, "completions/max_length": 238.5, "completions/mean_length": 75.8603515625, "completions/min_length": 36.0, "epoch": 1.2096774193548387, "frac_reward_zero_std": 0.0625, "grad_norm": 2.28125, "kl": 0.003324106661602855, "learning_rate": 9.026191562788415e-07, "loss": 1.135899219661951e-06, "reward": 1.6413321495056152, "reward_std": 0.189909465610981, "rewards/FidelityReward/mean": 0.7592511177062988, "rewards/FidelityReward/std": 0.1877257376909256, "rewards/JudgeFidelityReward/mean": 0.7670918703079224, "rewards/JudgeFidelityReward/std": 0.1806408315896988, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 600 }, { "clip_ratio/high_max": 0.0008783928817138076, "clip_ratio/high_mean": 0.00010326302726753056, "clip_ratio/low_mean": 6.149167165858671e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001647546945605427, "completions/clipped_ratio": 0.0, "completions/max_length": 221.66666666666666, "completions/mean_length": 76.12044270833333, "completions/min_length": 33.333333333333336, "epoch": 1.219758064516129, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9609375, "kl": 0.0032531611621379854, "learning_rate": 9.00633554846514e-07, "loss": 0.00015426366589963437, "reward": 1.6239542961120605, "reward_std": 0.19172940651575723, "rewards/FidelityReward/mean": 0.74470454454422, "rewards/FidelityReward/std": 0.20524614055951437, "rewards/JudgeFidelityReward/mean": 0.7624057133992513, "rewards/JudgeFidelityReward/std": 0.18060003717740378, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.04754260554909706, "step": 605 }, { "clip_ratio/high_max": 0.0012845756486058236, "clip_ratio/high_mean": 0.00019575466867536307, "clip_ratio/low_mean": 0.00010909613192779943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030485080205835404, "completions/clipped_ratio": 0.0, "completions/max_length": 210.5, "completions/mean_length": 75.2822265625, "completions/min_length": 33.5, "epoch": 1.2298387096774193, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.09375, "kl": 0.0032131943386048078, "learning_rate": 8.986301446044676e-07, "loss": 8.896399522200227e-05, "reward": 1.6232876181602478, "reward_std": 0.1891619861125946, "rewards/FidelityReward/mean": 0.7415268421173096, "rewards/FidelityReward/std": 0.19719228893518448, "rewards/JudgeFidelityReward/mean": 0.7674277722835541, "rewards/JudgeFidelityReward/std": 0.16754883527755737, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 610 }, { "clip_ratio/high_max": 0.0009894866961985826, "clip_ratio/high_mean": 0.0001161213731393218, "clip_ratio/low_mean": 8.7224887101911e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020334626315161587, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 74.99544270833333, "completions/min_length": 32.333333333333336, "epoch": 1.2399193548387097, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.515625, "kl": 0.0033285082783550022, "learning_rate": 8.966090146075291e-07, "loss": 0.00019685861188918353, "reward": 1.6097251971562703, "reward_std": 0.19424951573212942, "rewards/FidelityReward/mean": 0.7343238592147827, "rewards/FidelityReward/std": 0.20161395271619162, "rewards/JudgeFidelityReward/mean": 0.7521047790845236, "rewards/JudgeFidelityReward/std": 0.18269074459870657, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 615 }, { "clip_ratio/high_max": 0.0013127005659043789, "clip_ratio/high_mean": 0.00017812474397942423, "clip_ratio/low_mean": 9.142248309217394e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000269547215430066, "completions/clipped_ratio": 0.0, "completions/max_length": 197.5, "completions/mean_length": 76.2880859375, "completions/min_length": 33.5, "epoch": 1.25, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.234375, "kl": 0.003461890621110797, "learning_rate": 8.945702546981968e-07, "loss": 0.00013946343678981066, "reward": 1.5945574045181274, "reward_std": 0.20678403973579407, "rewards/FidelityReward/mean": 0.732734739780426, "rewards/FidelityReward/std": 0.1994636356830597, "rewards/JudgeFidelityReward/mean": 0.7275514602661133, "rewards/JudgeFidelityReward/std": 0.18932580202817917, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 620 }, { "clip_ratio/high_max": 0.001365375961177051, "clip_ratio/high_mean": 0.00021276476909406483, "clip_ratio/low_mean": 1.7631852824706584e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023039662046357988, "completions/clipped_ratio": 0.0, "completions/max_length": 232.66666666666666, "completions/mean_length": 76.52669270833333, "completions/min_length": 33.0, "epoch": 1.2600806451612903, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.140625, "kl": 0.0035285252146422864, "learning_rate": 8.925139555026473e-07, "loss": 6.334880599752068e-05, "reward": 1.5921732187271118, "reward_std": 0.2057236284017563, "rewards/FidelityReward/mean": 0.7264052430788676, "rewards/FidelityReward/std": 0.2160342981417974, "rewards/JudgeFidelityReward/mean": 0.7367443442344666, "rewards/JudgeFidelityReward/std": 0.18692918121814728, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07100204626719157, "step": 625 }, { "clip_ratio/high_max": 0.0018943045288324356, "clip_ratio/high_mean": 0.0003665481228381395, "clip_ratio/low_mean": 8.026483701542019e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00044681295985355974, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 76.337890625, "completions/min_length": 34.0, "epoch": 1.2701612903225805, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.15625, "kl": 0.0036774768959730864, "learning_rate": 8.90440208426707e-07, "loss": 0.00013948054984211922, "reward": 1.632437527179718, "reward_std": 0.19778937101364136, "rewards/FidelityReward/mean": 0.7539342045783997, "rewards/FidelityReward/std": 0.2069338709115982, "rewards/JudgeFidelityReward/mean": 0.7628660202026367, "rewards/JudgeFidelityReward/std": 0.20006193220615387, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07639661431312561, "step": 630 }, { "clip_ratio/high_max": 0.0012495001312345267, "clip_ratio/high_mean": 0.0001645627082325518, "clip_ratio/low_mean": 7.464316149707884e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023920584935694934, "completions/clipped_ratio": 0.0, "completions/max_length": 255.66666666666666, "completions/mean_length": 76.814453125, "completions/min_length": 32.0, "epoch": 1.280241935483871, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.125, "kl": 0.003692268207669258, "learning_rate": 8.883491056517884e-07, "loss": 8.458188967779279e-05, "reward": 1.6441163221995037, "reward_std": 0.19254034757614136, "rewards/FidelityReward/mean": 0.7640542785326639, "rewards/FidelityReward/std": 0.1979845662911733, "rewards/JudgeFidelityReward/mean": 0.7653323610623678, "rewards/JudgeFidelityReward/std": 0.1865114023288091, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06957309817274411, "step": 635 }, { "clip_ratio/high_max": 0.0014950294047594071, "clip_ratio/high_mean": 0.00015233377926051616, "clip_ratio/low_mean": 9.000337304314599e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024233714793808758, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 75.5556640625, "completions/min_length": 33.5, "epoch": 1.2903225806451613, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.0, "kl": 0.0034374089911580084, "learning_rate": 8.862407401307934e-07, "loss": 0.00021584341302514077, "reward": 1.6759803295135498, "reward_std": 0.1745346486568451, "rewards/FidelityReward/mean": 0.7796679139137268, "rewards/FidelityReward/std": 0.1987016424536705, "rewards/JudgeFidelityReward/mean": 0.794577956199646, "rewards/JudgeFidelityReward/std": 0.18223830312490463, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 640 }, { "clip_ratio/high_max": 0.0016009056009352208, "clip_ratio/high_mean": 0.0002745814505033195, "clip_ratio/low_mean": 3.0896822136128324e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003054782631807029, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/mean_length": 75.23111979166667, "completions/min_length": 33.333333333333336, "epoch": 1.3004032258064515, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.265625, "kl": 0.0036021945998072624, "learning_rate": 8.841152055839806e-07, "loss": 0.00012351255863904954, "reward": 1.6410905122756958, "reward_std": 0.1916411817073822, "rewards/FidelityReward/mean": 0.7549005150794983, "rewards/FidelityReward/std": 0.2094531704982122, "rewards/JudgeFidelityReward/mean": 0.7782393495241801, "rewards/JudgeFidelityReward/std": 0.1874872793753942, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.0734837291141351, "step": 645 }, { "clip_ratio/high_max": 0.0014054435072466732, "clip_ratio/high_mean": 0.00024546217173337936, "clip_ratio/low_mean": 0.00013367982173804194, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037914200220257044, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/mean_length": 74.5771484375, "completions/min_length": 32.5, "epoch": 1.310483870967742, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9609375, "kl": 0.00369371366687119, "learning_rate": 8.819725964947995e-07, "loss": 0.000248456746339798, "reward": 1.5932749509811401, "reward_std": 0.1907499060034752, "rewards/FidelityReward/mean": 0.7279483079910278, "rewards/FidelityReward/std": 0.22240214049816132, "rewards/JudgeFidelityReward/mean": 0.7316297888755798, "rewards/JudgeFidelityReward/std": 0.22417157888412476, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 650 }, { "clip_ratio/high_max": 0.001604291796684265, "clip_ratio/high_mean": 0.00021479703718796372, "clip_ratio/low_mean": 7.563143444713205e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029042846290394664, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 74.43229166666667, "completions/min_length": 32.666666666666664, "epoch": 1.3205645161290323, "frac_reward_zero_std": 0.0625, "grad_norm": 1.96875, "kl": 0.0035498138517141343, "learning_rate": 8.79813008105691e-07, "loss": 7.70477287005633e-05, "reward": 1.5975842873255413, "reward_std": 0.19558570782343546, "rewards/FidelityReward/mean": 0.7261700431505839, "rewards/FidelityReward/std": 0.223571186264356, "rewards/JudgeFidelityReward/mean": 0.7480368614196777, "rewards/JudgeFidelityReward/std": 0.19956834117571512, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06835554415980975, "step": 655 }, { "clip_ratio/high_max": 0.0014307500328868627, "clip_ratio/high_mean": 0.0001749038026900962, "clip_ratio/low_mean": 6.868313503218814e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024358693626709283, "completions/clipped_ratio": 0.0, "completions/max_length": 229.5, "completions/mean_length": 73.5009765625, "completions/min_length": 29.5, "epoch": 1.3306451612903225, "frac_reward_zero_std": 0.0625, "grad_norm": 1.8515625, "kl": 0.0035727576352655886, "learning_rate": 8.776365364138525e-07, "loss": 0.00018859412521123886, "reward": 1.6249927878379822, "reward_std": 0.189725860953331, "rewards/FidelityReward/mean": 0.7462618052959442, "rewards/FidelityReward/std": 0.19566188752651215, "rewards/JudgeFidelityReward/mean": 0.7633213400840759, "rewards/JudgeFidelityReward/std": 0.1698402836918831, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07639661431312561, "step": 660 }, { "clip_ratio/high_max": 0.0010617754887789488, "clip_ratio/high_mean": 0.00013754141400568187, "clip_ratio/low_mean": 8.271397091448307e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022025539074093102, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 75.078125, "completions/min_length": 33.333333333333336, "epoch": 1.340725806451613, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.984375, "kl": 0.0037686909548938275, "learning_rate": 8.754432781669728e-07, "loss": 0.00020099978428333998, "reward": 1.5905930201212566, "reward_std": 0.20474832753340402, "rewards/FidelityReward/mean": 0.7252245942751566, "rewards/FidelityReward/std": 0.19990364710489908, "rewards/JudgeFidelityReward/mean": 0.7346431215604147, "rewards/JudgeFidelityReward/std": 0.18363689382870993, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 665 }, { "clip_ratio/high_max": 0.0028761631809175016, "clip_ratio/high_mean": 0.0003396879881620407, "clip_ratio/low_mean": 5.3710238717030734e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039339824579656126, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/mean_length": 74.125, "completions/min_length": 38.5, "epoch": 1.3508064516129032, "frac_reward_zero_std": 0.078125, "grad_norm": 2.21875, "kl": 0.003456853283569217, "learning_rate": 8.732333308589293e-07, "loss": 3.138819884043187e-05, "reward": 1.650431513786316, "reward_std": 0.17702319473028183, "rewards/FidelityReward/mean": 0.768574982881546, "rewards/FidelityReward/std": 0.1867859959602356, "rewards/JudgeFidelityReward/mean": 0.7637131214141846, "rewards/JudgeFidelityReward/std": 0.18350443243980408, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0007920529460534453, "clip_ratio/high_mean": 0.0001214043761137873, "clip_ratio/low_mean": 3.404385861358605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015544823254458605, "completions/clipped_ratio": 0.0, "completions/max_length": 237.66666666666666, "completions/mean_length": 74.078125, "completions/min_length": 33.333333333333336, "epoch": 1.3608870967741935, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.21875, "kl": 0.0035529250744730234, "learning_rate": 8.710067927254554e-07, "loss": 0.00020849253050982952, "reward": 1.6255322297414143, "reward_std": 0.19734853009382883, "rewards/FidelityReward/mean": 0.750289261341095, "rewards/FidelityReward/std": 0.19515116016070047, "rewards/JudgeFidelityReward/mean": 0.7537410656611124, "rewards/JudgeFidelityReward/std": 0.17728510002295175, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 675 }, { "clip_ratio/high_max": 0.0015265317633748055, "clip_ratio/high_mean": 0.00023071533651091157, "clip_ratio/low_mean": 0.00013397395377978683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003646892961114645, "completions/clipped_ratio": 0.0, "completions/max_length": 215.5, "completions/mean_length": 73.04296875, "completions/min_length": 31.5, "epoch": 1.370967741935484, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.453125, "kl": 0.003528269054368138, "learning_rate": 8.687637627397736e-07, "loss": 0.00011523320572450756, "reward": 1.5844515562057495, "reward_std": 0.20672860741615295, "rewards/FidelityReward/mean": 0.7237606346607208, "rewards/FidelityReward/std": 0.2075192555785179, "rewards/JudgeFidelityReward/mean": 0.7243117094039917, "rewards/JudgeFidelityReward/std": 0.1902826577425003, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 680 }, { "clip_ratio/high_max": 0.001930274348706007, "clip_ratio/high_mean": 0.00018424351583234966, "clip_ratio/low_mean": 4.935361430398188e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023359712213277817, "completions/clipped_ratio": 0.0, "completions/max_length": 196.33333333333334, "completions/mean_length": 76.515625, "completions/min_length": 32.333333333333336, "epoch": 1.3810483870967742, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 1.953125, "kl": 0.003612079471349716, "learning_rate": 8.665043406081959e-07, "loss": 0.00017887039575725794, "reward": 1.6032399733861287, "reward_std": 0.18780227998892465, "rewards/FidelityReward/mean": 0.7346210877100626, "rewards/FidelityReward/std": 0.21059379975001016, "rewards/JudgeFidelityReward/mean": 0.7404929796854655, "rewards/JudgeFidelityReward/std": 0.19486969709396362, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 685 }, { "clip_ratio/high_max": 0.0016258503776043654, "clip_ratio/high_mean": 0.00019449505489319563, "clip_ratio/low_mean": 0.00010242630887660197, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002969213528558612, "completions/clipped_ratio": 0.0, "completions/max_length": 231.5, "completions/mean_length": 73.328125, "completions/min_length": 32.0, "epoch": 1.3911290322580645, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.046875, "kl": 0.003460288466885686, "learning_rate": 8.642286267656916e-07, "loss": 0.00011310081463307142, "reward": 1.6089423894882202, "reward_std": 0.20089133083820343, "rewards/FidelityReward/mean": 0.7334210872650146, "rewards/FidelityReward/std": 0.1929113268852234, "rewards/JudgeFidelityReward/mean": 0.7559254765510559, "rewards/JudgeFidelityReward/std": 0.16507473587989807, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 690 }, { "clip_ratio/high_max": 0.00104796530213207, "clip_ratio/high_mean": 0.0002235113875940442, "clip_ratio/low_mean": 3.5363644565222784e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025887503288686277, "completions/clipped_ratio": 0.0, "completions/max_length": 196.33333333333334, "completions/mean_length": 73.33138020833333, "completions/min_length": 31.666666666666668, "epoch": 1.4012096774193548, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.03125, "kl": 0.0035660707391798495, "learning_rate": 8.619367223714231e-07, "loss": 0.00019740264397114516, "reward": 1.607433597246806, "reward_std": 0.1885577936967214, "rewards/FidelityReward/mean": 0.7372048497200012, "rewards/FidelityReward/std": 0.20859457552433014, "rewards/JudgeFidelityReward/mean": 0.7443638642628988, "rewards/JudgeFidelityReward/std": 0.1939282864332199, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 695 }, { "clip_ratio/high_max": 0.0028057904914021493, "clip_ratio/high_mean": 0.00043705882271751764, "clip_ratio/low_mean": 0.00011230182135477663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005493606440722942, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 75.595703125, "completions/min_length": 35.0, "epoch": 1.4112903225806452, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9609375, "kl": 0.0037797615863382817, "learning_rate": 8.596287293042489e-07, "loss": 0.0001788265770301223, "reward": 1.6364941596984863, "reward_std": 0.18413805216550827, "rewards/FidelityReward/mean": 0.75811368227005, "rewards/FidelityReward/std": 0.19574426114559174, "rewards/JudgeFidelityReward/mean": 0.7606672048568726, "rewards/JudgeFidelityReward/std": 0.18200107663869858, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 700 }, { "clip_ratio/high_max": 0.001207055151462555, "clip_ratio/high_mean": 0.00015768736484460532, "clip_ratio/low_mean": 7.776033889967948e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023544770665466784, "completions/clipped_ratio": 0.0, "completions/max_length": 215.66666666666666, "completions/mean_length": 75.76432291666667, "completions/min_length": 32.666666666666664, "epoch": 1.4213709677419355, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.265625, "kl": 0.003271968895569444, "learning_rate": 8.573047501581951e-07, "loss": 0.00013430085964500905, "reward": 1.6357983350753784, "reward_std": 0.18715102970600128, "rewards/FidelityReward/mean": 0.7546847661336263, "rewards/FidelityReward/std": 0.20067955056826273, "rewards/JudgeFidelityReward/mean": 0.7641803820927938, "rewards/JudgeFidelityReward/std": 0.18814371526241302, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 705 }, { "clip_ratio/high_max": 0.001959692779928446, "clip_ratio/high_mean": 0.0002641329192556441, "clip_ratio/low_mean": 0.00016255484952125697, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000426687765866518, "completions/clipped_ratio": 0.0, "completions/max_length": 207.5, "completions/mean_length": 76.3173828125, "completions/min_length": 33.0, "epoch": 1.4314516129032258, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.0625, "kl": 0.0034284746274352075, "learning_rate": 8.549648882378946e-07, "loss": 0.00018771778559312224, "reward": 1.6028215885162354, "reward_std": 0.20269698649644852, "rewards/FidelityReward/mean": 0.7339736223220825, "rewards/FidelityReward/std": 0.20947516709566116, "rewards/JudgeFidelityReward/mean": 0.7425787448883057, "rewards/JudgeFidelityReward/std": 0.1923869401216507, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 710 }, { "clip_ratio/high_max": 0.0011912497226148843, "clip_ratio/high_mean": 0.00022423671907745302, "clip_ratio/low_mean": 6.779028990422376e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002920270140748471, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/mean_length": 76.724609375, "completions/min_length": 36.0, "epoch": 1.441532258064516, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 1.890625, "kl": 0.0033730565570294856, "learning_rate": 8.526092475539958e-07, "loss": 6.793711800128222e-05, "reward": 1.6085057258605957, "reward_std": 0.19744369884332022, "rewards/FidelityReward/mean": 0.7374211351076762, "rewards/FidelityReward/std": 0.2138290504614512, "rewards/JudgeFidelityReward/mean": 0.7467263340950012, "rewards/JudgeFidelityReward/std": 0.1992805302143097, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.062273996571699776, "step": 715 }, { "clip_ratio/high_max": 0.0022912837564945223, "clip_ratio/high_mean": 0.00037700943648815154, "clip_ratio/low_mean": 9.899945871438831e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004760089097544551, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/mean_length": 74.3681640625, "completions/min_length": 35.5, "epoch": 1.4516129032258065, "frac_reward_zero_std": 0.078125, "grad_norm": 2.0625, "kl": 0.003780394047498703, "learning_rate": 8.502379328185384e-07, "loss": 0.00014881883980706335, "reward": 1.628044068813324, "reward_std": 0.18570178002119064, "rewards/FidelityReward/mean": 0.7502396702766418, "rewards/FidelityReward/std": 0.1976233497262001, "rewards/JudgeFidelityReward/mean": 0.7604916095733643, "rewards/JudgeFidelityReward/std": 0.18009310215711594, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 720 }, { "clip_ratio/high_max": 0.00111869047395885, "clip_ratio/high_mean": 0.00013551540905609727, "clip_ratio/low_mean": 2.0585310267051682e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015610072296112776, "completions/clipped_ratio": 0.0, "completions/max_length": 257.3333333333333, "completions/mean_length": 74.35546875, "completions/min_length": 33.0, "epoch": 1.4616935483870968, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.09375, "kl": 0.00351148578338325, "learning_rate": 8.478510494402988e-07, "loss": 0.00013680504634976386, "reward": 1.599442680676778, "reward_std": 0.1989391545454661, "rewards/FidelityReward/mean": 0.7341169516245524, "rewards/FidelityReward/std": 0.21768425901730856, "rewards/JudgeFidelityReward/mean": 0.7358597715695699, "rewards/JudgeFidelityReward/std": 0.2027409424384435, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07174401481946309, "step": 725 }, { "clip_ratio/high_max": 0.0020043952856212853, "clip_ratio/high_mean": 0.0003018290619365871, "clip_ratio/low_mean": 0.00012582278359332121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004276518477126956, "completions/clipped_ratio": 0.0, "completions/max_length": 201.5, "completions/mean_length": 73.08984375, "completions/min_length": 34.5, "epoch": 1.471774193548387, "frac_reward_zero_std": 0.0625, "grad_norm": 2.390625, "kl": 0.0038472670596092938, "learning_rate": 8.454487035201055e-07, "loss": 0.000143245211802423, "reward": 1.6108475923538208, "reward_std": 0.19381389021873474, "rewards/FidelityReward/mean": 0.7390163838863373, "rewards/FidelityReward/std": 0.20594951510429382, "rewards/JudgeFidelityReward/mean": 0.7475687563419342, "rewards/JudgeFidelityReward/std": 0.19730141013860703, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 730 }, { "clip_ratio/high_max": 0.0010737914824858307, "clip_ratio/high_mean": 0.00016361910675186664, "clip_ratio/low_mean": 7.957346679177135e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002431925619021058, "completions/clipped_ratio": 0.0, "completions/max_length": 231.33333333333334, "completions/mean_length": 73.78255208333333, "completions/min_length": 33.666666666666664, "epoch": 1.4818548387096775, "frac_reward_zero_std": 0.11458333333333333, "grad_norm": 2.078125, "kl": 0.003480457840487361, "learning_rate": 8.43031001846121e-07, "loss": 0.00011873642215505242, "reward": 1.6322510639826457, "reward_std": 0.1710647145907084, "rewards/FidelityReward/mean": 0.7513734499613444, "rewards/FidelityReward/std": 0.20885869363943735, "rewards/JudgeFidelityReward/mean": 0.7624062101046244, "rewards/JudgeFidelityReward/std": 0.1890715758005778, "rewards/SelfEvolvingFormatReward/mean": 0.9993489583333334, "rewards/SelfEvolvingFormatReward/std": 0.014731391022602717, "step": 735 }, { "clip_ratio/high_max": 0.0016768498346209526, "clip_ratio/high_mean": 0.0002827884687576443, "clip_ratio/low_mean": 0.0001345451339147985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004173335968516767, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/mean_length": 76.53515625, "completions/min_length": 34.0, "epoch": 1.4919354838709677, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.046875, "kl": 0.00367811624892056, "learning_rate": 8.405980518890967e-07, "loss": 9.682738455012441e-05, "reward": 1.5875986218452454, "reward_std": 0.20550260692834854, "rewards/FidelityReward/mean": 0.7242696285247803, "rewards/FidelityReward/std": 0.23214030265808105, "rewards/JudgeFidelityReward/mean": 0.7315408885478973, "rewards/JudgeFidelityReward/std": 0.22106479853391647, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 740 }, { "clip_ratio/high_max": 0.0010197840631008149, "clip_ratio/high_mean": 0.00020557157695293425, "clip_ratio/low_mean": 7.27644277503714e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002783360076136887, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/mean_length": 75.158203125, "completions/min_length": 31.666666666666668, "epoch": 1.502016129032258, "frac_reward_zero_std": 0.0625, "grad_norm": 2.109375, "kl": 0.003903386276215315, "learning_rate": 8.381499617975946e-07, "loss": 3.7547259125858544e-06, "reward": 1.605461557706197, "reward_std": 0.1921925793091456, "rewards/FidelityReward/mean": 0.7367771863937378, "rewards/FidelityReward/std": 0.20870657761891684, "rewards/JudgeFidelityReward/mean": 0.741926113764445, "rewards/JudgeFidelityReward/std": 0.19476579129695892, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 745 }, { "clip_ratio/high_max": 0.0014177114237099886, "clip_ratio/high_mean": 0.0001676111714914441, "clip_ratio/low_mean": 0.0001238680531969294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002914792217779905, "completions/clipped_ratio": 0.0, "completions/max_length": 231.5, "completions/mean_length": 72.0078125, "completions/min_length": 32.0, "epoch": 1.5120967741935485, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.203125, "kl": 0.0037392024416476486, "learning_rate": 8.356868403931803e-07, "loss": 0.0002582948189228773, "reward": 1.5381697416305542, "reward_std": 0.21207699179649353, "rewards/FidelityReward/mean": 0.6904847621917725, "rewards/FidelityReward/std": 0.21305683255195618, "rewards/JudgeFidelityReward/mean": 0.6953699588775635, "rewards/JudgeFidelityReward/std": 0.19517897069454193, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0014492639806121589, "clip_ratio/high_mean": 0.0002088619687128812, "clip_ratio/low_mean": 7.448700198438018e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028334895614534616, "completions/clipped_ratio": 0.0, "completions/max_length": 225.66666666666666, "completions/mean_length": 75.20442708333333, "completions/min_length": 33.333333333333336, "epoch": 1.5221774193548387, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.203125, "kl": 0.0037283933721482753, "learning_rate": 8.33208797165585e-07, "loss": 8.568688062950968e-05, "reward": 1.6137044429779053, "reward_std": 0.19870010515054068, "rewards/FidelityReward/mean": 0.7421625057856241, "rewards/FidelityReward/std": 0.19629046817620596, "rewards/JudgeFidelityReward/mean": 0.7450369795163473, "rewards/JudgeFidelityReward/std": 0.18641015887260437, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 755 }, { "clip_ratio/high_max": 0.001652307389304042, "clip_ratio/high_mean": 0.00017776433960534633, "clip_ratio/low_mean": 9.179420594591647e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026955854264087975, "completions/clipped_ratio": 0.0, "completions/max_length": 251.5, "completions/mean_length": 75.2578125, "completions/min_length": 33.0, "epoch": 1.532258064516129, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.140625, "kl": 0.0035654558334499596, "learning_rate": 8.3071594226784e-07, "loss": 0.0001302841119468212, "reward": 1.5909894704818726, "reward_std": 0.20170116424560547, "rewards/FidelityReward/mean": 0.72828009724617, "rewards/FidelityReward/std": 0.21506460011005402, "rewards/JudgeFidelityReward/mean": 0.7283483445644379, "rewards/JudgeFidelityReward/std": 0.20582108199596405, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 760 }, { "clip_ratio/high_max": 0.0013075795723125339, "clip_ratio/high_mean": 0.00012449650675989689, "clip_ratio/low_mean": 7.05798709532246e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019507638062350453, "completions/clipped_ratio": 0.0, "completions/max_length": 211.66666666666666, "completions/mean_length": 74.87630208333333, "completions/min_length": 35.333333333333336, "epoch": 1.5423387096774195, "frac_reward_zero_std": 0.0625, "grad_norm": 2.234375, "kl": 0.003952575381845236, "learning_rate": 8.282083865113785e-07, "loss": 0.0002575176302343607, "reward": 1.6191606521606445, "reward_std": 0.19953595598538718, "rewards/FidelityReward/mean": 0.7476562658945719, "rewards/FidelityReward/std": 0.19393291076024374, "rewards/JudgeFidelityReward/mean": 0.7482170263926188, "rewards/JudgeFidelityReward/std": 0.17989063262939453, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06835554415980975, "step": 765 }, { "clip_ratio/high_max": 0.0018820853438228369, "clip_ratio/high_mean": 0.00032842204673215746, "clip_ratio/low_mean": 6.344935754896142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039187141228467224, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 74.052734375, "completions/min_length": 34.0, "epoch": 1.5524193548387095, "frac_reward_zero_std": 0.078125, "grad_norm": 2.109375, "kl": 0.003879649750888348, "learning_rate": 8.256862413611112e-07, "loss": 2.3394687741529196e-05, "reward": 1.6072525382041931, "reward_std": 0.1921481266617775, "rewards/FidelityReward/mean": 0.7377997636795044, "rewards/FidelityReward/std": 0.20815159380435944, "rewards/JudgeFidelityReward/mean": 0.7398821413516998, "rewards/JudgeFidelityReward/std": 0.19652455300092697, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 770 }, { "clip_ratio/high_max": 0.0010526081081479789, "clip_ratio/high_mean": 7.770525407977403e-05, "clip_ratio/low_mean": 0.00010847753437701613, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001861827971879393, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 76.376953125, "completions/min_length": 34.0, "epoch": 1.5625, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.1875, "kl": 0.0038392658345401285, "learning_rate": 8.231496189304704e-07, "loss": 0.00021659866906702517, "reward": 1.621914307276408, "reward_std": 0.19908575216929117, "rewards/FidelityReward/mean": 0.7429630160331726, "rewards/FidelityReward/std": 0.21314328908920288, "rewards/JudgeFidelityReward/mean": 0.762459933757782, "rewards/JudgeFidelityReward/std": 0.19016849994659424, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.05362415313720703, "step": 775 }, { "clip_ratio/high_max": 0.0018541774712502956, "clip_ratio/high_mean": 0.0001838219934143126, "clip_ratio/low_mean": 0.00011099351395387203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002948155102785677, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 77.0517578125, "completions/min_length": 33.5, "epoch": 1.5725806451612905, "frac_reward_zero_std": 0.03125, "grad_norm": 2.234375, "kl": 0.0037529848981648684, "learning_rate": 8.205986319764276e-07, "loss": 6.957196164876222e-05, "reward": 1.6075104475021362, "reward_std": 0.19329436123371124, "rewards/FidelityReward/mean": 0.7364306151866913, "rewards/FidelityReward/std": 0.2080443874001503, "rewards/JudgeFidelityReward/mean": 0.7441127598285675, "rewards/JudgeFidelityReward/std": 0.20355241000652313, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 780 }, { "clip_ratio/high_max": 0.0013500779634341598, "clip_ratio/high_mean": 0.00013500702334567904, "clip_ratio/low_mean": 7.520778162870557e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021021479042246938, "completions/clipped_ratio": 0.0, "completions/max_length": 216.33333333333334, "completions/mean_length": 75.67057291666667, "completions/min_length": 34.333333333333336, "epoch": 1.5826612903225805, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9609375, "kl": 0.0038614793214946986, "learning_rate": 8.180333938944803e-07, "loss": 0.00016464988002553582, "reward": 1.6344128449757893, "reward_std": 0.18885886669158936, "rewards/FidelityReward/mean": 0.7572546601295471, "rewards/FidelityReward/std": 0.2024814486503601, "rewards/JudgeFidelityReward/mean": 0.7575715382893881, "rewards/JudgeFidelityReward/std": 0.1916777491569519, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 785 }, { "clip_ratio/high_max": 0.002201330941170454, "clip_ratio/high_mean": 0.00029509119922295214, "clip_ratio/low_mean": 8.817178750177845e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003832629648968577, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 77.880859375, "completions/min_length": 31.0, "epoch": 1.592741935483871, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.0625, "kl": 0.003718973556533456, "learning_rate": 8.154540187136115e-07, "loss": 0.00010176445357501507, "reward": 1.60820072889328, "reward_std": 0.19080662727355957, "rewards/FidelityReward/mean": 0.7367359399795532, "rewards/FidelityReward/std": 0.23046162724494934, "rewards/JudgeFidelityReward/mean": 0.7468357682228088, "rewards/JudgeFidelityReward/std": 0.22356083244085312, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 790 }, { "clip_ratio/high_max": 0.0007546498905867338, "clip_ratio/high_mean": 0.00013491648132912815, "clip_ratio/low_mean": 0.00013164502452127634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002665614942088723, "completions/clipped_ratio": 0.0, "completions/max_length": 214.33333333333334, "completions/mean_length": 75.75325520833333, "completions/min_length": 32.666666666666664, "epoch": 1.6028225806451613, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.203125, "kl": 0.003947620606049895, "learning_rate": 8.128606210912215e-07, "loss": 0.00013500202912837268, "reward": 1.6266104380289714, "reward_std": 0.1877302328745524, "rewards/FidelityReward/mean": 0.7493086457252502, "rewards/FidelityReward/std": 0.21726447840531668, "rewards/JudgeFidelityReward/mean": 0.7598118782043457, "rewards/JudgeFidelityReward/std": 0.2040102183818817, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07174401481946309, "step": 795 }, { "clip_ratio/high_max": 0.001680393167771399, "clip_ratio/high_mean": 0.0002677296230103821, "clip_ratio/low_mean": 0.00010748991626314819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037521954509429635, "completions/clipped_ratio": 0.0, "completions/max_length": 272.5, "completions/mean_length": 75.537109375, "completions/min_length": 30.5, "epoch": 1.6129032258064515, "frac_reward_zero_std": 0.0625, "grad_norm": 2.15625, "kl": 0.003956712782382965, "learning_rate": 8.102533163080303e-07, "loss": 0.00022748482879251242, "reward": 1.6204094886779785, "reward_std": 0.19397956877946854, "rewards/FidelityReward/mean": 0.7448211908340454, "rewards/FidelityReward/std": 0.20862339437007904, "rewards/JudgeFidelityReward/mean": 0.7580125331878662, "rewards/JudgeFidelityReward/std": 0.1953229382634163, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.0822625607252121, "step": 800 }, { "clip_ratio/high_max": 0.001660401001572609, "clip_ratio/high_mean": 0.0002089143788907677, "clip_ratio/low_mean": 4.997950600227341e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025889389216899873, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 74.181640625, "completions/min_length": 33.333333333333336, "epoch": 1.622983870967742, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.9609375, "kl": 0.00380551521666348, "learning_rate": 8.076322202629542e-07, "loss": 0.00017993529327213763, "reward": 1.6252650419871013, "reward_std": 0.1947882076104482, "rewards/FidelityReward/mean": 0.7495838403701782, "rewards/FidelityReward/std": 0.19928001364072165, "rewards/JudgeFidelityReward/mean": 0.7559196949005127, "rewards/JudgeFidelityReward/std": 0.18737037976582846, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06566246723135312, "step": 805 }, { "clip_ratio/high_max": 0.0022101088659837844, "clip_ratio/high_mean": 0.00016458503087051212, "clip_ratio/low_mean": 0.00013111692387610674, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029570193728432057, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 76.55078125, "completions/min_length": 32.0, "epoch": 1.6330645161290323, "frac_reward_zero_std": 0.0625, "grad_norm": 1.890625, "kl": 0.00378048662096262, "learning_rate": 8.049974494679531e-07, "loss": 0.00010176064679399133, "reward": 1.6133186221122742, "reward_std": 0.1903330311179161, "rewards/FidelityReward/mean": 0.737410694360733, "rewards/FidelityReward/std": 0.20589664578437805, "rewards/JudgeFidelityReward/mean": 0.7557219564914703, "rewards/JudgeFidelityReward/std": 0.1844061315059662, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 810 }, { "clip_ratio/high_max": 0.0007875698385760189, "clip_ratio/high_mean": 8.744823353481478e-05, "clip_ratio/low_mean": 0.00013454305299092083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022199128288775684, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 73.27083333333333, "completions/min_length": 31.666666666666668, "epoch": 1.6431451612903225, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.046875, "kl": 0.0036961557809263466, "learning_rate": 8.02349121042852e-07, "loss": 5.596964620053768e-05, "reward": 1.600343147913615, "reward_std": 0.19307803610960642, "rewards/FidelityReward/mean": 0.7337480783462524, "rewards/FidelityReward/std": 0.20488778253396353, "rewards/JudgeFidelityReward/mean": 0.7364453474680582, "rewards/JudgeFidelityReward/std": 0.18672878046830496, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 815 }, { "clip_ratio/high_max": 0.0015016297809779644, "clip_ratio/high_mean": 0.00019890281255356966, "clip_ratio/low_mean": 0.00010458668257342651, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003034894994925708, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 293.5, "completions/mean_length": 76.03515625, "completions/min_length": 35.0, "epoch": 1.653225806451613, "frac_reward_zero_std": 0.0625, "grad_norm": 2.0, "kl": 0.0036112205125391483, "learning_rate": 7.996873527101341e-07, "loss": 0.0002772978274151683, "reward": 1.6146870851516724, "reward_std": 0.1914563998579979, "rewards/FidelityReward/mean": 0.7398465573787689, "rewards/FidelityReward/std": 0.20939451456069946, "rewards/JudgeFidelityReward/mean": 0.7526108622550964, "rewards/JudgeFidelityReward/std": 0.20207177847623825, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 820 }, { "clip_ratio/high_max": 0.0009937394876033067, "clip_ratio/high_mean": 0.0001649530720897019, "clip_ratio/low_mean": 8.229724044213071e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002472503110766411, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 75.06575520833333, "completions/min_length": 33.333333333333336, "epoch": 1.6633064516129032, "frac_reward_zero_std": 0.046875, "grad_norm": 2.25, "kl": 0.0035839808639138936, "learning_rate": 7.970122627897085e-07, "loss": 0.0001562644261866808, "reward": 1.6360057592391968, "reward_std": 0.1893797665834427, "rewards/FidelityReward/mean": 0.7549645105997721, "rewards/FidelityReward/std": 0.19715415438016257, "rewards/JudgeFidelityReward/mean": 0.7633845607439677, "rewards/JudgeFidelityReward/std": 0.17728727062543234, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 825 }, { "clip_ratio/high_max": 0.0015351495472714304, "clip_ratio/high_mean": 0.00018467198533471673, "clip_ratio/low_mean": 0.00011666826612781734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030134025146253405, "completions/clipped_ratio": 0.0, "completions/max_length": 228.5, "completions/mean_length": 76.9560546875, "completions/min_length": 35.5, "epoch": 1.6733870967741935, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.109375, "kl": 0.0035410865675657988, "learning_rate": 7.9432397019365e-07, "loss": 1.1737694148905575e-05, "reward": 1.6161231398582458, "reward_std": 0.1951608806848526, "rewards/FidelityReward/mean": 0.744577944278717, "rewards/FidelityReward/std": 0.20421337336301804, "rewards/JudgeFidelityReward/mean": 0.7479730844497681, "rewards/JudgeFidelityReward/std": 0.1743839904665947, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 830 }, { "clip_ratio/high_max": 0.0014192060101777316, "clip_ratio/high_mean": 0.0002330766525119543, "clip_ratio/low_mean": 4.0215595799963924e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027329223230481147, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 76.33463541666667, "completions/min_length": 34.0, "epoch": 1.683467741935484, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.140625, "kl": 0.003450315212830901, "learning_rate": 7.916225944209145e-07, "loss": 0.00021036784164607525, "reward": 1.5832541386286418, "reward_std": 0.20197726289431253, "rewards/FidelityReward/mean": 0.7194936076800028, "rewards/FidelityReward/std": 0.21555976072947183, "rewards/JudgeFidelityReward/mean": 0.7288230856259664, "rewards/JudgeFidelityReward/std": 0.20900293191274008, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 835 }, { "clip_ratio/high_max": 0.0018301811534911395, "clip_ratio/high_mean": 0.0001828995009418577, "clip_ratio/low_mean": 0.00012297052599024027, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030587001238018273, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 78.5556640625, "completions/min_length": 30.0, "epoch": 1.6935483870967742, "frac_reward_zero_std": 0.0625, "grad_norm": 2.03125, "kl": 0.003561254171654582, "learning_rate": 7.889082555520255e-07, "loss": 7.351900567300617e-05, "reward": 1.5685769319534302, "reward_std": 0.1973000094294548, "rewards/FidelityReward/mean": 0.71180060505867, "rewards/FidelityReward/std": 0.2296636775135994, "rewards/JudgeFidelityReward/mean": 0.7155059576034546, "rewards/JudgeFidelityReward/std": 0.22336746752262115, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 840 }, { "clip_ratio/high_max": 0.001112413266673684, "clip_ratio/high_mean": 0.00017111668712459504, "clip_ratio/low_mean": 0.00010461000783834606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027572669787332413, "completions/clipped_ratio": 0.0, "completions/max_length": 200.33333333333334, "completions/mean_length": 77.16927083333333, "completions/min_length": 33.0, "epoch": 1.7036290322580645, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.109375, "kl": 0.0038598717655986547, "learning_rate": 7.861810742437376e-07, "loss": 0.00021162088960409166, "reward": 1.6698144674301147, "reward_std": 0.1877685785293579, "rewards/FidelityReward/mean": 0.7765031456947327, "rewards/FidelityReward/std": 0.1930383344491323, "rewards/JudgeFidelityReward/mean": 0.7905289133389791, "rewards/JudgeFidelityReward/std": 0.1773488720258077, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 845 }, { "clip_ratio/high_max": 0.0020715924445539715, "clip_ratio/high_mean": 0.00031115778256207705, "clip_ratio/low_mean": 9.744369890540838e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040860146982595323, "completions/clipped_ratio": 0.0, "completions/max_length": 203.5, "completions/mean_length": 74.3310546875, "completions/min_length": 31.5, "epoch": 1.713709677419355, "frac_reward_zero_std": 0.046875, "grad_norm": 2.0, "kl": 0.0036462567280977965, "learning_rate": 7.834411717236722e-07, "loss": 2.480759867466986e-05, "reward": 1.5864542722702026, "reward_std": 0.2021886482834816, "rewards/FidelityReward/mean": 0.725833535194397, "rewards/FidelityReward/std": 0.21268333494663239, "rewards/JudgeFidelityReward/mean": 0.7251477241516113, "rewards/JudgeFidelityReward/std": 0.19906801730394363, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 850 }, { "clip_ratio/high_max": 0.0014393284451216458, "clip_ratio/high_mean": 0.00020515479263849556, "clip_ratio/low_mean": 7.291192159755155e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027806671569123863, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 74.12174479166667, "completions/min_length": 36.666666666666664, "epoch": 1.723790322580645, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.15625, "kl": 0.0034957310650497674, "learning_rate": 7.806886697849293e-07, "loss": 0.00012921905145049095, "reward": 1.6407625675201416, "reward_std": 0.18838177621364594, "rewards/FidelityReward/mean": 0.7600661516189575, "rewards/FidelityReward/std": 0.19936237235864004, "rewards/JudgeFidelityReward/mean": 0.7626949946085612, "rewards/JudgeFidelityReward/std": 0.18315168221791586, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 855 }, { "clip_ratio/high_max": 0.0027386965230107307, "clip_ratio/high_mean": 0.00032091812463477256, "clip_ratio/low_mean": 8.799447386991233e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004089125897735357, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 76.0693359375, "completions/min_length": 35.5, "epoch": 1.7338709677419355, "frac_reward_zero_std": 0.0625, "grad_norm": 2.21875, "kl": 0.003492424450814724, "learning_rate": 7.779236907806737e-07, "loss": 2.8276414377614854e-05, "reward": 1.650943398475647, "reward_std": 0.1875375360250473, "rewards/FidelityReward/mean": 0.7617213129997253, "rewards/FidelityReward/std": 0.20328973233699799, "rewards/JudgeFidelityReward/mean": 0.7803972363471985, "rewards/JudgeFidelityReward/std": 0.18413779884576797, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 860 }, { "clip_ratio/high_max": 0.001011986006051302, "clip_ratio/high_mean": 0.0001539476856123656, "clip_ratio/low_mean": 0.00011326732928864657, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026721500325948, "completions/clipped_ratio": 0.0, "completions/max_length": 205.33333333333334, "completions/mean_length": 75.3828125, "completions/min_length": 30.0, "epoch": 1.7439516129032258, "frac_reward_zero_std": 0.0625, "grad_norm": 2.09375, "kl": 0.0035797862336039545, "learning_rate": 7.751463576186957e-07, "loss": 0.0002364202169701457, "reward": 1.630694309870402, "reward_std": 0.18328813711802164, "rewards/FidelityReward/mean": 0.7575836777687073, "rewards/FidelityReward/std": 0.20974585910638174, "rewards/JudgeFidelityReward/mean": 0.7488253911336263, "rewards/JudgeFidelityReward/std": 0.20283940434455872, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 865 }, { "clip_ratio/high_max": 0.001741645811125636, "clip_ratio/high_mean": 0.0002757016627583653, "clip_ratio/low_mean": 0.0001070052181603387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038270687218755484, "completions/clipped_ratio": 0.0, "completions/max_length": 199.5, "completions/mean_length": 75.84765625, "completions/min_length": 32.5, "epoch": 1.754032258064516, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.03125, "kl": 0.003771153837442398, "learning_rate": 7.723567937559479e-07, "loss": 0.00019339288119226694, "reward": 1.6182091236114502, "reward_std": 0.19951683282852173, "rewards/FidelityReward/mean": 0.7403523027896881, "rewards/FidelityReward/std": 0.2097669094800949, "rewards/JudgeFidelityReward/mean": 0.7586433589458466, "rewards/JudgeFidelityReward/std": 0.20154006034135818, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 870 }, { "clip_ratio/high_max": 0.0008812427520751953, "clip_ratio/high_mean": 0.00018939747824333608, "clip_ratio/low_mean": 9.675105975475163e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002861485467292368, "completions/clipped_ratio": 0.0, "completions/max_length": 200.66666666666666, "completions/mean_length": 76.736328125, "completions/min_length": 32.0, "epoch": 1.7641129032258065, "frac_reward_zero_std": 0.0625, "grad_norm": 1.84375, "kl": 0.0036526585929095743, "learning_rate": 7.695551231930573e-07, "loss": 9.78879164904356e-05, "reward": 1.5717151165008545, "reward_std": 0.21176275114218393, "rewards/FidelityReward/mean": 0.7149770657221476, "rewards/FidelityReward/std": 0.2217165877421697, "rewards/JudgeFidelityReward/mean": 0.7154292861620585, "rewards/JudgeFidelityReward/std": 0.23111552000045776, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 875 }, { "clip_ratio/high_max": 0.0020044281147420405, "clip_ratio/high_mean": 0.0002623030100949109, "clip_ratio/low_mean": 0.00016276702226605266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042507003527134655, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 75.0322265625, "completions/min_length": 33.5, "epoch": 1.7741935483870968, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9375, "kl": 0.0036413232795894147, "learning_rate": 7.667414704688136e-07, "loss": 5.3362589096650484e-05, "reward": 1.6269779205322266, "reward_std": 0.19447188079357147, "rewards/FidelityReward/mean": 0.7472217381000519, "rewards/FidelityReward/std": 0.20666255056858063, "rewards/JudgeFidelityReward/mean": 0.7614654302597046, "rewards/JudgeFidelityReward/std": 0.18650534749031067, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 880 }, { "clip_ratio/high_max": 0.0010409831069409848, "clip_ratio/high_mean": 0.00012126340297982096, "clip_ratio/low_mean": 3.5819376353174445e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015708278515376152, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/mean_length": 76.19401041666667, "completions/min_length": 34.0, "epoch": 1.784274193548387, "frac_reward_zero_std": 0.03125, "grad_norm": 2.03125, "kl": 0.003571014851331711, "learning_rate": 7.639159606546324e-07, "loss": 0.00023481266107410191, "reward": 1.5985350211461384, "reward_std": 0.2049056887626648, "rewards/FidelityReward/mean": 0.7284473180770874, "rewards/FidelityReward/std": 0.21219887336095175, "rewards/JudgeFidelityReward/mean": 0.7440817356109619, "rewards/JudgeFidelityReward/std": 0.19449400901794434, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 885 }, { "clip_ratio/high_max": 0.0015816438477486371, "clip_ratio/high_mean": 0.0002558587701059878, "clip_ratio/low_mean": 7.794602279318497e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003338047943543643, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 75.8564453125, "completions/min_length": 34.5, "epoch": 1.7943548387096775, "frac_reward_zero_std": 0.046875, "grad_norm": 2.015625, "kl": 0.003809464490041137, "learning_rate": 7.610787193489968e-07, "loss": 0.00011201146990060806, "reward": 1.5990545153617859, "reward_std": 0.1890798807144165, "rewards/FidelityReward/mean": 0.7323552072048187, "rewards/FidelityReward/std": 0.22185833752155304, "rewards/JudgeFidelityReward/mean": 0.7353516817092896, "rewards/JudgeFidelityReward/std": 0.19328855723142624, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 890 }, { "clip_ratio/high_max": 0.0011706393677741289, "clip_ratio/high_mean": 0.00013591825845651328, "clip_ratio/low_mean": 7.387007644865662e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020978833199478686, "completions/clipped_ratio": 0.0, "completions/max_length": 237.66666666666666, "completions/mean_length": 75.23567708333333, "completions/min_length": 32.0, "epoch": 1.8044354838709677, "frac_reward_zero_std": 0.09375, "grad_norm": 2.015625, "kl": 0.003705738252028823, "learning_rate": 7.582298726718733e-07, "loss": 0.0001380031812004745, "reward": 1.6186468601226807, "reward_std": 0.18527286251386008, "rewards/FidelityReward/mean": 0.7463621298472086, "rewards/FidelityReward/std": 0.20996725062529245, "rewards/JudgeFidelityReward/mean": 0.7458715240160624, "rewards/JudgeFidelityReward/std": 0.20352350175380707, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 895 }, { "clip_ratio/high_max": 0.002001827582716942, "clip_ratio/high_mean": 0.0003375916625373065, "clip_ratio/low_mean": 7.636321897734888e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041395488660782575, "completions/clipped_ratio": 0.0, "completions/max_length": 196.5, "completions/mean_length": 76.7109375, "completions/min_length": 36.0, "epoch": 1.814516129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 2.0, "kl": 0.0036493569146841764, "learning_rate": 7.553695472591059e-07, "loss": 0.0002187909558415413, "reward": 1.6221795678138733, "reward_std": 0.19343861937522888, "rewards/FidelityReward/mean": 0.7494019865989685, "rewards/FidelityReward/std": 0.19697456061840057, "rewards/JudgeFidelityReward/mean": 0.7465316951274872, "rewards/JudgeFidelityReward/std": 0.19769834727048874, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 900 }, { "clip_ratio/high_max": 0.0012885154224932193, "clip_ratio/high_mean": 0.00015563553897663952, "clip_ratio/low_mean": 3.674216568470001e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001923777046613395, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 75.34049479166667, "completions/min_length": 35.333333333333336, "epoch": 1.8245967741935485, "frac_reward_zero_std": 0.078125, "grad_norm": 2.328125, "kl": 0.00359477368183434, "learning_rate": 7.524978702567871e-07, "loss": 0.0001399357453919947, "reward": 1.5830150445302327, "reward_std": 0.1861813614765803, "rewards/FidelityReward/mean": 0.721111536026001, "rewards/FidelityReward/std": 0.22464477022488913, "rewards/JudgeFidelityReward/mean": 0.7264112035433451, "rewards/JudgeFidelityReward/std": 0.22136651476224264, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 905 }, { "clip_ratio/high_max": 0.0012991803465411066, "clip_ratio/high_mean": 0.00016510828863829373, "clip_ratio/low_mean": 0.00012795173679478467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002930600312538445, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/mean_length": 74.2783203125, "completions/min_length": 34.5, "epoch": 1.8346774193548387, "frac_reward_zero_std": 0.078125, "grad_norm": 2.140625, "kl": 0.0037443089298903943, "learning_rate": 7.496149693156059e-07, "loss": 8.175828261300922e-05, "reward": 1.6199855208396912, "reward_std": 0.1751735806465149, "rewards/FidelityReward/mean": 0.7445310354232788, "rewards/FidelityReward/std": 0.2111947163939476, "rewards/JudgeFidelityReward/mean": 0.7518855929374695, "rewards/JudgeFidelityReward/std": 0.1907210424542427, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 910 }, { "clip_ratio/high_max": 0.0008585417177528143, "clip_ratio/high_mean": 0.00017231845995411277, "clip_ratio/low_mean": 2.09443416679278e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001932628103531897, "completions/clipped_ratio": 0.0, "completions/max_length": 200.33333333333334, "completions/mean_length": 77.34309895833333, "completions/min_length": 36.333333333333336, "epoch": 1.844758064516129, "frac_reward_zero_std": 0.046875, "grad_norm": 2.09375, "kl": 0.003616413986310363, "learning_rate": 7.467209725851736e-07, "loss": 0.0001218917896039784, "reward": 1.58245054880778, "reward_std": 0.21079314748446146, "rewards/FidelityReward/mean": 0.7255853414535522, "rewards/FidelityReward/std": 0.2090964913368225, "rewards/JudgeFidelityReward/mean": 0.7189387083053589, "rewards/JudgeFidelityReward/std": 0.19260107477506003, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07100204626719157, "step": 915 }, { "clip_ratio/high_max": 0.0013011108152568341, "clip_ratio/high_mean": 0.0002052893571089953, "clip_ratio/low_mean": 0.00011587617045734077, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032116552465595306, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 74.6025390625, "completions/min_length": 33.0, "epoch": 1.8548387096774195, "frac_reward_zero_std": 0.09375, "grad_norm": 2.046875, "kl": 0.003956697648391128, "learning_rate": 7.438160087083271e-07, "loss": 0.00010645417496562004, "reward": 1.6006959080696106, "reward_std": 0.19997704774141312, "rewards/FidelityReward/mean": 0.7319273054599762, "rewards/FidelityReward/std": 0.2087911069393158, "rewards/JudgeFidelityReward/mean": 0.7453497052192688, "rewards/JudgeFidelityReward/std": 0.1784340664744377, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.08508053794503212, "step": 920 }, { "clip_ratio/high_max": 0.0006651537958532571, "clip_ratio/high_mean": 8.210010128095746e-05, "clip_ratio/low_mean": 0.00010395499703008682, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018605509540066124, "completions/clipped_ratio": 0.0, "completions/max_length": 188.66666666666666, "completions/mean_length": 76.44921875, "completions/min_length": 33.333333333333336, "epoch": 1.8649193548387095, "frac_reward_zero_std": 0.046875, "grad_norm": 2.0625, "kl": 0.0040150498505681755, "learning_rate": 7.409002068154109e-07, "loss": 0.00019015667494386433, "reward": 1.5742456118265789, "reward_std": 0.20799238979816437, "rewards/FidelityReward/mean": 0.715529183546702, "rewards/FidelityReward/std": 0.20702166358629862, "rewards/JudgeFidelityReward/mean": 0.7213392655054728, "rewards/JudgeFidelityReward/std": 0.18419663111368814, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 925 }, { "clip_ratio/high_max": 0.001480700122192502, "clip_ratio/high_mean": 0.0002185350196668878, "clip_ratio/low_mean": 7.972678795340472e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029826180543750523, "completions/clipped_ratio": 0.0, "completions/max_length": 214.5, "completions/mean_length": 76.556640625, "completions/min_length": 34.0, "epoch": 1.875, "frac_reward_zero_std": 0.046875, "grad_norm": 2.125, "kl": 0.0036576491314917804, "learning_rate": 7.379736965185368e-07, "loss": 9.289595764130353e-05, "reward": 1.6214990615844727, "reward_std": 0.19730941206216812, "rewards/FidelityReward/mean": 0.7491028308868408, "rewards/FidelityReward/std": 0.21381190419197083, "rewards/JudgeFidelityReward/mean": 0.7496752738952637, "rewards/JudgeFidelityReward/std": 0.2033543884754181, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 930 }, { "clip_ratio/high_max": 0.0007330200402066112, "clip_ratio/high_mean": 0.00012386833550408481, "clip_ratio/low_mean": 0.00012539523304440082, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002492635743692517, "completions/clipped_ratio": 0.0, "completions/max_length": 183.33333333333334, "completions/mean_length": 74.71549479166667, "completions/min_length": 34.333333333333336, "epoch": 1.8850806451612905, "frac_reward_zero_std": 0.0625, "grad_norm": 2.171875, "kl": 0.0038675041869282722, "learning_rate": 7.350366079058222e-07, "loss": 0.00015151540283113717, "reward": 1.6399108171463013, "reward_std": 0.18164141972859701, "rewards/FidelityReward/mean": 0.7599546114603678, "rewards/FidelityReward/std": 0.1979209283987681, "rewards/JudgeFidelityReward/mean": 0.7644696633021036, "rewards/JudgeFidelityReward/std": 0.18715880314509073, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 935 }, { "clip_ratio/high_max": 0.00223612729460001, "clip_ratio/high_mean": 0.0002670081332325935, "clip_ratio/low_mean": 0.00010441117919981479, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037141930079087615, "completions/clipped_ratio": 0.0, "completions/max_length": 247.5, "completions/mean_length": 75.6962890625, "completions/min_length": 35.0, "epoch": 1.8951612903225805, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.046875, "kl": 0.00413377471268177, "learning_rate": 7.320890715356084e-07, "loss": -8.538772817701102e-08, "reward": 1.6062541604042053, "reward_std": 0.20264346152544022, "rewards/FidelityReward/mean": 0.7382562458515167, "rewards/FidelityReward/std": 0.19999698549509048, "rewards/JudgeFidelityReward/mean": 0.7418552935123444, "rewards/JudgeFidelityReward/std": 0.19154000282287598, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 940 }, { "clip_ratio/high_max": 0.0007117730099707841, "clip_ratio/high_mean": 8.841004746500403e-05, "clip_ratio/low_mean": 7.325355836655945e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016166360583156348, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 74.703125, "completions/min_length": 35.666666666666664, "epoch": 1.905241935483871, "frac_reward_zero_std": 0.0625, "grad_norm": 2.203125, "kl": 0.003952168161049485, "learning_rate": 7.291312184306557e-07, "loss": 0.0001865188591182232, "reward": 1.620928446451823, "reward_std": 0.19016672670841217, "rewards/FidelityReward/mean": 0.7460780739784241, "rewards/FidelityReward/std": 0.19748914241790771, "rewards/JudgeFidelityReward/mean": 0.7529560724894205, "rewards/JudgeFidelityReward/std": 0.18677107989788055, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 945 }, { "clip_ratio/high_max": 0.0021584838163107634, "clip_ratio/high_mean": 0.0002172956330468878, "clip_ratio/low_mean": 0.00015237925690598787, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036967489868402483, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 304.5, "completions/mean_length": 76.7998046875, "completions/min_length": 32.5, "epoch": 1.9153225806451613, "frac_reward_zero_std": 0.0625, "grad_norm": 2.53125, "kl": 0.0038639347068965433, "learning_rate": 7.261631800723205e-07, "loss": 0.00022184024564921855, "reward": 1.6251254081726074, "reward_std": 0.19366355240345, "rewards/FidelityReward/mean": 0.7498623728752136, "rewards/FidelityReward/std": 0.20123756676912308, "rewards/JudgeFidelityReward/mean": 0.7563854157924652, "rewards/JudgeFidelityReward/std": 0.18825337290763855, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 950 }, { "clip_ratio/high_max": 0.0010928086005151272, "clip_ratio/high_mean": 0.00018234309391118586, "clip_ratio/low_mean": 2.187989593949169e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020422298694029452, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 78.41471354166667, "completions/min_length": 31.666666666666668, "epoch": 1.9254032258064515, "frac_reward_zero_std": 0.0625, "grad_norm": 2.125, "kl": 0.004135921131819487, "learning_rate": 7.231850883947098e-07, "loss": 0.0001098415465094149, "reward": 1.6029682159423828, "reward_std": 0.20302854478359222, "rewards/FidelityReward/mean": 0.7329523166020712, "rewards/FidelityReward/std": 0.2053839365641276, "rewards/JudgeFidelityReward/mean": 0.745891273021698, "rewards/JudgeFidelityReward/std": 0.17999631663163504, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.062187383572260536, "step": 955 }, { "clip_ratio/high_max": 0.001355701475404203, "clip_ratio/high_mean": 0.0002571830176748335, "clip_ratio/low_mean": 5.8436456311028453e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003156194812618196, "completions/clipped_ratio": 0.0, "completions/max_length": 202.5, "completions/mean_length": 79.0185546875, "completions/min_length": 37.5, "epoch": 1.935483870967742, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.15625, "kl": 0.0038166353013366463, "learning_rate": 7.201970757788171e-07, "loss": 0.00012988022062927486, "reward": 1.6574163436889648, "reward_std": 0.19730865210294724, "rewards/FidelityReward/mean": 0.7734749019145966, "rewards/FidelityReward/std": 0.2049088105559349, "rewards/JudgeFidelityReward/mean": 0.7717892527580261, "rewards/JudgeFidelityReward/std": 0.20742275565862656, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 960 }, { "clip_ratio/high_max": 0.0006985589861869812, "clip_ratio/high_mean": 0.00012817178794648497, "clip_ratio/low_mean": 8.473794150631875e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021290971781127155, "completions/clipped_ratio": 0.0, "completions/max_length": 179.33333333333334, "completions/mean_length": 75.08723958333333, "completions/min_length": 33.666666666666664, "epoch": 1.9455645161290323, "frac_reward_zero_std": 0.0625, "grad_norm": 1.84375, "kl": 0.0037498744670301674, "learning_rate": 7.171992750466377e-07, "loss": 0.00017829262651503086, "reward": 1.6160353422164917, "reward_std": 0.19368289411067963, "rewards/FidelityReward/mean": 0.7465813755989075, "rewards/FidelityReward/std": 0.20858564972877502, "rewards/JudgeFidelityReward/mean": 0.7421631415685018, "rewards/JudgeFidelityReward/std": 0.18966612219810486, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 965 }, { "clip_ratio/high_max": 0.0020458690356463193, "clip_ratio/high_mean": 0.00040159233612939714, "clip_ratio/low_mean": 0.0001330832776147872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005346756079234183, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 77.537109375, "completions/min_length": 33.5, "epoch": 1.9556451612903225, "frac_reward_zero_std": 0.046875, "grad_norm": 2.09375, "kl": 0.0036804776173084974, "learning_rate": 7.141918194552645e-07, "loss": 0.00021140468306839465, "reward": 1.594226360321045, "reward_std": 0.20142541080713272, "rewards/FidelityReward/mean": 0.7270587682723999, "rewards/FidelityReward/std": 0.21639974415302277, "rewards/JudgeFidelityReward/mean": 0.73531174659729, "rewards/JudgeFidelityReward/std": 0.20930033922195435, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 970 }, { "clip_ratio/high_max": 0.0013643715530633927, "clip_ratio/high_mean": 0.00020880911615677177, "clip_ratio/low_mean": 0.0001488278212491423, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035763692576438186, "completions/clipped_ratio": 0.0, "completions/max_length": 226.66666666666666, "completions/mean_length": 76.11263020833333, "completions/min_length": 35.0, "epoch": 1.965725806451613, "frac_reward_zero_std": 0.03125, "grad_norm": 2.21875, "kl": 0.00392344850115478, "learning_rate": 7.111748426909646e-07, "loss": 0.000130062410607934, "reward": 1.611115574836731, "reward_std": 0.18619074920813242, "rewards/FidelityReward/mean": 0.7370432615280151, "rewards/FidelityReward/std": 0.22068378329277039, "rewards/JudgeFidelityReward/mean": 0.7513998945554098, "rewards/JudgeFidelityReward/std": 0.20321612556775412, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 975 }, { "clip_ratio/high_max": 0.0013395846821367741, "clip_ratio/high_mean": 0.00023273304104804994, "clip_ratio/low_mean": 0.00010237265960313379, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033510568900965156, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 75.6240234375, "completions/min_length": 34.5, "epoch": 1.9758064516129032, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.875, "kl": 0.003795926831662655, "learning_rate": 7.081484788632359e-07, "loss": 0.00010532636661082506, "reward": 1.5510587096214294, "reward_std": 0.2121327966451645, "rewards/FidelityReward/mean": 0.6964459419250488, "rewards/FidelityReward/std": 0.21278897672891617, "rewards/JudgeFidelityReward/mean": 0.7102020382881165, "rewards/JudgeFidelityReward/std": 0.1999376341700554, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 980 }, { "clip_ratio/high_max": 0.0011879368219524622, "clip_ratio/high_mean": 0.00020402715308591722, "clip_ratio/low_mean": 0.00010323644673917442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003072636085562408, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 75.4765625, "completions/min_length": 35.333333333333336, "epoch": 1.9858870967741935, "frac_reward_zero_std": 0.09895833333333333, "grad_norm": 2.140625, "kl": 0.0038309779949486254, "learning_rate": 7.051128624988477e-07, "loss": 0.00011235530255362391, "reward": 1.6409404277801514, "reward_std": 0.17275509734948477, "rewards/FidelityReward/mean": 0.7601159811019897, "rewards/FidelityReward/std": 0.19376526276270548, "rewards/JudgeFidelityReward/mean": 0.7642530004183451, "rewards/JudgeFidelityReward/std": 0.17952843010425568, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 985 }, { "clip_ratio/high_max": 0.0019416664028540254, "clip_ratio/high_mean": 0.00018132849945686757, "clip_ratio/low_mean": 0.00010884659423027188, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002901750907767564, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/mean_length": 76.953125, "completions/min_length": 33.0, "epoch": 1.995967741935484, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.03125, "kl": 0.004168014042079449, "learning_rate": 7.020681285358585e-07, "loss": -3.945384523831308e-05, "reward": 1.6238741278648376, "reward_std": 0.1794523224234581, "rewards/FidelityReward/mean": 0.7450030148029327, "rewards/FidelityReward/std": 0.22299956530332565, "rewards/JudgeFidelityReward/mean": 0.7616485953330994, "rewards/JudgeFidelityReward/std": 0.19916978478431702, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 990 }, { "clip_ratio/high_max": 0.0007968052057549357, "clip_ratio/high_mean": 0.00013931910507380962, "clip_ratio/low_mean": 9.169901895802468e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023101812112145126, "completions/clipped_ratio": 0.0, "completions/max_length": 269.3333333333333, "completions/mean_length": 78.2421875, "completions/min_length": 33.333333333333336, "epoch": 2.006048387096774, "frac_reward_zero_std": 0.03125, "grad_norm": 2.03125, "kl": 0.003948001842945814, "learning_rate": 6.990144123176192e-07, "loss": 0.0001928334590047598, "reward": 1.5936506986618042, "reward_std": 0.20511525869369507, "rewards/FidelityReward/mean": 0.727348804473877, "rewards/FidelityReward/std": 0.20825264354546866, "rewards/JudgeFidelityReward/mean": 0.7352078557014465, "rewards/JudgeFidelityReward/std": 0.2017998993396759, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 995 }, { "clip_ratio/high_max": 0.0016864804085344077, "clip_ratio/high_mean": 0.0002780825481750071, "clip_ratio/low_mean": 7.358069706242532e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035166324814781544, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 77.8896484375, "completions/min_length": 33.0, "epoch": 2.0161290322580645, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.078125, "kl": 0.00406807977706194, "learning_rate": 6.959518495867569e-07, "loss": 0.0001849690219387412, "reward": 1.6307625770568848, "reward_std": 0.19415897130966187, "rewards/FidelityReward/mean": 0.7506296634674072, "rewards/FidelityReward/std": 0.20671764761209488, "rewards/JudgeFidelityReward/mean": 0.7622186839580536, "rewards/JudgeFidelityReward/std": 0.19504141062498093, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1000 }, { "clip_ratio/high_max": 0.0007222348358482122, "clip_ratio/high_mean": 0.00011319566983729601, "clip_ratio/low_mean": 6.63553539197892e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001795510179363191, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 76.041015625, "completions/min_length": 35.333333333333336, "epoch": 2.026209677419355, "frac_reward_zero_std": 0.03125, "grad_norm": 1.96875, "kl": 0.004003542754799127, "learning_rate": 6.928805764791396e-07, "loss": 0.00022312819492071868, "reward": 1.6313724120457966, "reward_std": 0.1949449231227239, "rewards/FidelityReward/mean": 0.7545228799184164, "rewards/FidelityReward/std": 0.19641578197479248, "rewards/JudgeFidelityReward/mean": 0.757605234781901, "rewards/JudgeFidelityReward/std": 0.18914439777533212, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.058838951090971627, "step": 1005 }, { "clip_ratio/high_max": 0.0017308399779722094, "clip_ratio/high_mean": 0.00023471073945984243, "clip_ratio/low_mean": 0.0001580689277034253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039277964970096946, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/mean_length": 77.181640625, "completions/min_length": 31.5, "epoch": 2.036290322580645, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.9609375, "kl": 0.004124086257070303, "learning_rate": 6.898007295178261e-07, "loss": 9.875362738966941e-05, "reward": 1.618061125278473, "reward_std": 0.17512333393096924, "rewards/FidelityReward/mean": 0.7424340546131134, "rewards/FidelityReward/std": 0.21253883093595505, "rewards/JudgeFidelityReward/mean": 0.7512542605400085, "rewards/JudgeFidelityReward/std": 0.19891446083784103, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0014857631642371417, "clip_ratio/high_mean": 0.00011592853697948158, "clip_ratio/low_mean": 7.779519073665142e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019372373353689908, "completions/clipped_ratio": 0.0, "completions/max_length": 213.66666666666666, "completions/mean_length": 77.39322916666667, "completions/min_length": 32.666666666666664, "epoch": 2.0463709677419355, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.9296875, "kl": 0.003842059429734945, "learning_rate": 6.867124456069972e-07, "loss": 8.698557503521443e-05, "reward": 1.6033282279968262, "reward_std": 0.20521100362141928, "rewards/FidelityReward/mean": 0.733800490697225, "rewards/FidelityReward/std": 0.2039472907781601, "rewards/JudgeFidelityReward/mean": 0.7423108220100403, "rewards/JudgeFidelityReward/std": 0.18646504978338876, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1015 }, { "clip_ratio/high_max": 0.0016761930892243981, "clip_ratio/high_mean": 0.0002255452098324895, "clip_ratio/low_mean": 0.00012422629006323405, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003497714875265956, "completions/clipped_ratio": 0.0, "completions/max_length": 190.5, "completions/mean_length": 75.5830078125, "completions/min_length": 33.5, "epoch": 2.056451612903226, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.21875, "kl": 0.0040970703586935995, "learning_rate": 6.836158620258693e-07, "loss": 0.00012993557611480356, "reward": 1.6040900945663452, "reward_std": 0.1966804563999176, "rewards/FidelityReward/mean": 0.7348112761974335, "rewards/FidelityReward/std": 0.21950526535511017, "rewards/JudgeFidelityReward/mean": 0.7434403896331787, "rewards/JudgeFidelityReward/std": 0.19958744198083878, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1020 }, { "clip_ratio/high_max": 0.0009451452642679214, "clip_ratio/high_mean": 0.0001332972140517086, "clip_ratio/low_mean": 4.8215521383099257e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001815127383451909, "completions/clipped_ratio": 0.0, "completions/max_length": 200.33333333333334, "completions/mean_length": 75.26692708333333, "completions/min_length": 34.333333333333336, "epoch": 2.066532258064516, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.140625, "kl": 0.0037748497910797597, "learning_rate": 6.805111164225925e-07, "loss": 0.00021046495530754327, "reward": 1.630033055941264, "reward_std": 0.18606935441493988, "rewards/FidelityReward/mean": 0.7535627086957296, "rewards/FidelityReward/std": 0.2037597894668579, "rewards/JudgeFidelityReward/mean": 0.7548937797546387, "rewards/JudgeFidelityReward/std": 0.19457654158274332, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1025 }, { "clip_ratio/high_max": 0.0016589599195867776, "clip_ratio/high_mean": 0.00015033113304525615, "clip_ratio/low_mean": 9.83952806564048e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002487264107912779, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 75.4482421875, "completions/min_length": 32.5, "epoch": 2.0766129032258065, "frac_reward_zero_std": 0.078125, "grad_norm": 1.8984375, "kl": 0.00393896121531725, "learning_rate": 6.773983468081323e-07, "loss": 0.0002459113486111164, "reward": 1.6076573133468628, "reward_std": 0.1943894848227501, "rewards/FidelityReward/mean": 0.7372317314147949, "rewards/FidelityReward/std": 0.2041970118880272, "rewards/JudgeFidelityReward/mean": 0.7457339465618134, "rewards/JudgeFidelityReward/std": 0.204756960272789, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 1030 }, { "clip_ratio/high_max": 0.001291607180610299, "clip_ratio/high_mean": 0.0002100207842886448, "clip_ratio/low_mean": 7.612098997924476e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028614179464057086, "completions/clipped_ratio": 0.0, "completions/max_length": 244.66666666666666, "completions/mean_length": 75.69140625, "completions/min_length": 34.0, "epoch": 2.086693548387097, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.15625, "kl": 0.003916370682418346, "learning_rate": 6.742776915501347e-07, "loss": 0.00023925716523081063, "reward": 1.6371792157491047, "reward_std": 0.19427978495756784, "rewards/FidelityReward/mean": 0.7571753859519958, "rewards/FidelityReward/std": 0.19651359816392264, "rewards/JudgeFidelityReward/mean": 0.7639139493306478, "rewards/JudgeFidelityReward/std": 0.1812113175789515, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 1035 }, { "clip_ratio/high_max": 0.0016478369012475013, "clip_ratio/high_mean": 0.00017142521392088383, "clip_ratio/low_mean": 0.00014415129844564946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003155765007250011, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 75.6953125, "completions/min_length": 30.5, "epoch": 2.096774193548387, "frac_reward_zero_std": 0.0625, "grad_norm": 2.203125, "kl": 0.004159029480069876, "learning_rate": 6.711492893667746e-07, "loss": 0.00020880107767879964, "reward": 1.600085198879242, "reward_std": 0.19479022175073624, "rewards/FidelityReward/mean": 0.7276129126548767, "rewards/FidelityReward/std": 0.21159563213586807, "rewards/JudgeFidelityReward/mean": 0.7459212243556976, "rewards/JudgeFidelityReward/std": 0.19244152307510376, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 1040 }, { "clip_ratio/high_max": 0.0013134006410837173, "clip_ratio/high_mean": 0.0002389780478551984, "clip_ratio/low_mean": 0.0001013544446323067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034033250994980335, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 75.55859375, "completions/min_length": 32.333333333333336, "epoch": 2.1068548387096775, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.21875, "kl": 0.004104385990649462, "learning_rate": 6.680132793205908e-07, "loss": 0.0002530805766582489, "reward": 1.6503712733586628, "reward_std": 0.1845481644074122, "rewards/FidelityReward/mean": 0.7648529807726542, "rewards/FidelityReward/std": 0.20742679138978323, "rewards/JudgeFidelityReward/mean": 0.7736407915751139, "rewards/JudgeFidelityReward/std": 0.19239315887292227, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 1045 }, { "clip_ratio/high_max": 0.0016869876068085431, "clip_ratio/high_mean": 0.00025174104957841336, "clip_ratio/low_mean": 7.976526685524732e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033150633098557594, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 288.0, "completions/mean_length": 75.4931640625, "completions/min_length": 33.5, "epoch": 2.1169354838709675, "frac_reward_zero_std": 0.046875, "grad_norm": 1.96875, "kl": 0.004153239820152521, "learning_rate": 6.648698008123037e-07, "loss": 4.263903247192502e-05, "reward": 1.6003189086914062, "reward_std": 0.19402678310871124, "rewards/FidelityReward/mean": 0.7331346571445465, "rewards/FidelityReward/std": 0.21510830521583557, "rewards/JudgeFidelityReward/mean": 0.7372980713844299, "rewards/JudgeFidelityReward/std": 0.20496460050344467, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1050 }, { "clip_ratio/high_max": 0.0014704223722219467, "clip_ratio/high_mean": 0.00017983050202019513, "clip_ratio/low_mean": 9.314013877883553e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027297064661979675, "completions/clipped_ratio": 0.0, "completions/max_length": 211.33333333333334, "completions/mean_length": 76.56705729166667, "completions/min_length": 35.0, "epoch": 2.127016129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 2.078125, "kl": 0.003831409988924861, "learning_rate": 6.61718993574619e-07, "loss": 0.0002413850976154208, "reward": 1.613018314043681, "reward_std": 0.19127621253331503, "rewards/FidelityReward/mean": 0.7449109554290771, "rewards/FidelityReward/std": 0.2037410040696462, "rewards/JudgeFidelityReward/mean": 0.738167921702067, "rewards/JudgeFidelityReward/std": 0.19581529001394907, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.025465538104375202, "step": 1055 }, { "clip_ratio/high_max": 0.0014938403852283954, "clip_ratio/high_mean": 0.00016061449132394047, "clip_ratio/low_mean": 0.0001417855528416112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003024000441655517, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/mean_length": 75.423828125, "completions/min_length": 32.5, "epoch": 2.1370967741935485, "frac_reward_zero_std": 0.078125, "grad_norm": 1.921875, "kl": 0.00412658960558474, "learning_rate": 6.585609976660165e-07, "loss": 0.0001357653411105275, "reward": 1.648246943950653, "reward_std": 0.17858576029539108, "rewards/FidelityReward/mean": 0.7648473680019379, "rewards/FidelityReward/std": 0.191593199968338, "rewards/JudgeFidelityReward/mean": 0.7707053124904633, "rewards/JudgeFidelityReward/std": 0.181942880153656, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1060 }, { "clip_ratio/high_max": 0.0013178445864468814, "clip_ratio/high_mean": 0.00013664349098689853, "clip_ratio/low_mean": 8.328219701070338e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021992568508721888, "completions/clipped_ratio": 0.0, "completions/max_length": 239.66666666666666, "completions/mean_length": 76.9765625, "completions/min_length": 33.333333333333336, "epoch": 2.1471774193548385, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.9140625, "kl": 0.0037604068871587514, "learning_rate": 6.553959534645235e-07, "loss": 1.4304979413282126e-05, "reward": 1.6120737393697102, "reward_std": 0.1972023844718933, "rewards/FidelityReward/mean": 0.7436474561691284, "rewards/FidelityReward/std": 0.20777396857738495, "rewards/JudgeFidelityReward/mean": 0.7394567330678304, "rewards/JudgeFidelityReward/std": 0.1948329508304596, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.029376169045766194, "step": 1065 }, { "clip_ratio/high_max": 0.001390472147613764, "clip_ratio/high_mean": 0.00016014372231438757, "clip_ratio/low_mean": 9.914450638461857e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002592882374301553, "completions/clipped_ratio": 0.0, "completions/max_length": 191.5, "completions/mean_length": 73.35546875, "completions/min_length": 37.5, "epoch": 2.157258064516129, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.0625, "kl": 0.0040277318563312296, "learning_rate": 6.522240016614757e-07, "loss": 0.00023723251651972533, "reward": 1.6723849773406982, "reward_std": 0.17456740885972977, "rewards/FidelityReward/mean": 0.780602902173996, "rewards/FidelityReward/std": 0.19188039749860764, "rewards/JudgeFidelityReward/mean": 0.7855173051357269, "rewards/JudgeFidelityReward/std": 0.18717030435800552, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1070 }, { "clip_ratio/high_max": 0.0012925559654831885, "clip_ratio/high_mean": 0.00013108087296131998, "clip_ratio/low_mean": 0.00014070958859520034, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002717904513701797, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 75.490234375, "completions/min_length": 32.0, "epoch": 2.1673387096774195, "frac_reward_zero_std": 0.046875, "grad_norm": 2.15625, "kl": 0.0039868096355348825, "learning_rate": 6.49045283255263e-07, "loss": 0.000200706347823143, "reward": 1.5717219511667888, "reward_std": 0.20291295647621155, "rewards/FidelityReward/mean": 0.7117828925450643, "rewards/FidelityReward/std": 0.2123639831940333, "rewards/JudgeFidelityReward/mean": 0.7237844268480936, "rewards/JudgeFidelityReward/std": 0.20994813740253448, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.05018910765647888, "step": 1075 }, { "clip_ratio/high_max": 0.001955709746107459, "clip_ratio/high_mean": 0.0002380012534558773, "clip_ratio/low_mean": 0.00013913549191784113, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003771367482841015, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 78.1669921875, "completions/min_length": 37.0, "epoch": 2.1774193548387095, "frac_reward_zero_std": 0.03125, "grad_norm": 2.0625, "kl": 0.003961235657334328, "learning_rate": 6.458599395450614e-07, "loss": 0.0001941818161867559, "reward": 1.62100088596344, "reward_std": 0.19864284992218018, "rewards/FidelityReward/mean": 0.7454710900783539, "rewards/FidelityReward/std": 0.20115382969379425, "rewards/JudgeFidelityReward/mean": 0.751059502363205, "rewards/JudgeFidelityReward/std": 0.19223807752132416, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0006552981678396464, "clip_ratio/high_mean": 4.511146762524732e-05, "clip_ratio/low_mean": 0.00011154391831951216, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015665538667235522, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/mean_length": 75.92057291666667, "completions/min_length": 36.0, "epoch": 2.1875, "frac_reward_zero_std": 0.0625, "grad_norm": 2.109375, "kl": 0.003923094319179654, "learning_rate": 6.426681121245527e-07, "loss": 0.00018363459967076778, "reward": 1.6355974674224854, "reward_std": 0.18435425062974295, "rewards/FidelityReward/mean": 0.7557785709698995, "rewards/FidelityReward/std": 0.19920664529005686, "rewards/JudgeFidelityReward/mean": 0.7648460070292155, "rewards/JudgeFidelityReward/std": 0.1887151449918747, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06957309817274411, "step": 1085 }, { "clip_ratio/high_max": 0.0014727634843438864, "clip_ratio/high_mean": 0.00016535782197024673, "clip_ratio/low_mean": 7.295819523278624e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023831601138226687, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/mean_length": 75.8388671875, "completions/min_length": 34.0, "epoch": 2.1975806451612905, "frac_reward_zero_std": 0.046875, "grad_norm": 2.40625, "kl": 0.004085670178756118, "learning_rate": 6.394699428756298e-07, "loss": 0.00029923536349087956, "reward": 1.5891342759132385, "reward_std": 0.2088397592306137, "rewards/FidelityReward/mean": 0.7274193167686462, "rewards/FidelityReward/std": 0.2073017656803131, "rewards/JudgeFidelityReward/mean": 0.7253831028938293, "rewards/JudgeFidelityReward/std": 0.18734777718782425, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1090 }, { "clip_ratio/high_max": 0.0007816794095560908, "clip_ratio/high_mean": 9.039028955157847e-05, "clip_ratio/low_mean": 8.150613866746426e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017189642530865968, "completions/clipped_ratio": 0.0, "completions/max_length": 239.66666666666666, "completions/mean_length": 75.99283854166667, "completions/min_length": 32.333333333333336, "epoch": 2.2076612903225805, "frac_reward_zero_std": 0.078125, "grad_norm": 2.125, "kl": 0.003770432248711586, "learning_rate": 6.362655739620905e-07, "loss": 0.00019237929955124856, "reward": 1.635778506596883, "reward_std": 0.1844893842935562, "rewards/FidelityReward/mean": 0.7558241287867228, "rewards/FidelityReward/std": 0.19919201731681824, "rewards/JudgeFidelityReward/mean": 0.7618616819381714, "rewards/JudgeFidelityReward/std": 0.1873012234767278, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1095 }, { "clip_ratio/high_max": 0.0017190649174153804, "clip_ratio/high_mean": 0.00021361244725994765, "clip_ratio/low_mean": 7.452955906046554e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002881420077756047, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 73.892578125, "completions/min_length": 33.0, "epoch": 2.217741935483871, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.203125, "kl": 0.0038210734259337188, "learning_rate": 6.330551478233171e-07, "loss": 0.0002038647886365652, "reward": 1.627172589302063, "reward_std": 0.1786351576447487, "rewards/FidelityReward/mean": 0.7615784108638763, "rewards/FidelityReward/std": 0.19472111016511917, "rewards/JudgeFidelityReward/mean": 0.7331413626670837, "rewards/JudgeFidelityReward/std": 0.18391916155815125, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1100 }, { "clip_ratio/high_max": 0.0011371917556971312, "clip_ratio/high_mean": 0.0001939309760928154, "clip_ratio/low_mean": 6.332133780233562e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025725230807438493, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 77.95442708333333, "completions/min_length": 34.333333333333336, "epoch": 2.2278225806451615, "frac_reward_zero_std": 0.03125, "grad_norm": 2.0, "kl": 0.003998400969430804, "learning_rate": 6.298388071679464e-07, "loss": 0.00031229108572006226, "reward": 1.6338104406992595, "reward_std": 0.19927958647410074, "rewards/FidelityReward/mean": 0.755238930384318, "rewards/FidelityReward/std": 0.1957712024450302, "rewards/JudgeFidelityReward/mean": 0.7603981494903564, "rewards/JudgeFidelityReward/std": 0.1852752019961675, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 1105 }, { "clip_ratio/high_max": 0.0017641603481024504, "clip_ratio/high_mean": 0.00019094158487860114, "clip_ratio/low_mean": 0.00016291850915877147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003538600984029472, "completions/clipped_ratio": 0.0, "completions/max_length": 205.5, "completions/mean_length": 75.0078125, "completions/min_length": 32.5, "epoch": 2.2379032258064515, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.0625, "kl": 0.003928785864263773, "learning_rate": 6.26616694967524e-07, "loss": 0.0001853067660704255, "reward": 1.6260218620300293, "reward_std": 0.19525451213121414, "rewards/FidelityReward/mean": 0.7489786744117737, "rewards/FidelityReward/std": 0.20333216339349747, "rewards/JudgeFidelityReward/mean": 0.7579925060272217, "rewards/JudgeFidelityReward/std": 0.1769309714436531, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1110 }, { "clip_ratio/high_max": 0.0014755658805370331, "clip_ratio/high_mean": 0.00013092489971313625, "clip_ratio/low_mean": 4.622387350536883e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017714877030812205, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 75.94856770833333, "completions/min_length": 31.333333333333332, "epoch": 2.247983870967742, "frac_reward_zero_std": 0.046875, "grad_norm": 2.34375, "kl": 0.004047081246972084, "learning_rate": 6.233889544501505e-07, "loss": 0.00012679467909038067, "reward": 1.6100223461786907, "reward_std": 0.20327386756738028, "rewards/FidelityReward/mean": 0.7382295330365499, "rewards/FidelityReward/std": 0.19814724723498026, "rewards/JudgeFidelityReward/mean": 0.7468408147493998, "rewards/JudgeFidelityReward/std": 0.17906107505162558, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1115 }, { "clip_ratio/high_max": 0.001705309539102018, "clip_ratio/high_mean": 0.00034519172622822226, "clip_ratio/low_mean": 0.00016599826631136239, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005111900041811168, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 78.5576171875, "completions/min_length": 34.0, "epoch": 2.258064516129032, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.125, "kl": 0.004067629436030984, "learning_rate": 6.201557290941144e-07, "loss": 3.1643983675166965e-05, "reward": 1.619163155555725, "reward_std": 0.19626711308956146, "rewards/FidelityReward/mean": 0.749004065990448, "rewards/FidelityReward/std": 0.1996472403407097, "rewards/JudgeFidelityReward/mean": 0.7461776435375214, "rewards/JudgeFidelityReward/std": 0.17344991117715836, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 1120 }, { "clip_ratio/high_max": 0.0012413985561579466, "clip_ratio/high_mean": 0.0001468755363021046, "clip_ratio/low_mean": 5.899521929677576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002058707585092634, "completions/clipped_ratio": 0.0, "completions/max_length": 228.66666666666666, "completions/mean_length": 77.15755208333333, "completions/min_length": 34.666666666666664, "epoch": 2.2681451612903225, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.8828125, "kl": 0.00394522724673152, "learning_rate": 6.169171626215136e-07, "loss": 0.00022801547311246395, "reward": 1.6393024921417236, "reward_std": 0.17886992792288461, "rewards/FidelityReward/mean": 0.7574363748232523, "rewards/FidelityReward/std": 0.21178497870763144, "rewards/JudgeFidelityReward/mean": 0.7656852801640829, "rewards/JudgeFidelityReward/std": 0.19959395627180734, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1125 }, { "clip_ratio/high_max": 0.0016466637142002582, "clip_ratio/high_mean": 0.0002581905631814152, "clip_ratio/low_mean": 0.00010739476056187414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036558533320203425, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 78.2451171875, "completions/min_length": 32.5, "epoch": 2.278225806451613, "frac_reward_zero_std": 0.0625, "grad_norm": 2.046875, "kl": 0.0042883620597422125, "learning_rate": 6.136733989918681e-07, "loss": 0.0002496554981917143, "reward": 1.6860529780387878, "reward_std": 0.18303436040878296, "rewards/FidelityReward/mean": 0.7871854305267334, "rewards/FidelityReward/std": 0.1833813264966011, "rewards/JudgeFidelityReward/mean": 0.8035945296287537, "rewards/JudgeFidelityReward/std": 0.16155581176280975, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 1130 }, { "clip_ratio/high_max": 0.000961717707104981, "clip_ratio/high_mean": 0.00013576634810306132, "clip_ratio/low_mean": 8.230856037698686e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021807491430081427, "completions/clipped_ratio": 0.0, "completions/max_length": 210.33333333333334, "completions/mean_length": 76.37890625, "completions/min_length": 33.666666666666664, "epoch": 2.288306451612903, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.8984375, "kl": 0.004005639906972646, "learning_rate": 6.104245823957191e-07, "loss": 5.5282143875956535e-05, "reward": 1.6538995901743572, "reward_std": 0.18789859116077423, "rewards/FidelityReward/mean": 0.7738338907559713, "rewards/FidelityReward/std": 0.19339367747306824, "rewards/JudgeFidelityReward/mean": 0.7627354860305786, "rewards/JudgeFidelityReward/std": 0.19526704649130502, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1135 }, { "clip_ratio/high_max": 0.0010193816153332591, "clip_ratio/high_mean": 0.00012516858405433595, "clip_ratio/low_mean": 0.00012052712845616043, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024569571833126246, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 78.42578125, "completions/min_length": 35.5, "epoch": 2.2983870967741935, "frac_reward_zero_std": 0.09375, "grad_norm": 2.109375, "kl": 0.0040420506615191695, "learning_rate": 6.071708572482211e-07, "loss": 0.00022310172207653522, "reward": 1.652437448501587, "reward_std": 0.1808178946375847, "rewards/FidelityReward/mean": 0.76431804895401, "rewards/FidelityReward/std": 0.21837111562490463, "rewards/JudgeFidelityReward/mean": 0.7811216115951538, "rewards/JudgeFidelityReward/std": 0.20542237162590027, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1140 }, { "clip_ratio/high_max": 0.0015832424629479646, "clip_ratio/high_mean": 0.0002674597664736211, "clip_ratio/low_mean": 0.00014282108168117702, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004102808656170964, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/mean_length": 74.23502604166667, "completions/min_length": 33.666666666666664, "epoch": 2.308467741935484, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.0625, "kl": 0.0038639677222818136, "learning_rate": 6.039123681827209e-07, "loss": 0.00021673091687262057, "reward": 1.6095959345499675, "reward_std": 0.19910452763239542, "rewards/FidelityReward/mean": 0.7376670042673746, "rewards/FidelityReward/std": 0.20652670661608377, "rewards/JudgeFidelityReward/mean": 0.7484151124954224, "rewards/JudgeFidelityReward/std": 0.1879753371079763, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 1145 }, { "clip_ratio/high_max": 0.001448482647538185, "clip_ratio/high_mean": 0.00022734622471034526, "clip_ratio/low_mean": 8.726036612642929e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003146065864712, "completions/clipped_ratio": 0.0, "completions/max_length": 224.5, "completions/mean_length": 73.9404296875, "completions/min_length": 35.5, "epoch": 2.318548387096774, "frac_reward_zero_std": 0.078125, "grad_norm": 1.9453125, "kl": 0.004115621140226722, "learning_rate": 6.0064926004433e-07, "loss": 0.0001581474090926349, "reward": 1.598253846168518, "reward_std": 0.18969058990478516, "rewards/FidelityReward/mean": 0.7252065241336823, "rewards/FidelityReward/std": 0.22106246650218964, "rewards/JudgeFidelityReward/mean": 0.749024361371994, "rewards/JudgeFidelityReward/std": 0.19698620587587357, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1150 }, { "clip_ratio/high_max": 0.001056085992604494, "clip_ratio/high_mean": 0.00010996404744219035, "clip_ratio/low_mean": 4.1599464748287576e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015156350564211608, "completions/clipped_ratio": 0.0, "completions/max_length": 194.66666666666666, "completions/mean_length": 74.12825520833333, "completions/min_length": 31.333333333333332, "epoch": 2.3286290322580645, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.03125, "kl": 0.00379061852581799, "learning_rate": 5.973816778834844e-07, "loss": 0.0002689690096303821, "reward": 1.6323726574579875, "reward_std": 0.19148909052213034, "rewards/FidelityReward/mean": 0.7556751767794291, "rewards/FidelityReward/std": 0.20024735728899637, "rewards/JudgeFidelityReward/mean": 0.7586031953493754, "rewards/JudgeFidelityReward/std": 0.18215694030125937, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.058276752630869545, "step": 1155 }, { "clip_ratio/high_max": 0.0020429604221135376, "clip_ratio/high_mean": 0.0002727884217165411, "clip_ratio/low_mean": 0.00018015365349128843, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045294207520782945, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 76.9169921875, "completions/min_length": 33.5, "epoch": 2.338709677419355, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.9765625, "kl": 0.004038735199719668, "learning_rate": 5.941097669494979e-07, "loss": 4.909478011541069e-05, "reward": 1.6345115900039673, "reward_std": 0.18828973174095154, "rewards/FidelityReward/mean": 0.7605316340923309, "rewards/FidelityReward/std": 0.19516263902187347, "rewards/JudgeFidelityReward/mean": 0.7518661320209503, "rewards/JudgeFidelityReward/std": 0.18083325773477554, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1160 }, { "clip_ratio/high_max": 0.001486968621611595, "clip_ratio/high_mean": 0.0002373912138864398, "clip_ratio/low_mean": 6.732655019732192e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003047177568078041, "completions/clipped_ratio": 0.0, "completions/max_length": 224.33333333333334, "completions/mean_length": 76.693359375, "completions/min_length": 34.0, "epoch": 2.348790322580645, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.9609375, "kl": 0.003972074761986732, "learning_rate": 5.908336726841053e-07, "loss": 0.00013052780414000154, "reward": 1.6273849805196126, "reward_std": 0.18839235603809357, "rewards/FidelityReward/mean": 0.7483519514401754, "rewards/FidelityReward/std": 0.20570963124434152, "rewards/JudgeFidelityReward/mean": 0.7593680818875631, "rewards/JudgeFidelityReward/std": 0.19777134557565054, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 1165 }, { "clip_ratio/high_max": 0.001976414816454053, "clip_ratio/high_mean": 0.00029161740676499905, "clip_ratio/low_mean": 8.213704495574348e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037375446408987044, "completions/clipped_ratio": 0.0, "completions/max_length": 183.5, "completions/mean_length": 75.8896484375, "completions/min_length": 31.0, "epoch": 2.3588709677419355, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.078125, "kl": 0.004172731656581163, "learning_rate": 5.875535407149974e-07, "loss": 0.0002075322438031435, "reward": 1.5782678127288818, "reward_std": 0.21137062460184097, "rewards/FidelityReward/mean": 0.7136732339859009, "rewards/FidelityReward/std": 0.2176637351512909, "rewards/JudgeFidelityReward/mean": 0.7330954670906067, "rewards/JudgeFidelityReward/std": 0.1945890635251999, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1170 }, { "clip_ratio/high_max": 0.0012057278770953416, "clip_ratio/high_mean": 0.0001296800561249256, "clip_ratio/low_mean": 7.074720633681864e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002004272653721273, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 75.62239583333333, "completions/min_length": 33.666666666666664, "epoch": 2.368951612903226, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 1.8984375, "kl": 0.0039931369945406915, "learning_rate": 5.842695168493474e-07, "loss": 0.00022138431668281556, "reward": 1.6087760925292969, "reward_std": 0.1893930733203888, "rewards/FidelityReward/mean": 0.7396077315012614, "rewards/FidelityReward/std": 0.20465249319871268, "rewards/JudgeFidelityReward/mean": 0.7428940931955973, "rewards/JudgeFidelityReward/std": 0.18747848272323608, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 1175 }, { "clip_ratio/high_max": 0.0018541623838245868, "clip_ratio/high_mean": 0.00022886617807671428, "clip_ratio/low_mean": 0.00010364708432462067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000332513265311718, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 78.517578125, "completions/min_length": 31.0, "epoch": 2.379032258064516, "frac_reward_zero_std": 0.046875, "grad_norm": 2.328125, "kl": 0.00415586531162262, "learning_rate": 5.809817470673293e-07, "loss": 0.00020301577169448137, "reward": 1.6150477528572083, "reward_std": 0.1970694363117218, "rewards/FidelityReward/mean": 0.7398036122322083, "rewards/FidelityReward/std": 0.20202817022800446, "rewards/JudgeFidelityReward/mean": 0.7524412274360657, "rewards/JudgeFidelityReward/std": 0.18259839713573456, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1180 }, { "clip_ratio/high_max": 0.001168646290898323, "clip_ratio/high_mean": 0.00012102309556212276, "clip_ratio/low_mean": 6.86777348164469e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001897008274681866, "completions/clipped_ratio": 0.0, "completions/max_length": 258.3333333333333, "completions/mean_length": 74.15494791666667, "completions/min_length": 33.666666666666664, "epoch": 2.3891129032258065, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.328125, "kl": 0.004007793264463544, "learning_rate": 5.776903775156293e-07, "loss": 0.0002864662557840347, "reward": 1.5846998294194539, "reward_std": 0.1885654628276825, "rewards/FidelityReward/mean": 0.7179255882898966, "rewards/FidelityReward/std": 0.21420817077159882, "rewards/JudgeFidelityReward/mean": 0.7355016271273295, "rewards/JudgeFidelityReward/std": 0.185079092780749, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1185 }, { "clip_ratio/high_max": 0.0018186116591095924, "clip_ratio/high_mean": 0.00022004894562996924, "clip_ratio/low_mean": 0.00010412580595584587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032417475013062357, "completions/clipped_ratio": 0.0, "completions/max_length": 258.5, "completions/mean_length": 76.15625, "completions/min_length": 32.0, "epoch": 2.399193548387097, "frac_reward_zero_std": 0.078125, "grad_norm": 2.125, "kl": 0.004042453784495592, "learning_rate": 5.743955545009497e-07, "loss": 0.00018353264313191174, "reward": 1.622643530368805, "reward_std": 0.19730082154273987, "rewards/FidelityReward/mean": 0.745408296585083, "rewards/FidelityReward/std": 0.2113134264945984, "rewards/JudgeFidelityReward/mean": 0.7603297829627991, "rewards/JudgeFidelityReward/std": 0.18461614847183228, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07131390832364559, "step": 1190 }, { "clip_ratio/high_max": 0.00101265087723732, "clip_ratio/high_mean": 0.00015373491332866253, "clip_ratio/low_mean": 6.650528230238706e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002202401985414326, "completions/clipped_ratio": 0.0, "completions/max_length": 230.33333333333334, "completions/mean_length": 76.74544270833333, "completions/min_length": 37.0, "epoch": 2.409274193548387, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.984375, "kl": 0.00407120524905622, "learning_rate": 5.71097424483504e-07, "loss": 0.00020046853460371495, "reward": 1.6250707705815632, "reward_std": 0.18555200596650442, "rewards/FidelityReward/mean": 0.7526975274085999, "rewards/FidelityReward/std": 0.2049987961848577, "rewards/JudgeFidelityReward/mean": 0.7473506530125936, "rewards/JudgeFidelityReward/std": 0.19419310986995697, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1195 }, { "clip_ratio/high_max": 0.0017558083636686206, "clip_ratio/high_mean": 0.00027277281042188407, "clip_ratio/low_mean": 5.084213989903219e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003236149437725544, "completions/clipped_ratio": 0.0, "completions/max_length": 222.5, "completions/mean_length": 76.7939453125, "completions/min_length": 33.0, "epoch": 2.4193548387096775, "frac_reward_zero_std": 0.0625, "grad_norm": 2.171875, "kl": 0.0042100260499864815, "learning_rate": 5.677961340705076e-07, "loss": 9.343413403257727e-05, "reward": 1.6345489025115967, "reward_std": 0.19202379882335663, "rewards/FidelityReward/mean": 0.7495194971561432, "rewards/FidelityReward/std": 0.20569830387830734, "rewards/JudgeFidelityReward/mean": 0.7759182751178741, "rewards/JudgeFidelityReward/std": 0.17926941066980362, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 1200 }, { "clip_ratio/high_max": 0.0009713807143270969, "clip_ratio/high_mean": 0.0001011845946777612, "clip_ratio/low_mean": 9.744325070641935e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019862785702571273, "completions/clipped_ratio": 0.0, "completions/max_length": 242.66666666666666, "completions/mean_length": 76.03971354166667, "completions/min_length": 32.666666666666664, "epoch": 2.429435483870968, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.21875, "kl": 0.004171267151832581, "learning_rate": 5.64491830009661e-07, "loss": 0.00019867836963385343, "reward": 1.5873470306396484, "reward_std": 0.20212118327617645, "rewards/FidelityReward/mean": 0.7200810114542643, "rewards/FidelityReward/std": 0.21174512803554535, "rewards/JudgeFidelityReward/mean": 0.7377873063087463, "rewards/JudgeFidelityReward/std": 0.19503015776475272, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.05635726824402809, "step": 1205 }, { "clip_ratio/high_max": 0.0015574421733617783, "clip_ratio/high_mean": 0.0002383135724812746, "clip_ratio/low_mean": 0.0001492405979661271, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038755416171625255, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 76.779296875, "completions/min_length": 34.0, "epoch": 2.439516129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 2.015625, "kl": 0.004251551628112793, "learning_rate": 5.611846591826258e-07, "loss": 0.0002703075297176838, "reward": 1.6527222990989685, "reward_std": 0.18581219017505646, "rewards/FidelityReward/mean": 0.7680404484272003, "rewards/FidelityReward/std": 0.2088232859969139, "rewards/JudgeFidelityReward/mean": 0.7732698917388916, "rewards/JudgeFidelityReward/std": 0.1953643560409546, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1210 }, { "clip_ratio/high_max": 0.00149885849095881, "clip_ratio/high_mean": 0.00022524602245539428, "clip_ratio/low_mean": 0.00011095157242380083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000336197589058429, "completions/clipped_ratio": 0.0, "completions/max_length": 216.66666666666666, "completions/mean_length": 75.36848958333333, "completions/min_length": 33.333333333333336, "epoch": 2.4495967741935485, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.21875, "kl": 0.003954521054401994, "learning_rate": 5.578747685984962e-07, "loss": 0.00019917585887014867, "reward": 1.6113603909810383, "reward_std": 0.19419966638088226, "rewards/FidelityReward/mean": 0.7431813677151998, "rewards/FidelityReward/std": 0.2031117578347524, "rewards/JudgeFidelityReward/mean": 0.7389621535936991, "rewards/JudgeFidelityReward/std": 0.1995850751797358, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 1215 }, { "clip_ratio/high_max": 0.0018694492988288402, "clip_ratio/high_mean": 0.00026795403682626783, "clip_ratio/low_mean": 0.0001262016870896332, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039415573701262474, "completions/clipped_ratio": 0.0, "completions/max_length": 255.5, "completions/mean_length": 75.916015625, "completions/min_length": 34.0, "epoch": 2.4596774193548385, "frac_reward_zero_std": 0.078125, "grad_norm": 2.1875, "kl": 0.0039432439487427475, "learning_rate": 5.545623053872635e-07, "loss": 0.0002519825240597129, "reward": 1.6126078367233276, "reward_std": 0.19000792503356934, "rewards/FidelityReward/mean": 0.7397730350494385, "rewards/FidelityReward/std": 0.21533549576997757, "rewards/JudgeFidelityReward/mean": 0.7476226091384888, "rewards/JudgeFidelityReward/std": 0.21106699854135513, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1220 }, { "clip_ratio/high_max": 0.00118274400010705, "clip_ratio/high_mean": 0.00022184746339917182, "clip_ratio/low_mean": 6.491971143987029e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028676717774942515, "completions/clipped_ratio": 0.0, "completions/max_length": 231.66666666666666, "completions/mean_length": 74.68815104166667, "completions/min_length": 34.333333333333336, "epoch": 2.469758064516129, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.046875, "kl": 0.003994304314255714, "learning_rate": 5.512474167932772e-07, "loss": 0.00014052018523216247, "reward": 1.626383662223816, "reward_std": 0.18503318230311075, "rewards/FidelityReward/mean": 0.7501449982325236, "rewards/FidelityReward/std": 0.2151230275630951, "rewards/JudgeFidelityReward/mean": 0.7544304728507996, "rewards/JudgeFidelityReward/std": 0.2030464013417562, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1225 }, { "clip_ratio/high_max": 0.0011840266408398748, "clip_ratio/high_mean": 0.00012400326668284833, "clip_ratio/low_mean": 0.00011155269748996943, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023555596126243473, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/mean_length": 75.6220703125, "completions/min_length": 32.5, "epoch": 2.4798387096774195, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.046875, "kl": 0.0042898344807326795, "learning_rate": 5.479302501686988e-07, "loss": 0.0001955177169293165, "reward": 1.6357569694519043, "reward_std": 0.16987250000238419, "rewards/FidelityReward/mean": 0.7529344856739044, "rewards/FidelityReward/std": 0.2019934430718422, "rewards/JudgeFidelityReward/mean": 0.7675982117652893, "rewards/JudgeFidelityReward/std": 0.1862371414899826, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1230 }, { "clip_ratio/high_max": 0.0012175099458545447, "clip_ratio/high_mean": 0.00016005228972062468, "clip_ratio/low_mean": 0.00010704685118980705, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026709914673119786, "completions/clipped_ratio": 0.0, "completions/max_length": 246.33333333333334, "completions/mean_length": 75.89127604166667, "completions/min_length": 32.333333333333336, "epoch": 2.4899193548387095, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.9765625, "kl": 0.004089225549250841, "learning_rate": 5.446109529669514e-07, "loss": 0.0002096462296321988, "reward": 1.584494670232137, "reward_std": 0.18736141920089722, "rewards/FidelityReward/mean": 0.7191540002822876, "rewards/FidelityReward/std": 0.23607722421487173, "rewards/JudgeFidelityReward/mean": 0.734587570031484, "rewards/JudgeFidelityReward/std": 0.2206701139609019, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 1235 }, { "clip_ratio/high_max": 0.0019856200087815523, "clip_ratio/high_mean": 0.00022633259650319814, "clip_ratio/low_mean": 6.415263633243739e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029048521537333727, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/mean_length": 74.67578125, "completions/min_length": 31.5, "epoch": 2.5, "frac_reward_zero_std": 0.078125, "grad_norm": 2.078125, "kl": 0.003971409471705556, "learning_rate": 5.412896727361662e-07, "loss": 0.0002903250977396965, "reward": 1.6122623085975647, "reward_std": 0.18855786323547363, "rewards/FidelityReward/mean": 0.7346155941486359, "rewards/FidelityReward/std": 0.2120228260755539, "rewards/JudgeFidelityReward/mean": 0.7562699317932129, "rewards/JudgeFidelityReward/std": 0.1823311373591423, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 1240 }, { "clip_ratio/high_max": 0.0011752365622669458, "clip_ratio/high_mean": 0.0001707627554424107, "clip_ratio/low_mean": 7.956549088703469e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002503282274119556, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 75.28645833333333, "completions/min_length": 34.666666666666664, "epoch": 2.5100806451612905, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.015625, "kl": 0.004041039431467652, "learning_rate": 5.379665571126231e-07, "loss": 0.00022261966951191424, "reward": 1.5594536066055298, "reward_std": 0.20122846961021423, "rewards/FidelityReward/mean": 0.7050938010215759, "rewards/FidelityReward/std": 0.21880732476711273, "rewards/JudgeFidelityReward/mean": 0.7113237778345743, "rewards/JudgeFidelityReward/std": 0.20302729805310568, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1245 }, { "clip_ratio/high_max": 0.0006442815531045199, "clip_ratio/high_mean": 6.540071917697788e-05, "clip_ratio/low_mean": 9.258278150809929e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015798349777469413, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/mean_length": 76.1279296875, "completions/min_length": 35.5, "epoch": 2.5201612903225805, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.265625, "kl": 0.004131958028301597, "learning_rate": 5.346417538141884e-07, "loss": 0.00016517448239028454, "reward": 1.6102375984191895, "reward_std": 0.1950821802020073, "rewards/FidelityReward/mean": 0.7393851280212402, "rewards/FidelityReward/std": 0.20553827285766602, "rewards/JudgeFidelityReward/mean": 0.7436580955982208, "rewards/JudgeFidelityReward/std": 0.18767570704221725, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1250 }, { "clip_ratio/high_max": 0.000919647328555584, "clip_ratio/high_mean": 0.00011848229623865337, "clip_ratio/low_mean": 6.526495781145059e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018374724895693363, "completions/clipped_ratio": 0.0, "completions/max_length": 236.33333333333334, "completions/mean_length": 76.46809895833333, "completions/min_length": 32.333333333333336, "epoch": 2.530241935483871, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.875, "kl": 0.0038406784180551766, "learning_rate": 5.313154106337479e-07, "loss": 0.00020625714678317307, "reward": 1.608141303062439, "reward_std": 0.1885699232419332, "rewards/FidelityReward/mean": 0.7355509996414185, "rewards/FidelityReward/std": 0.19914395610491434, "rewards/JudgeFidelityReward/mean": 0.747133711973826, "rewards/JudgeFidelityReward/std": 0.18609130382537842, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1255 }, { "clip_ratio/high_max": 0.001807389734312892, "clip_ratio/high_mean": 0.00014976813399698586, "clip_ratio/low_mean": 0.00022029713145457208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003700652508996427, "completions/clipped_ratio": 0.0, "completions/max_length": 227.5, "completions/mean_length": 77.0546875, "completions/min_length": 33.5, "epoch": 2.540322580645161, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.125, "kl": 0.0041562768165022135, "learning_rate": 5.279876754326379e-07, "loss": 0.00016445426736027, "reward": 1.6224738359451294, "reward_std": 0.20232058316469193, "rewards/FidelityReward/mean": 0.7475622296333313, "rewards/FidelityReward/std": 0.19559484720230103, "rewards/JudgeFidelityReward/mean": 0.7547059059143066, "rewards/JudgeFidelityReward/std": 0.17746661603450775, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1260 }, { "clip_ratio/high_max": 0.0014724673237651587, "clip_ratio/high_mean": 0.00020660451846197248, "clip_ratio/low_mean": 5.3482785006053746e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026008731219917536, "completions/clipped_ratio": 0.0, "completions/max_length": 212.66666666666666, "completions/mean_length": 75.04947916666667, "completions/min_length": 32.0, "epoch": 2.5504032258064515, "frac_reward_zero_std": 0.0625, "grad_norm": 2.109375, "kl": 0.004146043490618467, "learning_rate": 5.246586961340722e-07, "loss": 0.00021398193202912808, "reward": 1.5934348901112874, "reward_std": 0.2081620047489802, "rewards/FidelityReward/mean": 0.7304923931757609, "rewards/FidelityReward/std": 0.20077245434125265, "rewards/JudgeFidelityReward/mean": 0.7304422458012899, "rewards/JudgeFidelityReward/std": 0.1836376190185547, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1265 }, { "clip_ratio/high_max": 0.0015119124203920365, "clip_ratio/high_mean": 0.00023361165076494217, "clip_ratio/low_mean": 0.00011325642262818291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034686807193793354, "completions/clipped_ratio": 0.0, "completions/max_length": 237.5, "completions/mean_length": 76.3291015625, "completions/min_length": 31.5, "epoch": 2.560483870967742, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.0625, "kl": 0.004297358728945255, "learning_rate": 5.213286207165668e-07, "loss": 0.00015782720874994992, "reward": 1.6309803128242493, "reward_std": 0.18276267498731613, "rewards/FidelityReward/mean": 0.7508270442485809, "rewards/FidelityReward/std": 0.21517881006002426, "rewards/JudgeFidelityReward/mean": 0.7622598111629486, "rewards/JudgeFidelityReward/std": 0.20320573449134827, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1270 }, { "clip_ratio/high_max": 0.0006847576703876257, "clip_ratio/high_mean": 9.482699679210782e-05, "clip_ratio/low_mean": 6.330841788440011e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015813541831448673, "completions/clipped_ratio": 0.0, "completions/max_length": 219.66666666666666, "completions/mean_length": 76.689453125, "completions/min_length": 35.0, "epoch": 2.570564516129032, "frac_reward_zero_std": 0.0625, "grad_norm": 1.9921875, "kl": 0.003935589408501983, "learning_rate": 5.17997597207362e-07, "loss": 0.00014174774987623094, "reward": 1.645845611890157, "reward_std": 0.18302328884601593, "rewards/FidelityReward/mean": 0.7625869711240133, "rewards/FidelityReward/std": 0.2020961195230484, "rewards/JudgeFidelityReward/mean": 0.7678193847338358, "rewards/JudgeFidelityReward/std": 0.19666440784931183, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 1275 }, { "clip_ratio/high_max": 0.002406332455575466, "clip_ratio/high_mean": 0.00028177049243822695, "clip_ratio/low_mean": 5.869708547834307e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003404675982892513, "completions/clipped_ratio": 0.0, "completions/max_length": 184.5, "completions/mean_length": 75.3173828125, "completions/min_length": 34.0, "epoch": 2.5806451612903225, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.09375, "kl": 0.004158969270065427, "learning_rate": 5.146657736758416e-07, "loss": 0.00020622985903173685, "reward": 1.6190184354782104, "reward_std": 0.1849580854177475, "rewards/FidelityReward/mean": 0.7439762353897095, "rewards/FidelityReward/std": 0.20154369622468948, "rewards/JudgeFidelityReward/mean": 0.75203737616539, "rewards/JudgeFidelityReward/std": 0.1827543005347252, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1280 }, { "clip_ratio/high_max": 0.001177066331729293, "clip_ratio/high_mean": 0.00015517869032919407, "clip_ratio/low_mean": 3.982668422395363e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000195005361456424, "completions/clipped_ratio": 0.0, "completions/max_length": 206.33333333333334, "completions/mean_length": 76.43489583333333, "completions/min_length": 30.666666666666668, "epoch": 2.590725806451613, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.171875, "kl": 0.004059714451432228, "learning_rate": 5.113332982269531e-07, "loss": 0.00023208358325064183, "reward": 1.6373066902160645, "reward_std": 0.18666942914326987, "rewards/FidelityReward/mean": 0.7588116327921549, "rewards/FidelityReward/std": 0.20953286190827689, "rewards/JudgeFidelityReward/mean": 0.7589431802431742, "rewards/JudgeFidelityReward/std": 0.20983371138572693, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1285 }, { "clip_ratio/high_max": 0.0016786745749413968, "clip_ratio/high_mean": 0.0002632344840094447, "clip_ratio/low_mean": 0.000140303406806197, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040353788062930107, "completions/clipped_ratio": 0.0, "completions/max_length": 208.5, "completions/mean_length": 74.9775390625, "completions/min_length": 32.0, "epoch": 2.600806451612903, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.0625, "kl": 0.004074402805417776, "learning_rate": 5.080003189946217e-07, "loss": 0.00023489780724048615, "reward": 1.6133909225463867, "reward_std": 0.18759847432374954, "rewards/FidelityReward/mean": 0.7410119771957397, "rewards/FidelityReward/std": 0.22037193924188614, "rewards/JudgeFidelityReward/mean": 0.7476876974105835, "rewards/JudgeFidelityReward/std": 0.2081737071275711, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1290 }, { "clip_ratio/high_max": 0.0013543811161071061, "clip_ratio/high_mean": 0.00018954003462567925, "clip_ratio/low_mean": 0.00013935374445281923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003288937848992646, "completions/clipped_ratio": 0.0, "completions/max_length": 217.66666666666666, "completions/mean_length": 76.38997395833333, "completions/min_length": 31.666666666666668, "epoch": 2.6108870967741935, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.921875, "kl": 0.0040563365910202265, "learning_rate": 5.046669841351669e-07, "loss": 0.0002563635585829616, "reward": 1.6439087390899658, "reward_std": 0.18511570990085602, "rewards/FidelityReward/mean": 0.7639424999554952, "rewards/FidelityReward/std": 0.20038287341594696, "rewards/JudgeFidelityReward/mean": 0.7618856430053711, "rewards/JudgeFidelityReward/std": 0.19174685080846152, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1295 }, { "clip_ratio/high_max": 0.0012524041580036283, "clip_ratio/high_mean": 0.00020834769820794464, "clip_ratio/low_mean": 0.00013950064894743263, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034784835879690944, "completions/clipped_ratio": 0.0, "completions/max_length": 186.5, "completions/mean_length": 75.81640625, "completions/min_length": 34.0, "epoch": 2.620967741935484, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.0, "kl": 0.004086900036782026, "learning_rate": 5.013334418207161e-07, "loss": 0.00022091297432780266, "reward": 1.6334959864616394, "reward_std": 0.18125726282596588, "rewards/FidelityReward/mean": 0.7540847659111023, "rewards/FidelityReward/std": 0.19905924797058105, "rewards/JudgeFidelityReward/mean": 0.7617520689964294, "rewards/JudgeFidelityReward/std": 0.1795806810259819, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1300 }, { "clip_ratio/high_max": 0.0011311616515740752, "clip_ratio/high_mean": 0.00015626746462658048, "clip_ratio/low_mean": 8.401617233175785e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024028364568948746, "completions/clipped_ratio": 0.0, "completions/max_length": 267.6666666666667, "completions/mean_length": 76.82096354166667, "completions/min_length": 33.333333333333336, "epoch": 2.631048387096774, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 1.953125, "kl": 0.004132726974785328, "learning_rate": 4.979998402326191e-07, "loss": 0.00022887408267706632, "reward": 1.6234522263209026, "reward_std": 0.1875787079334259, "rewards/FidelityReward/mean": 0.7483321030934652, "rewards/FidelityReward/std": 0.1966790407896042, "rewards/JudgeFidelityReward/mean": 0.7521934310595194, "rewards/JudgeFidelityReward/std": 0.18880520264307657, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1305 }, { "clip_ratio/high_max": 0.0016512760426849126, "clip_ratio/high_mean": 0.000227423821343109, "clip_ratio/low_mean": 8.27042618766427e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003101280773989856, "completions/clipped_ratio": 0.0, "completions/max_length": 245.5, "completions/mean_length": 73.15234375, "completions/min_length": 32.5, "epoch": 2.6411290322580645, "frac_reward_zero_std": 0.109375, "grad_norm": 1.90625, "kl": 0.0041272686794400215, "learning_rate": 4.946663275548599e-07, "loss": 0.00011036267969757318, "reward": 1.6004329919815063, "reward_std": 0.18269993364810944, "rewards/FidelityReward/mean": 0.7342641949653625, "rewards/FidelityReward/std": 0.20979418605566025, "rewards/JudgeFidelityReward/mean": 0.7362437844276428, "rewards/JudgeFidelityReward/std": 0.19679360836744308, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1310 }, { "clip_ratio/high_max": 0.001079999189823866, "clip_ratio/high_mean": 0.00015166348312050103, "clip_ratio/low_mean": 9.889643406495452e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025055991718545554, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 76.09830729166667, "completions/min_length": 34.0, "epoch": 2.651209677419355, "frac_reward_zero_std": 0.0625, "grad_norm": 1.875, "kl": 0.004177220445126295, "learning_rate": 4.913330519674705e-07, "loss": 9.18842270039022e-05, "reward": 1.6186986366907756, "reward_std": 0.1971902847290039, "rewards/FidelityReward/mean": 0.7431276440620422, "rewards/FidelityReward/std": 0.2063532421986262, "rewards/JudgeFidelityReward/mean": 0.7550482749938965, "rewards/JudgeFidelityReward/std": 0.19144298632939658, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 1315 }, { "clip_ratio/high_max": 0.0013649923726916312, "clip_ratio/high_mean": 0.0001987426890991628, "clip_ratio/low_mean": 0.00016996688791550695, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003687096061185002, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/mean_length": 78.15625, "completions/min_length": 34.0, "epoch": 2.661290322580645, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.046875, "kl": 0.00414623124524951, "learning_rate": 4.880001616399439e-07, "loss": 0.00013413092819973825, "reward": 1.6490442156791687, "reward_std": 0.17760057002305984, "rewards/FidelityReward/mean": 0.769759327173233, "rewards/FidelityReward/std": 0.19597623497247696, "rewards/JudgeFidelityReward/mean": 0.7634527087211609, "rewards/JudgeFidelityReward/std": 0.18448365479707718, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1320 }, { "clip_ratio/high_max": 0.001281939959153533, "clip_ratio/high_mean": 0.00017423772951588034, "clip_ratio/low_mean": 9.886280167847872e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002731005311943591, "completions/clipped_ratio": 0.0, "completions/max_length": 230.33333333333334, "completions/mean_length": 76.31770833333333, "completions/min_length": 32.0, "epoch": 2.6713709677419355, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.984375, "kl": 0.003936846693977714, "learning_rate": 4.846678047246475e-07, "loss": 0.00026547485031187535, "reward": 1.6253295342127483, "reward_std": 0.18792015810807547, "rewards/FidelityReward/mean": 0.7464190125465393, "rewards/FidelityReward/std": 0.19512197375297546, "rewards/JudgeFidelityReward/mean": 0.7617272337277731, "rewards/JudgeFidelityReward/std": 0.1760599265495936, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.058838951090971627, "step": 1325 }, { "clip_ratio/high_max": 0.001255348138511181, "clip_ratio/high_mean": 0.00024559958837926387, "clip_ratio/low_mean": 0.0001092630067432765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035486258566379546, "completions/clipped_ratio": 0.0, "completions/max_length": 199.5, "completions/mean_length": 75.8916015625, "completions/min_length": 37.0, "epoch": 2.681451612903226, "frac_reward_zero_std": 0.09375, "grad_norm": 2.140625, "kl": 0.004149646591395139, "learning_rate": 4.813361293502379e-07, "loss": 0.00014016155619174243, "reward": 1.5843898057937622, "reward_std": 0.1945435106754303, "rewards/FidelityReward/mean": 0.7265329957008362, "rewards/FidelityReward/std": 0.20995819568634033, "rewards/JudgeFidelityReward/mean": 0.7196198403835297, "rewards/JudgeFidelityReward/std": 0.21443425863981247, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1330 }, { "clip_ratio/high_max": 0.000807131826877594, "clip_ratio/high_mean": 0.00014574110973626376, "clip_ratio/low_mean": 0.0001405116927344352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028625280829146503, "completions/clipped_ratio": 0.0, "completions/max_length": 219.33333333333334, "completions/mean_length": 77.48372395833333, "completions/min_length": 33.333333333333336, "epoch": 2.691532258064516, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 1.875, "kl": 0.0044172054156661035, "learning_rate": 4.78005283615076e-07, "loss": 0.00015554854180663825, "reward": 1.5822635889053345, "reward_std": 0.20490855971972147, "rewards/FidelityReward/mean": 0.7214023073514303, "rewards/FidelityReward/std": 0.2216231127580007, "rewards/JudgeFidelityReward/mean": 0.7249776919682821, "rewards/JudgeFidelityReward/std": 0.2102701167265574, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1335 }, { "clip_ratio/high_max": 0.0021056531462818385, "clip_ratio/high_mean": 0.0003352659638039768, "clip_ratio/low_mean": 0.00010330479126423598, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043857075506821276, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 78.9521484375, "completions/min_length": 33.0, "epoch": 2.7016129032258065, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.078125, "kl": 0.004366598650813102, "learning_rate": 4.746754155806437e-07, "loss": 0.00025448151864111425, "reward": 1.673009216785431, "reward_std": 0.18151773512363434, "rewards/FidelityReward/mean": 0.781380832195282, "rewards/FidelityReward/std": 0.1936495155096054, "rewards/JudgeFidelityReward/mean": 0.7900926768779755, "rewards/JudgeFidelityReward/std": 0.1720784306526184, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.0822625607252121, "step": 1340 }, { "clip_ratio/high_max": 0.0013581624254584313, "clip_ratio/high_mean": 0.0001948063028976321, "clip_ratio/low_mean": 5.332855944288895e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002481348579749465, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 73.72526041666667, "completions/min_length": 34.666666666666664, "epoch": 2.711693548387097, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.34375, "kl": 0.00438658818602562, "learning_rate": 4.713466732649628e-07, "loss": 0.0003060012124478817, "reward": 1.6214604775110881, "reward_std": 0.1893476645151774, "rewards/FidelityReward/mean": 0.7468114097913107, "rewards/FidelityReward/std": 0.2078935702641805, "rewards/JudgeFidelityReward/mean": 0.7538554271062216, "rewards/JudgeFidelityReward/std": 0.1905961980422338, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 1345 }, { "clip_ratio/high_max": 0.0017459708265960216, "clip_ratio/high_mean": 0.0002765239682048559, "clip_ratio/low_mean": 0.00012359633692540228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040012031095102427, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 73.76171875, "completions/min_length": 35.5, "epoch": 2.721774193548387, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.9921875, "kl": 0.004219447448849678, "learning_rate": 4.6801920463601507e-07, "loss": 0.00032717203721404075, "reward": 1.6294910907745361, "reward_std": 0.18324125558137894, "rewards/FidelityReward/mean": 0.7526690065860748, "rewards/FidelityReward/std": 0.19607071578502655, "rewards/JudgeFidelityReward/mean": 0.756573885679245, "rewards/JudgeFidelityReward/std": 0.18053678423166275, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1350 }, { "clip_ratio/high_max": 0.0010849739890545606, "clip_ratio/high_mean": 0.00014202208258211612, "clip_ratio/low_mean": 4.9698005022946744e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019172008032910525, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 75.89322916666667, "completions/min_length": 33.333333333333336, "epoch": 2.7318548387096775, "frac_reward_zero_std": 0.0625, "grad_norm": 2.140625, "kl": 0.004231705609709024, "learning_rate": 4.6469315760516457e-07, "loss": 0.00010425058426335453, "reward": 1.6495884656906128, "reward_std": 0.18049381176630655, "rewards/FidelityReward/mean": 0.763097902139028, "rewards/FidelityReward/std": 0.19472646713256836, "rewards/JudgeFidelityReward/mean": 0.7762364347775778, "rewards/JudgeFidelityReward/std": 0.17515176037947336, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1355 }, { "clip_ratio/high_max": 0.0017543572466820478, "clip_ratio/high_mean": 0.0002345970890019089, "clip_ratio/low_mean": 0.00015383248100988566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038842958165332677, "completions/clipped_ratio": 0.0, "completions/max_length": 266.5, "completions/mean_length": 75.828125, "completions/min_length": 33.5, "epoch": 2.741935483870968, "frac_reward_zero_std": 0.0625, "grad_norm": 2.28125, "kl": 0.004214533418416977, "learning_rate": 4.6136868002058336e-07, "loss": 0.0002490297658368945, "reward": 1.6236250400543213, "reward_std": 0.18796207755804062, "rewards/FidelityReward/mean": 0.7479370832443237, "rewards/FidelityReward/std": 0.20082221925258636, "rewards/JudgeFidelityReward/mean": 0.7533290088176727, "rewards/JudgeFidelityReward/std": 0.1954842284321785, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1360 }, { "clip_ratio/high_max": 0.0012559148482978344, "clip_ratio/high_mean": 0.0001799147401470691, "clip_ratio/low_mean": 9.235724573954939e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002722719917073846, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 76.57682291666667, "completions/min_length": 34.333333333333336, "epoch": 2.752016129032258, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 1.9921875, "kl": 0.0043905024416744706, "learning_rate": 4.580459196606789e-07, "loss": 0.00013399471063166857, "reward": 1.6518789132436116, "reward_std": 0.19126567741235098, "rewards/FidelityReward/mean": 0.7691235144933065, "rewards/FidelityReward/std": 0.20238221685091654, "rewards/JudgeFidelityReward/mean": 0.770068089167277, "rewards/JudgeFidelityReward/std": 0.18914768596490225, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1365 }, { "clip_ratio/high_max": 0.0020986156072467566, "clip_ratio/high_mean": 0.00029167677275836467, "clip_ratio/low_mean": 0.00014684131019748747, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043851807713508607, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/mean_length": 80.3701171875, "completions/min_length": 32.5, "epoch": 2.7620967741935485, "frac_reward_zero_std": 0.0625, "grad_norm": 2.078125, "kl": 0.0043158245272934435, "learning_rate": 4.54725024227525e-07, "loss": 0.00015601858031004666, "reward": 1.6015700101852417, "reward_std": 0.2018570899963379, "rewards/FidelityReward/mean": 0.7326491177082062, "rewards/FidelityReward/std": 0.22266670316457748, "rewards/JudgeFidelityReward/mean": 0.7456543147563934, "rewards/JudgeFidelityReward/std": 0.20227206498384476, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.08812850713729858, "step": 1370 }, { "clip_ratio/high_max": 0.0011043770937249064, "clip_ratio/high_mean": 0.0001396089734043926, "clip_ratio/low_mean": 8.095649536699057e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022056546295061706, "completions/clipped_ratio": 0.0006510416666666666, "completions/max_length": 276.3333333333333, "completions/mean_length": 76.453125, "completions/min_length": 34.666666666666664, "epoch": 2.772177419354839, "frac_reward_zero_std": 0.109375, "grad_norm": 1.9296875, "kl": 0.00422217482700944, "learning_rate": 4.5140614134029715e-07, "loss": 0.00023827282711863518, "reward": 1.6325603723526, "reward_std": 0.18384760121504465, "rewards/FidelityReward/mean": 0.7576040228207906, "rewards/FidelityReward/std": 0.20014254252115884, "rewards/JudgeFidelityReward/mean": 0.751865824063619, "rewards/JudgeFidelityReward/std": 0.19187837839126587, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1375 }, { "clip_ratio/high_max": 0.0015779898967593908, "clip_ratio/high_mean": 0.0001862688106484711, "clip_ratio/low_mean": 0.00014399305655388162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033026186283677814, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 76.7568359375, "completions/min_length": 36.0, "epoch": 2.782258064516129, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.09375, "kl": 0.004331063618883491, "learning_rate": 4.480894185287091e-07, "loss": 7.186006987467409e-05, "reward": 1.633026361465454, "reward_std": 0.18889442831277847, "rewards/FidelityReward/mean": 0.7550650238990784, "rewards/FidelityReward/std": 0.19906890392303467, "rewards/JudgeFidelityReward/mean": 0.7588522732257843, "rewards/JudgeFidelityReward/std": 0.1913144662976265, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1380 }, { "clip_ratio/high_max": 0.0010166663210839032, "clip_ratio/high_mean": 0.00015345395368058235, "clip_ratio/low_mean": 0.00010548614081926644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025894008576869964, "completions/clipped_ratio": 0.0, "completions/max_length": 240.33333333333334, "completions/mean_length": 75.16471354166667, "completions/min_length": 34.0, "epoch": 2.7923387096774195, "frac_reward_zero_std": 0.0625, "grad_norm": 2.015625, "kl": 0.004166661202907563, "learning_rate": 4.447750032264565e-07, "loss": 0.0001537183066830039, "reward": 1.6207931439081829, "reward_std": 0.19192004203796387, "rewards/FidelityReward/mean": 0.7442713379859924, "rewards/FidelityReward/std": 0.19922634462515512, "rewards/JudgeFidelityReward/mean": 0.7562987804412842, "rewards/JudgeFidelityReward/std": 0.17245863874753317, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 1385 }, { "clip_ratio/high_max": 0.002408733032643795, "clip_ratio/high_mean": 0.0002923789084888995, "clip_ratio/low_mean": 0.00012468039712985047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004170592990703881, "completions/clipped_ratio": 0.0, "completions/max_length": 297.5, "completions/mean_length": 77.2939453125, "completions/min_length": 32.0, "epoch": 2.8024193548387095, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.171875, "kl": 0.004396708775311708, "learning_rate": 4.4146304276466186e-07, "loss": 0.00020952424965798856, "reward": 1.6283058524131775, "reward_std": 0.18846063315868378, "rewards/FidelityReward/mean": 0.748351514339447, "rewards/FidelityReward/std": 0.20416703075170517, "rewards/JudgeFidelityReward/mean": 0.7638148665428162, "rewards/JudgeFidelityReward/std": 0.19245748966932297, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1390 }, { "clip_ratio/high_max": 0.0009776441380381583, "clip_ratio/high_mean": 0.00013389362138696014, "clip_ratio/low_mean": 8.985217718873172e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022374578984454274, "completions/clipped_ratio": 0.0, "completions/max_length": 195.33333333333334, "completions/mean_length": 76.51041666666667, "completions/min_length": 33.333333333333336, "epoch": 2.8125, "frac_reward_zero_std": 0.109375, "grad_norm": 2.140625, "kl": 0.00422991132363677, "learning_rate": 4.381536843653261e-07, "loss": 0.00020426863338798284, "reward": 1.6007639169692993, "reward_std": 0.19232454895973206, "rewards/FidelityReward/mean": 0.730332612991333, "rewards/FidelityReward/std": 0.21520723899205527, "rewards/JudgeFidelityReward/mean": 0.7441178560256958, "rewards/JudgeFidelityReward/std": 0.20090411603450775, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.05635726824402809, "step": 1395 }, { "clip_ratio/high_max": 0.0015245092567056418, "clip_ratio/high_mean": 0.00020095222862437367, "clip_ratio/low_mean": 9.379621042171493e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029474843759089706, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 75.7900390625, "completions/min_length": 33.5, "epoch": 2.8225806451612905, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.03125, "kl": 0.004444099869579077, "learning_rate": 4.348470751347849e-07, "loss": 0.00014865888515487314, "reward": 1.6279776096343994, "reward_std": 0.1872102990746498, "rewards/FidelityReward/mean": 0.7518139183521271, "rewards/FidelityReward/std": 0.19087261706590652, "rewards/JudgeFidelityReward/mean": 0.7562336325645447, "rewards/JudgeFidelityReward/std": 0.18967923521995544, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1400 }, { "clip_ratio/high_max": 0.0012042084243148565, "clip_ratio/high_mean": 0.00014619407593272627, "clip_ratio/low_mean": 8.182537276297807e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022801944287493826, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 75.080078125, "completions/min_length": 33.333333333333336, "epoch": 2.8326612903225805, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 1.9140625, "kl": 0.004230500292032957, "learning_rate": 4.315433620571683e-07, "loss": 0.00015714481705799698, "reward": 1.6126104195912678, "reward_std": 0.17728114624818167, "rewards/FidelityReward/mean": 0.7395870486895243, "rewards/FidelityReward/std": 0.21887558698654175, "rewards/JudgeFidelityReward/mean": 0.7473487655321757, "rewards/JudgeFidelityReward/std": 0.20946595072746277, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 1405 }, { "clip_ratio/high_max": 0.0021631830371916295, "clip_ratio/high_mean": 0.0003205470275133848, "clip_ratio/low_mean": 7.574795017717406e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003962949966080487, "completions/clipped_ratio": 0.0, "completions/max_length": 233.5, "completions/mean_length": 77.7978515625, "completions/min_length": 31.0, "epoch": 2.842741935483871, "frac_reward_zero_std": 0.0625, "grad_norm": 2.171875, "kl": 0.004241359047591686, "learning_rate": 4.282426919878678e-07, "loss": 0.00026121551636606457, "reward": 1.5987914204597473, "reward_std": 0.2051543965935707, "rewards/FidelityReward/mean": 0.7310240566730499, "rewards/FidelityReward/std": 0.19583036750555038, "rewards/JudgeFidelityReward/mean": 0.73651123046875, "rewards/JudgeFidelityReward/std": 0.18204415589571, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 1410 }, { "clip_ratio/high_max": 0.0009990975726395846, "clip_ratio/high_mean": 0.00014402512460947036, "clip_ratio/low_mean": 8.456496288999915e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002285900874994695, "completions/clipped_ratio": 0.0, "completions/max_length": 199.66666666666666, "completions/mean_length": 74.0703125, "completions/min_length": 35.0, "epoch": 2.852822580645161, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.046875, "kl": 0.004083392443135381, "learning_rate": 4.249452116470082e-07, "loss": 4.2025759466923776e-05, "reward": 1.6026891469955444, "reward_std": 0.1921013444662094, "rewards/FidelityReward/mean": 0.7388755480448405, "rewards/FidelityReward/std": 0.21799173951148987, "rewards/JudgeFidelityReward/mean": 0.7302313049634298, "rewards/JudgeFidelityReward/std": 0.20286767184734344, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 1415 }, { "clip_ratio/high_max": 0.0023216577246785164, "clip_ratio/high_mean": 0.00022361676092259587, "clip_ratio/low_mean": 8.446521824225783e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030808196752332153, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 75.89453125, "completions/min_length": 32.0, "epoch": 2.8629032258064515, "frac_reward_zero_std": 0.078125, "grad_norm": 2.171875, "kl": 0.004561043065041304, "learning_rate": 4.2165106761292604e-07, "loss": 5.151790101081133e-05, "reward": 1.5755852460861206, "reward_std": 0.20125409215688705, "rewards/FidelityReward/mean": 0.7149289548397064, "rewards/FidelityReward/std": 0.2162826731801033, "rewards/JudgeFidelityReward/mean": 0.7242422103881836, "rewards/JudgeFidelityReward/std": 0.18311040103435516, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1420 }, { "clip_ratio/high_max": 0.0007139159599319101, "clip_ratio/high_mean": 0.0001039896160364151, "clip_ratio/low_mean": 0.00014022726099938154, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024421688867732884, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 76.15625, "completions/min_length": 32.333333333333336, "epoch": 2.872983870967742, "frac_reward_zero_std": 0.11458333333333333, "grad_norm": 1.9375, "kl": 0.004371127020567656, "learning_rate": 4.183604063156533e-07, "loss": 0.00019226550357416273, "reward": 1.6105430523554485, "reward_std": 0.1934854338566462, "rewards/FidelityReward/mean": 0.7425312797228495, "rewards/FidelityReward/std": 0.20112736026446024, "rewards/JudgeFidelityReward/mean": 0.741882860660553, "rewards/JudgeFidelityReward/std": 0.19653368989626566, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07443709174791972, "step": 1425 }, { "clip_ratio/high_max": 0.001557924645021558, "clip_ratio/high_mean": 0.00024537218268960714, "clip_ratio/low_mean": 0.00017377220501657575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041914439061656593, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 76.1865234375, "completions/min_length": 32.5, "epoch": 2.883064516129032, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9375, "kl": 0.004409455228596926, "learning_rate": 4.150733740304088e-07, "loss": 7.354437839239836e-05, "reward": 1.636068880558014, "reward_std": 0.18425977230072021, "rewards/FidelityReward/mean": 0.7555773556232452, "rewards/FidelityReward/std": 0.1999068558216095, "rewards/JudgeFidelityReward/mean": 0.7609830796718597, "rewards/JudgeFidelityReward/std": 0.17058084905147552, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0008889487013220787, "clip_ratio/high_mean": 0.00011894649360328912, "clip_ratio/low_mean": 9.72551031736657e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002162015880458057, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 76.74153645833333, "completions/min_length": 32.666666666666664, "epoch": 2.8931451612903225, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.984375, "kl": 0.004031527740880847, "learning_rate": 4.117901168710959e-07, "loss": 0.0001794284675270319, "reward": 1.6277955373128254, "reward_std": 0.18916964530944824, "rewards/FidelityReward/mean": 0.7522672017415365, "rewards/FidelityReward/std": 0.1986440122127533, "rewards/JudgeFidelityReward/mean": 0.7536607980728149, "rewards/JudgeFidelityReward/std": 0.19005919496218363, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 1435 }, { "clip_ratio/high_max": 0.0023626741487532852, "clip_ratio/high_mean": 0.00031852038810029624, "clip_ratio/low_mean": 0.00017348708934150636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004920074832625687, "completions/clipped_ratio": 0.0, "completions/max_length": 240.5, "completions/mean_length": 76.2841796875, "completions/min_length": 34.0, "epoch": 2.903225806451613, "frac_reward_zero_std": 0.046875, "grad_norm": 2.171875, "kl": 0.0041041096672415735, "learning_rate": 4.085107807838072e-07, "loss": 2.858219959307462e-05, "reward": 1.591733455657959, "reward_std": 0.20419111102819443, "rewards/FidelityReward/mean": 0.7306037843227386, "rewards/FidelityReward/std": 0.20924603193998337, "rewards/JudgeFidelityReward/mean": 0.7251890599727631, "rewards/JudgeFidelityReward/std": 0.20415013283491135, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1440 }, { "clip_ratio/high_max": 0.0010577261913567782, "clip_ratio/high_mean": 0.00017069109599106015, "clip_ratio/low_mean": 8.049978932831437e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002511908882297575, "completions/clipped_ratio": 0.0, "completions/max_length": 216.66666666666666, "completions/mean_length": 75.10807291666667, "completions/min_length": 32.333333333333336, "epoch": 2.913306451612903, "frac_reward_zero_std": 0.078125, "grad_norm": 2.03125, "kl": 0.004384382534772158, "learning_rate": 4.0523551154033806e-07, "loss": 0.00013553122989833354, "reward": 1.615025480588277, "reward_std": 0.20216807226339975, "rewards/FidelityReward/mean": 0.7428353230158488, "rewards/FidelityReward/std": 0.2044462263584137, "rewards/JudgeFidelityReward/mean": 0.7489375670750936, "rewards/JudgeFidelityReward/std": 0.1892294685045878, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06709141532580058, "step": 1445 }, { "clip_ratio/high_max": 0.0019890781957656147, "clip_ratio/high_mean": 0.00027976669371128083, "clip_ratio/low_mean": 0.00010907158648478798, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038883826928213237, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/mean_length": 78.1376953125, "completions/min_length": 32.5, "epoch": 2.9233870967741935, "frac_reward_zero_std": 0.046875, "grad_norm": 2.015625, "kl": 0.004440175741910935, "learning_rate": 4.0196445473170543e-07, "loss": 0.00031669405288994314, "reward": 1.5705560445785522, "reward_std": 0.2111339122056961, "rewards/FidelityReward/mean": 0.7182481288909912, "rewards/FidelityReward/std": 0.20206797868013382, "rewards/JudgeFidelityReward/mean": 0.7065688371658325, "rewards/JudgeFidelityReward/std": 0.1967698112130165, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1450 }, { "clip_ratio/high_max": 0.0013604517560452222, "clip_ratio/high_mean": 0.0001754963188432157, "clip_ratio/low_mean": 0.00010376987629570066, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027926620095968245, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 78.30598958333333, "completions/min_length": 33.0, "epoch": 2.933467741935484, "frac_reward_zero_std": 0.046875, "grad_norm": 2.0625, "kl": 0.00422139810398221, "learning_rate": 3.9869775576167664e-07, "loss": 0.00015736145433038474, "reward": 1.670215328534444, "reward_std": 0.18942660093307495, "rewards/FidelityReward/mean": 0.7797142664591471, "rewards/FidelityReward/std": 0.19703767697016397, "rewards/JudgeFidelityReward/mean": 0.7875125010808309, "rewards/JudgeFidelityReward/std": 0.19018070896466574, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07908969124158223, "step": 1455 }, { "clip_ratio/high_max": 0.0015137574402615428, "clip_ratio/high_mean": 0.0002085659245494753, "clip_ratio/low_mean": 9.416332759428769e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030272925505414605, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 74.669921875, "completions/min_length": 32.0, "epoch": 2.943548387096774, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.125, "kl": 0.004120046691969037, "learning_rate": 3.9543555984030654e-07, "loss": 0.00032834005542099477, "reward": 1.5855239033699036, "reward_std": 0.19220462441444397, "rewards/FidelityReward/mean": 0.7221938073635101, "rewards/FidelityReward/std": 0.2225090190768242, "rewards/JudgeFidelityReward/mean": 0.7315429449081421, "rewards/JudgeFidelityReward/std": 0.19946055859327316, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1460 }, { "clip_ratio/high_max": 0.0015086655505001545, "clip_ratio/high_mean": 0.00022983482922427355, "clip_ratio/low_mean": 0.00011794440215453505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003477792255580425, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 77.72526041666667, "completions/min_length": 34.0, "epoch": 2.9536290322580645, "frac_reward_zero_std": 0.0625, "grad_norm": 2.09375, "kl": 0.00402322500012815, "learning_rate": 3.921780119774818e-07, "loss": 0.00014378705527633427, "reward": 1.6377743085225422, "reward_std": 0.18969004352887472, "rewards/FidelityReward/mean": 0.7622198859850565, "rewards/FidelityReward/std": 0.19556927680969238, "rewards/JudgeFidelityReward/mean": 0.7530620098114014, "rewards/JudgeFidelityReward/std": 0.1906207948923111, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1465 }, { "clip_ratio/high_max": 0.0017624173313379287, "clip_ratio/high_mean": 0.00027740063378587363, "clip_ratio/low_mean": 0.0001465487090172246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042394932825118303, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 75.7265625, "completions/min_length": 35.0, "epoch": 2.963709677419355, "frac_reward_zero_std": 0.046875, "grad_norm": 2.3125, "kl": 0.004419880080968141, "learning_rate": 3.8892525697647544e-07, "loss": 0.0001957969041541219, "reward": 1.5831483602523804, "reward_std": 0.1957429200410843, "rewards/FidelityReward/mean": 0.7227934300899506, "rewards/FidelityReward/std": 0.2084469497203827, "rewards/JudgeFidelityReward/mean": 0.7236396968364716, "rewards/JudgeFidelityReward/std": 0.20071960240602493, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1470 }, { "clip_ratio/high_max": 0.0014427250251173973, "clip_ratio/high_mean": 0.00019001842010766268, "clip_ratio/low_mean": 6.743887497577817e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025745729217305777, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/mean_length": 76.93098958333333, "completions/min_length": 34.666666666666664, "epoch": 2.973790322580645, "frac_reward_zero_std": 0.0625, "grad_norm": 1.8515625, "kl": 0.00425788126885891, "learning_rate": 3.8567743942751015e-07, "loss": 0.00027771012391895054, "reward": 1.6109934250513713, "reward_std": 0.19516581296920776, "rewards/FidelityReward/mean": 0.742837111155192, "rewards/FidelityReward/std": 0.20835164686044058, "rewards/JudgeFidelityReward/mean": 0.7408698598543803, "rewards/JudgeFidelityReward/std": 0.2059200257062912, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06566246723135312, "step": 1475 }, { "clip_ratio/high_max": 0.0019337456207722426, "clip_ratio/high_mean": 0.000326740532182157, "clip_ratio/low_mean": 0.00017680992605164647, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005035504582338035, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/mean_length": 74.75, "completions/min_length": 33.5, "epoch": 2.9838709677419355, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.0625, "kl": 0.004160072933882475, "learning_rate": 3.8243470370133076e-07, "loss": 5.6398636661469934e-05, "reward": 1.666632056236267, "reward_std": 0.16455382108688354, "rewards/FidelityReward/mean": 0.7796891033649445, "rewards/FidelityReward/std": 0.18097800016403198, "rewards/JudgeFidelityReward/mean": 0.7797451615333557, "rewards/JudgeFidelityReward/std": 0.1595267876982689, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07131390832364559, "step": 1480 }, { "clip_ratio/high_max": 0.0007523973239585757, "clip_ratio/high_mean": 9.079325245693326e-05, "clip_ratio/low_mean": 8.045319118537009e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017124644364230335, "completions/clipped_ratio": 0.0, "completions/max_length": 222.33333333333334, "completions/mean_length": 75.62174479166667, "completions/min_length": 33.333333333333336, "epoch": 2.993951612903226, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 1.8828125, "kl": 0.004209164064377546, "learning_rate": 3.7919719394278683e-07, "loss": 0.0001941432827152312, "reward": 1.6467208464940388, "reward_std": 0.17648620903491974, "rewards/FidelityReward/mean": 0.7606552839279175, "rewards/FidelityReward/std": 0.2092359960079193, "rewards/JudgeFidelityReward/mean": 0.7734331488609314, "rewards/JudgeFidelityReward/std": 0.18004567921161652, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 1485 }, { "clip_ratio/high_max": 0.001298942556604743, "clip_ratio/high_mean": 0.0001917942485306412, "clip_ratio/low_mean": 8.925195143092424e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002810462028719485, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 78.0126953125, "completions/min_length": 32.0, "epoch": 3.004032258064516, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.0625, "kl": 0.004426185041666031, "learning_rate": 3.759650540644252e-07, "loss": 0.00011856229975819588, "reward": 1.5977199077606201, "reward_std": 0.20023313164710999, "rewards/FidelityReward/mean": 0.7334166169166565, "rewards/FidelityReward/std": 0.20614196360111237, "rewards/JudgeFidelityReward/mean": 0.7344659864902496, "rewards/JudgeFidelityReward/std": 0.19503919780254364, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 1490 }, { "clip_ratio/high_max": 0.0014918695203959941, "clip_ratio/high_mean": 0.00023441410157829524, "clip_ratio/low_mean": 8.368127164430917e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003180953906849027, "completions/clipped_ratio": 0.0, "completions/max_length": 197.66666666666666, "completions/mean_length": 76.39453125, "completions/min_length": 34.0, "epoch": 3.0141129032258065, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.9921875, "kl": 0.0040013177786022425, "learning_rate": 3.7273842774009324e-07, "loss": 0.0002220073714852333, "reward": 1.6295413573582966, "reward_std": 0.18376330037911734, "rewards/FidelityReward/mean": 0.7530357042948405, "rewards/FidelityReward/std": 0.21461229026317596, "rewards/JudgeFidelityReward/mean": 0.7536624073982239, "rewards/JudgeFidelityReward/std": 0.2127431333065033, "rewards/SelfEvolvingFormatReward/mean": 0.9993489583333334, "rewards/SelfEvolvingFormatReward/std": 0.014731391022602717, "step": 1495 }, { "clip_ratio/high_max": 0.0018739799037575723, "clip_ratio/high_mean": 0.00032418540795333686, "clip_ratio/low_mean": 8.337496692547574e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040756036760285496, "completions/clipped_ratio": 0.0, "completions/max_length": 235.5, "completions/mean_length": 75.904296875, "completions/min_length": 34.0, "epoch": 3.024193548387097, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.125, "kl": 0.004236454889178276, "learning_rate": 3.6951745839855127e-07, "loss": 0.0002892578020691872, "reward": 1.6491729617118835, "reward_std": 0.18977582454681396, "rewards/FidelityReward/mean": 0.7649595439434052, "rewards/FidelityReward/std": 0.1966009959578514, "rewards/JudgeFidelityReward/mean": 0.7723330855369568, "rewards/JudgeFidelityReward/std": 0.18629784137010574, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1500 }, { "clip_ratio/high_max": 0.0007823411375284195, "clip_ratio/high_mean": 0.00015659242635592818, "clip_ratio/low_mean": 6.75550807500258e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022414750419557093, "completions/clipped_ratio": 0.0, "completions/max_length": 217.66666666666666, "completions/mean_length": 75.7109375, "completions/min_length": 32.333333333333336, "epoch": 3.034274193548387, "frac_reward_zero_std": 0.15104166666666666, "grad_norm": 2.015625, "kl": 0.00436042882502079, "learning_rate": 3.663022892170979e-07, "loss": 0.00015791619662195445, "reward": 1.6399012009302776, "reward_std": 0.17341832319895426, "rewards/FidelityReward/mean": 0.7567956844965616, "rewards/FidelityReward/std": 0.20888936519622803, "rewards/JudgeFidelityReward/mean": 0.770768404006958, "rewards/JudgeFidelityReward/std": 0.19224735101064047, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06709141532580058, "step": 1505 }, { "clip_ratio/high_max": 0.0016980370506644248, "clip_ratio/high_mean": 0.00020796385942958295, "clip_ratio/low_mean": 0.0001315189787419513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003394828410819173, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 76.1943359375, "completions/min_length": 34.0, "epoch": 3.0443548387096775, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9921875, "kl": 0.004279325995594263, "learning_rate": 3.630930631152055e-07, "loss": 0.00019451957195997237, "reward": 1.628726303577423, "reward_std": 0.19550412893295288, "rewards/FidelityReward/mean": 0.7483364641666412, "rewards/FidelityReward/std": 0.20886539667844772, "rewards/JudgeFidelityReward/mean": 0.7637094855308533, "rewards/JudgeFidelityReward/std": 0.18344982713460922, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1510 }, { "clip_ratio/high_max": 0.001574007049202919, "clip_ratio/high_mean": 0.0001774646923877299, "clip_ratio/low_mean": 5.585728795267642e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002333219861611724, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 76.05403645833333, "completions/min_length": 32.0, "epoch": 3.0544354838709675, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.171875, "kl": 0.004265718068927527, "learning_rate": 3.598899227481662e-07, "loss": 9.448554483242333e-05, "reward": 1.5968124866485596, "reward_std": 0.19929592311382294, "rewards/FidelityReward/mean": 0.728130022684733, "rewards/FidelityReward/std": 0.22277435660362244, "rewards/JudgeFidelityReward/mean": 0.74192214012146, "rewards/JudgeFidelityReward/std": 0.20282082259655, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06709141532580058, "step": 1515 }, { "clip_ratio/high_max": 0.001179597619920969, "clip_ratio/high_mean": 0.00016367369680665434, "clip_ratio/low_mean": 0.0001626364653930068, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032631016219966116, "completions/clipped_ratio": 0.0, "completions/max_length": 192.5, "completions/mean_length": 75.0009765625, "completions/min_length": 34.0, "epoch": 3.064516129032258, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9453125, "kl": 0.004336004611104727, "learning_rate": 3.566930105007524e-07, "loss": 0.00022069367114454507, "reward": 1.6092702746391296, "reward_std": 0.19049712270498276, "rewards/FidelityReward/mean": 0.7417061626911163, "rewards/FidelityReward/std": 0.21347946673631668, "rewards/JudgeFidelityReward/mean": 0.7390344738960266, "rewards/JudgeFidelityReward/std": 0.20985966175794601, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1520 }, { "clip_ratio/high_max": 0.001219204836525023, "clip_ratio/high_mean": 9.330154280178249e-05, "clip_ratio/low_mean": 8.123049919959158e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017453203909099103, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 75.97135416666667, "completions/min_length": 33.333333333333336, "epoch": 3.0745967741935485, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.125, "kl": 0.004333311505615711, "learning_rate": 3.5350246848088574e-07, "loss": 0.00022216523066163064, "reward": 1.6126288572947185, "reward_std": 0.18569870789845785, "rewards/FidelityReward/mean": 0.7427670359611511, "rewards/FidelityReward/std": 0.20219666759173074, "rewards/JudgeFidelityReward/mean": 0.7429788708686829, "rewards/JudgeFidelityReward/std": 0.19428075850009918, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1525 }, { "clip_ratio/high_max": 0.001931694522500038, "clip_ratio/high_mean": 0.00018328242877032607, "clip_ratio/low_mean": 0.00012416348326951265, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030744591495022176, "completions/clipped_ratio": 0.0, "completions/max_length": 186.5, "completions/mean_length": 76.4833984375, "completions/min_length": 36.5, "epoch": 3.0846774193548385, "frac_reward_zero_std": 0.046875, "grad_norm": 2.265625, "kl": 0.0042150754947215315, "learning_rate": 3.50318438513321e-07, "loss": 0.00014544003643095492, "reward": 1.6485867500305176, "reward_std": 0.19273124635219574, "rewards/FidelityReward/mean": 0.7660370171070099, "rewards/FidelityReward/std": 0.19227220118045807, "rewards/JudgeFidelityReward/mean": 0.7670525908470154, "rewards/JudgeFidelityReward/std": 0.18701698631048203, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1530 }, { "clip_ratio/high_max": 0.0015041522216051816, "clip_ratio/high_mean": 0.000191460270434618, "clip_ratio/low_mean": 8.554333471693099e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027700362261384726, "completions/clipped_ratio": 0.0, "completions/max_length": 223.33333333333334, "completions/mean_length": 74.01041666666667, "completions/min_length": 31.666666666666668, "epoch": 3.094758064516129, "frac_reward_zero_std": 0.109375, "grad_norm": 1.859375, "kl": 0.004185901070013642, "learning_rate": 3.47141062133342e-07, "loss": 0.00019949667621403931, "reward": 1.6133345762888591, "reward_std": 0.17784423132737479, "rewards/FidelityReward/mean": 0.7379241387049357, "rewards/FidelityReward/std": 0.20812426010767618, "rewards/JudgeFidelityReward/mean": 0.7521229783693949, "rewards/JudgeFidelityReward/std": 0.18015374739964804, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 1535 }, { "clip_ratio/high_max": 0.0013446829281747342, "clip_ratio/high_mean": 0.00027592703117989, "clip_ratio/low_mean": 7.929912098916247e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035522616235539317, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 76.169921875, "completions/min_length": 36.0, "epoch": 3.1048387096774195, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.203125, "kl": 0.0043395147658884525, "learning_rate": 3.4397048058046964e-07, "loss": 0.00012779562966898085, "reward": 1.6821246147155762, "reward_std": 0.18234026432037354, "rewards/FidelityReward/mean": 0.7879771590232849, "rewards/FidelityReward/std": 0.19485438615083694, "rewards/JudgeFidelityReward/mean": 0.7912246584892273, "rewards/JudgeFidelityReward/std": 0.19003579765558243, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1540 }, { "clip_ratio/high_max": 0.0013990132603794337, "clip_ratio/high_mean": 0.00016592239844612778, "clip_ratio/low_mean": 3.763105341931805e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020355345332063736, "completions/clipped_ratio": 0.0, "completions/max_length": 209.66666666666666, "completions/mean_length": 75.64778645833333, "completions/min_length": 34.333333333333336, "epoch": 3.1149193548387095, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.8828125, "kl": 0.004425340890884399, "learning_rate": 3.4080683479218375e-07, "loss": 0.0002764678094536066, "reward": 1.628022829691569, "reward_std": 0.18493367234865823, "rewards/FidelityReward/mean": 0.7477365136146545, "rewards/FidelityReward/std": 0.2098990778128306, "rewards/JudgeFidelityReward/mean": 0.7631767392158508, "rewards/JudgeFidelityReward/std": 0.19139990210533142, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 1545 }, { "clip_ratio/high_max": 0.0019961575977504253, "clip_ratio/high_mean": 0.00015815101796761154, "clip_ratio/low_mean": 9.6337721333839e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002544887247495353, "completions/clipped_ratio": 0.0, "completions/max_length": 204.5, "completions/mean_length": 76.64453125, "completions/min_length": 32.5, "epoch": 3.125, "frac_reward_zero_std": 0.09375, "grad_norm": 2.203125, "kl": 0.004474383872002363, "learning_rate": 3.3765026539765827e-07, "loss": 0.00023862062953412534, "reward": 1.6379239559173584, "reward_std": 0.18683135509490967, "rewards/FidelityReward/mean": 0.7617000341415405, "rewards/FidelityReward/std": 0.19611185789108276, "rewards/JudgeFidelityReward/mean": 0.753424346446991, "rewards/JudgeFidelityReward/std": 0.17841088771820068, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 1550 }, { "clip_ratio/high_max": 0.0012130966410040856, "clip_ratio/high_mean": 0.00013117121416144072, "clip_ratio/low_mean": 8.252083498518913e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000213692057877779, "completions/clipped_ratio": 0.0, "completions/max_length": 248.33333333333334, "completions/mean_length": 76.859375, "completions/min_length": 32.666666666666664, "epoch": 3.1350806451612905, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.9609375, "kl": 0.004343625530600548, "learning_rate": 3.3450091271151e-07, "loss": 0.00018631115090101958, "reward": 1.6303926706314087, "reward_std": 0.18392571806907654, "rewards/FidelityReward/mean": 0.7525760730107626, "rewards/FidelityReward/std": 0.201567014058431, "rewards/JudgeFidelityReward/mean": 0.7614925702412924, "rewards/JudgeFidelityReward/std": 0.18686832984288534, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07145174965262413, "step": 1555 }, { "clip_ratio/high_max": 0.0022773534525185823, "clip_ratio/high_mean": 0.0003147793293464929, "clip_ratio/low_mean": 9.900932782329619e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041378867463208734, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/mean_length": 78.70703125, "completions/min_length": 36.5, "epoch": 3.1451612903225805, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.09375, "kl": 0.004250586964190006, "learning_rate": 3.3135891672756134e-07, "loss": 0.00019454482244327665, "reward": 1.6048424243927002, "reward_std": 0.21026304364204407, "rewards/FidelityReward/mean": 0.734002560377121, "rewards/FidelityReward/std": 0.2019316479563713, "rewards/JudgeFidelityReward/mean": 0.7455857396125793, "rewards/JudgeFidelityReward/std": 0.19920966774225235, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1560 }, { "clip_ratio/high_max": 0.0014292635954916478, "clip_ratio/high_mean": 0.0001871075772214681, "clip_ratio/low_mean": 3.188387127011083e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021899144630879163, "completions/clipped_ratio": 0.0, "completions/max_length": 214.66666666666666, "completions/mean_length": 75.046875, "completions/min_length": 32.666666666666664, "epoch": 3.155241935483871, "frac_reward_zero_std": 0.10416666666666667, "grad_norm": 2.015625, "kl": 0.004263715911656618, "learning_rate": 3.2822441711261726e-07, "loss": 0.000146392616443336, "reward": 1.6420251528422039, "reward_std": 0.17079607645670572, "rewards/FidelityReward/mean": 0.7626644968986511, "rewards/FidelityReward/std": 0.2058107703924179, "rewards/JudgeFidelityReward/mean": 0.7600234150886536, "rewards/JudgeFidelityReward/std": 0.20115481813748678, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 1565 }, { "clip_ratio/high_max": 0.00205672481097281, "clip_ratio/high_mean": 0.0003637509129475802, "clip_ratio/low_mean": 0.00011846130073536187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004822122282348573, "completions/clipped_ratio": 0.0, "completions/max_length": 235.5, "completions/mean_length": 75.6142578125, "completions/min_length": 33.5, "epoch": 3.1653225806451615, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.0625, "kl": 0.004238884802907706, "learning_rate": 3.2509755320025726e-07, "loss": 0.00014214343391358853, "reward": 1.5961594581604004, "reward_std": 0.18958303332328796, "rewards/FidelityReward/mean": 0.7286071181297302, "rewards/FidelityReward/std": 0.21664442867040634, "rewards/JudgeFidelityReward/mean": 0.7380341291427612, "rewards/JudgeFidelityReward/std": 0.20040351897478104, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1570 }, { "clip_ratio/high_max": 0.00096029257401824, "clip_ratio/high_mean": 0.00017219198634847998, "clip_ratio/low_mean": 7.171522156568244e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000243907212279737, "completions/clipped_ratio": 0.0, "completions/max_length": 226.66666666666666, "completions/mean_length": 76.44401041666667, "completions/min_length": 33.0, "epoch": 3.1754032258064515, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.03125, "kl": 0.004089136328548193, "learning_rate": 3.2197846398464135e-07, "loss": 0.00017384052043780684, "reward": 1.6077603896458943, "reward_std": 0.19878468910853067, "rewards/FidelityReward/mean": 0.739410916964213, "rewards/FidelityReward/std": 0.20805586377779642, "rewards/JudgeFidelityReward/mean": 0.7393030524253845, "rewards/JudgeFidelityReward/std": 0.20481193562348685, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1575 }, { "clip_ratio/high_max": 0.0017112591303884983, "clip_ratio/high_mean": 0.00017746902594808489, "clip_ratio/low_mean": 0.0001169399736681953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029440900252666327, "completions/clipped_ratio": 0.0, "completions/max_length": 216.5, "completions/mean_length": 76.1396484375, "completions/min_length": 33.0, "epoch": 3.185483870967742, "frac_reward_zero_std": 0.03125, "grad_norm": 2.015625, "kl": 0.004529956541955471, "learning_rate": 3.188672881143316e-07, "loss": 0.00013215593062341214, "reward": 1.6087847352027893, "reward_std": 0.19427748769521713, "rewards/FidelityReward/mean": 0.7376785576343536, "rewards/FidelityReward/std": 0.22060082107782364, "rewards/JudgeFidelityReward/mean": 0.7461185157299042, "rewards/JudgeFidelityReward/std": 0.20894449949264526, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1580 }, { "clip_ratio/high_max": 0.001055122073739767, "clip_ratio/high_mean": 0.00011184447794221342, "clip_ratio/low_mean": 7.663771102670581e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018848219187930226, "completions/clipped_ratio": 0.0, "completions/max_length": 210.33333333333334, "completions/mean_length": 75.61197916666667, "completions/min_length": 35.333333333333336, "epoch": 3.195564516129032, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.328125, "kl": 0.004245494259521365, "learning_rate": 3.157641638861291e-07, "loss": 0.00012880199356004595, "reward": 1.600952386856079, "reward_std": 0.20430408914883932, "rewards/FidelityReward/mean": 0.7289716800053915, "rewards/FidelityReward/std": 0.21024020512898764, "rewards/JudgeFidelityReward/mean": 0.7472166021664938, "rewards/JudgeFidelityReward/std": 0.19116639097531637, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.05635726824402809, "step": 1585 }, { "clip_ratio/high_max": 0.001960785873234272, "clip_ratio/high_mean": 0.00019922759674955158, "clip_ratio/low_mean": 0.00015963056648615748, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035885816323570906, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 74.69921875, "completions/min_length": 33.5, "epoch": 3.2056451612903225, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.0625, "kl": 0.004478113166987896, "learning_rate": 3.126692292389267e-07, "loss": 0.00012069141957908869, "reward": 1.6207221746444702, "reward_std": 0.19062390178442, "rewards/FidelityReward/mean": 0.7436001896858215, "rewards/FidelityReward/std": 0.2047773376107216, "rewards/JudgeFidelityReward/mean": 0.7571735978126526, "rewards/JudgeFidelityReward/std": 0.18570302426815033, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1590 }, { "clip_ratio/high_max": 0.0011619298718869686, "clip_ratio/high_mean": 0.00012058255961164833, "clip_ratio/low_mean": 8.420927115366794e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020479182712733744, "completions/clipped_ratio": 0.0, "completions/max_length": 210.66666666666666, "completions/mean_length": 74.66796875, "completions/min_length": 34.333333333333336, "epoch": 3.215725806451613, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.78125, "kl": 0.004178423713892698, "learning_rate": 3.09582621747577e-07, "loss": 0.00012923968024551867, "reward": 1.6417287588119507, "reward_std": 0.18166777988274893, "rewards/FidelityReward/mean": 0.7578606208165487, "rewards/FidelityReward/std": 0.19466146330038706, "rewards/JudgeFidelityReward/mean": 0.7696893413861593, "rewards/JudgeFidelityReward/std": 0.1723487824201584, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1595 }, { "clip_ratio/high_max": 0.0014466462191194296, "clip_ratio/high_mean": 0.00021375114447437228, "clip_ratio/low_mean": 7.694579908275045e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029069693991914394, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 77.9814453125, "completions/min_length": 33.5, "epoch": 3.225806451612903, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.8984375, "kl": 0.004577278532087803, "learning_rate": 3.0650447861677677e-07, "loss": 0.00019974592141807078, "reward": 1.6518290042877197, "reward_std": 0.19490108639001846, "rewards/FidelityReward/mean": 0.7628182470798492, "rewards/FidelityReward/std": 0.2015577107667923, "rewards/JudgeFidelityReward/mean": 0.7809511423110962, "rewards/JudgeFidelityReward/std": 0.18717221170663834, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1600 }, { "clip_ratio/high_max": 0.0018277979921549558, "clip_ratio/high_mean": 0.00019710835185833276, "clip_ratio/low_mean": 5.762981454608962e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002547381562180817, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 75.32161458333333, "completions/min_length": 34.666666666666664, "epoch": 3.2358870967741935, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.1875, "kl": 0.004287421377375722, "learning_rate": 3.0343493667496855e-07, "loss": 0.00024009812623262405, "reward": 1.6373159488042195, "reward_std": 0.18764595190684, "rewards/FidelityReward/mean": 0.7581291198730469, "rewards/FidelityReward/std": 0.19310696423053741, "rewards/JudgeFidelityReward/mean": 0.7609778642654419, "rewards/JudgeFidelityReward/std": 0.184994767109553, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1605 }, { "clip_ratio/high_max": 0.0019677660427987577, "clip_ratio/high_mean": 0.00027879694825969634, "clip_ratio/low_mean": 0.00014512433554045857, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042392128380015494, "completions/clipped_ratio": 0.0, "completions/max_length": 218.5, "completions/mean_length": 75.779296875, "completions/min_length": 37.0, "epoch": 3.245967741935484, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.171875, "kl": 0.004467209242284298, "learning_rate": 3.0037413236825794e-07, "loss": 0.00013487886171787978, "reward": 1.6115704774856567, "reward_std": 0.1838598996400833, "rewards/FidelityReward/mean": 0.7377817630767822, "rewards/FidelityReward/std": 0.20955479890108109, "rewards/JudgeFidelityReward/mean": 0.7534367442131042, "rewards/JudgeFidelityReward/std": 0.18400969356298447, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 1610 }, { "clip_ratio/high_max": 0.0011528315022587775, "clip_ratio/high_mean": 0.00019275767845101656, "clip_ratio/low_mean": 0.00010095496254507453, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002937126439064741, "completions/clipped_ratio": 0.0, "completions/max_length": 218.33333333333334, "completions/mean_length": 76.74609375, "completions/min_length": 31.0, "epoch": 3.256048387096774, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.109375, "kl": 0.004181062243878842, "learning_rate": 2.9732220175434876e-07, "loss": 7.73220497649163e-05, "reward": 1.5904010931650798, "reward_std": 0.20353381832440695, "rewards/FidelityReward/mean": 0.7239082455635071, "rewards/FidelityReward/std": 0.20361902316411337, "rewards/JudgeFidelityReward/mean": 0.736240804195404, "rewards/JudgeFidelityReward/std": 0.18051075438658395, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 1615 }, { "clip_ratio/high_max": 0.001858157035894692, "clip_ratio/high_mean": 0.00031854957051109524, "clip_ratio/low_mean": 0.00011944508587475866, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043799466220662, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/mean_length": 78.6796875, "completions/min_length": 33.5, "epoch": 3.2661290322580645, "frac_reward_zero_std": 0.0625, "grad_norm": 1.8828125, "kl": 0.0043750136159360405, "learning_rate": 2.9427928049649455e-07, "loss": 6.760215619578957e-05, "reward": 1.6445029973983765, "reward_std": 0.18590588867664337, "rewards/FidelityReward/mean": 0.7665173709392548, "rewards/FidelityReward/std": 0.2046482115983963, "rewards/JudgeFidelityReward/mean": 0.759877473115921, "rewards/JudgeFidelityReward/std": 0.21322287619113922, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 1620 }, { "clip_ratio/high_max": 0.0010417265351861716, "clip_ratio/high_mean": 0.00019434018176980316, "clip_ratio/low_mean": 6.596472230739892e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002603049040772021, "completions/clipped_ratio": 0.0, "completions/max_length": 228.66666666666666, "completions/mean_length": 75.298828125, "completions/min_length": 34.0, "epoch": 3.276209677419355, "frac_reward_zero_std": 0.109375, "grad_norm": 2.046875, "kl": 0.004302978701889515, "learning_rate": 2.9124550385746856e-07, "loss": 0.00014792578294873238, "reward": 1.6662368377049763, "reward_std": 0.16659738620122275, "rewards/FidelityReward/mean": 0.7751143376032511, "rewards/FidelityReward/std": 0.19258879125118256, "rewards/JudgeFidelityReward/mean": 0.7861513495445251, "rewards/JudgeFidelityReward/std": 0.18144709865252176, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 1625 }, { "clip_ratio/high_max": 0.0027560121845453976, "clip_ratio/high_mean": 0.00029981564730405805, "clip_ratio/low_mean": 0.00014050128957023844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00044031693832948806, "completions/clipped_ratio": 0.0, "completions/max_length": 228.5, "completions/mean_length": 75.328125, "completions/min_length": 31.5, "epoch": 3.286290322580645, "frac_reward_zero_std": 0.0859375, "grad_norm": 1.8203125, "kl": 0.0044592578895390035, "learning_rate": 2.8822100669355076e-07, "loss": 0.0001723800553008914, "reward": 1.6339633464813232, "reward_std": 0.19186969101428986, "rewards/FidelityReward/mean": 0.7535508573055267, "rewards/FidelityReward/std": 0.2017533853650093, "rewards/JudgeFidelityReward/mean": 0.7627781927585602, "rewards/JudgeFidelityReward/std": 0.19488805532455444, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1630 }, { "clip_ratio/high_max": 0.00105079454369843, "clip_ratio/high_mean": 0.00014658267609775067, "clip_ratio/low_mean": 0.00013023698702454566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002768196631222963, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/mean_length": 75.58138020833333, "completions/min_length": 34.333333333333336, "epoch": 3.2963709677419355, "frac_reward_zero_std": 0.109375, "grad_norm": 2.09375, "kl": 0.00439912797883153, "learning_rate": 2.852059234485338e-07, "loss": 0.00021376083604991437, "reward": 1.6539161602656047, "reward_std": 0.17693625390529633, "rewards/FidelityReward/mean": 0.7705186406771342, "rewards/FidelityReward/std": 0.20284331341584524, "rewards/JudgeFidelityReward/mean": 0.7739565769831339, "rewards/JudgeFidelityReward/std": 0.19582747916380563, "rewards/SelfEvolvingFormatReward/mean": 0.9928385416666666, "rewards/SelfEvolvingFormatReward/std": 0.08218589673439662, "step": 1635 }, { "clip_ratio/high_max": 0.0016225232742726804, "clip_ratio/high_mean": 0.00027402827981859446, "clip_ratio/low_mean": 0.0001606630365131423, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004346913192421198, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 75.3603515625, "completions/min_length": 32.5, "epoch": 3.306451612903226, "frac_reward_zero_std": 0.0625, "grad_norm": 2.0625, "kl": 0.004517137072980404, "learning_rate": 2.8220038814774627e-07, "loss": 0.00019868630915880202, "reward": 1.6068612933158875, "reward_std": 0.19074411690235138, "rewards/FidelityReward/mean": 0.7372455596923828, "rewards/FidelityReward/std": 0.1984105333685875, "rewards/JudgeFidelityReward/mean": 0.7421612441539764, "rewards/JudgeFidelityReward/std": 0.18196148425340652, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1640 }, { "clip_ratio/high_max": 0.0014173376373946666, "clip_ratio/high_mean": 0.00019376830314286054, "clip_ratio/low_mean": 5.625425110338256e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002500225440599024, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/mean_length": 73.35286458333333, "completions/min_length": 32.0, "epoch": 3.316532258064516, "frac_reward_zero_std": 0.0625, "grad_norm": 2.15625, "kl": 0.004395316448062658, "learning_rate": 2.7920453439209467e-07, "loss": 0.00018649878911674022, "reward": 1.6368151903152466, "reward_std": 0.18190082907676697, "rewards/FidelityReward/mean": 0.7572845816612244, "rewards/FidelityReward/std": 0.2099634607632955, "rewards/JudgeFidelityReward/mean": 0.7623163263003031, "rewards/JudgeFidelityReward/std": 0.19855531056722006, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 1645 }, { "clip_ratio/high_max": 0.002431406918913126, "clip_ratio/high_mean": 0.00024959935108199717, "clip_ratio/low_mean": 0.0002077353245113045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045733466977253555, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 75.16015625, "completions/min_length": 31.0, "epoch": 3.3266129032258065, "frac_reward_zero_std": 0.0859375, "grad_norm": 1.828125, "kl": 0.004385416023433208, "learning_rate": 2.7621849535212596e-07, "loss": 0.00012708087451756, "reward": 1.6205180883407593, "reward_std": 0.18255525827407837, "rewards/FidelityReward/mean": 0.7451381683349609, "rewards/FidelityReward/std": 0.20610077679157257, "rewards/JudgeFidelityReward/mean": 0.7527129948139191, "rewards/JudgeFidelityReward/std": 0.19158388674259186, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1650 }, { "clip_ratio/high_max": 0.0013130569364875554, "clip_ratio/high_mean": 0.00010401583276689052, "clip_ratio/low_mean": 3.886734921252355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001428831834346056, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 75.73567708333333, "completions/min_length": 31.0, "epoch": 3.336693548387097, "frac_reward_zero_std": 0.0625, "grad_norm": 2.09375, "kl": 0.004303735680878162, "learning_rate": 2.732424037621066e-07, "loss": 0.0002494971267879009, "reward": 1.6236932277679443, "reward_std": 0.18891324599583945, "rewards/FidelityReward/mean": 0.7465738654136658, "rewards/FidelityReward/std": 0.20112731556097665, "rewards/JudgeFidelityReward/mean": 0.7568429311116537, "rewards/JudgeFidelityReward/std": 0.1846804072459539, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1655 }, { "clip_ratio/high_max": 0.0013695718487724661, "clip_ratio/high_mean": 0.00027695661410689356, "clip_ratio/low_mean": 0.00012368151801638304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004006381263025105, "completions/clipped_ratio": 0.0, "completions/max_length": 220.5, "completions/mean_length": 75.44140625, "completions/min_length": 31.0, "epoch": 3.346774193548387, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.078125, "kl": 0.0042082306928932665, "learning_rate": 2.7027639191412287e-07, "loss": 0.00013480883790180088, "reward": 1.6206936836242676, "reward_std": 0.1817742958664894, "rewards/FidelityReward/mean": 0.7433209121227264, "rewards/FidelityReward/std": 0.20781230181455612, "rewards/JudgeFidelityReward/mean": 0.7547456324100494, "rewards/JudgeFidelityReward/std": 0.1965627521276474, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0015010701958090068, "clip_ratio/high_mean": 0.00023570857010781764, "clip_ratio/low_mean": 7.388153171632439e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000309590098913759, "completions/clipped_ratio": 0.0, "completions/max_length": 225.66666666666666, "completions/mean_length": 74.61263020833333, "completions/min_length": 35.0, "epoch": 3.3568548387096775, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.0625, "kl": 0.004325261060148478, "learning_rate": 2.6732059165220067e-07, "loss": 0.0001491859322413802, "reward": 1.6100316842397053, "reward_std": 0.18826828400293985, "rewards/FidelityReward/mean": 0.742109477519989, "rewards/FidelityReward/std": 0.2081685115893682, "rewards/JudgeFidelityReward/mean": 0.7371465365091959, "rewards/JudgeFidelityReward/std": 0.19680552184581757, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 1665 }, { "clip_ratio/high_max": 0.0018450891133397818, "clip_ratio/high_mean": 0.00019466981757432222, "clip_ratio/low_mean": 0.00020167900365777313, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039634882705286145, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 75.70703125, "completions/min_length": 34.0, "epoch": 3.366935483870968, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.234375, "kl": 0.004322745883837342, "learning_rate": 2.643751343664434e-07, "loss": 0.00010536875342950224, "reward": 1.5877740979194641, "reward_std": 0.21157416701316833, "rewards/FidelityReward/mean": 0.7280126512050629, "rewards/FidelityReward/std": 0.2108348160982132, "rewards/JudgeFidelityReward/mean": 0.7214760482311249, "rewards/JudgeFidelityReward/std": 0.2035394236445427, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1670 }, { "clip_ratio/high_max": 0.0011524899397045373, "clip_ratio/high_mean": 0.0002106460218783468, "clip_ratio/low_mean": 7.557818025816232e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002862241817638278, "completions/clipped_ratio": 0.0, "completions/max_length": 239.33333333333334, "completions/mean_length": 77.21158854166667, "completions/min_length": 32.0, "epoch": 3.377016129032258, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.9375, "kl": 0.004424586798995734, "learning_rate": 2.614401509871934e-07, "loss": 2.398713259026408e-05, "reward": 1.5935180981953938, "reward_std": 0.1993399461110433, "rewards/FidelityReward/mean": 0.7233438094456991, "rewards/FidelityReward/std": 0.20839925607045492, "rewards/JudgeFidelityReward/mean": 0.742952843507131, "rewards/JudgeFidelityReward/std": 0.18529805541038513, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 1675 }, { "clip_ratio/high_max": 0.0014836878515779972, "clip_ratio/high_mean": 0.00018657337059266866, "clip_ratio/low_mean": 0.0001661585323745385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035273191169835625, "completions/clipped_ratio": 0.0, "completions/max_length": 246.5, "completions/mean_length": 74.1455078125, "completions/min_length": 33.5, "epoch": 3.3870967741935485, "frac_reward_zero_std": 0.09375, "grad_norm": 2.140625, "kl": 0.004237819509580731, "learning_rate": 2.585157719792106e-07, "loss": 0.00021611799020320176, "reward": 1.5788553953170776, "reward_std": 0.1867520660161972, "rewards/FidelityReward/mean": 0.7168698608875275, "rewards/FidelityReward/std": 0.20971930027008057, "rewards/JudgeFidelityReward/mean": 0.725924164056778, "rewards/JudgeFidelityReward/std": 0.1894823983311653, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1680 }, { "clip_ratio/high_max": 0.001269905222579837, "clip_ratio/high_mean": 0.0001456436119042337, "clip_ratio/low_mean": 0.00015982600743882358, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003054696135222912, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 73.705078125, "completions/min_length": 33.0, "epoch": 3.3971774193548385, "frac_reward_zero_std": 0.0625, "grad_norm": 1.9453125, "kl": 0.004158356599509716, "learning_rate": 2.5560212733587305e-07, "loss": 0.00010760007426142692, "reward": 1.624279499053955, "reward_std": 0.18932045996189117, "rewards/FidelityReward/mean": 0.7523521979649862, "rewards/FidelityReward/std": 0.1973483512798945, "rewards/JudgeFidelityReward/mean": 0.7484118938446045, "rewards/JudgeFidelityReward/std": 0.19056138396263123, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.062273996571699776, "step": 1685 }, { "clip_ratio/high_max": 0.001623747986741364, "clip_ratio/high_mean": 0.00023779019247740507, "clip_ratio/low_mean": 0.00010052899015136063, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033831915352493523, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/mean_length": 75.51171875, "completions/min_length": 35.0, "epoch": 3.407258064516129, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.21875, "kl": 0.00430443212389946, "learning_rate": 2.526993465733997e-07, "loss": 0.00017608770867809652, "reward": 1.619810938835144, "reward_std": 0.20056027173995972, "rewards/FidelityReward/mean": 0.7473324835300446, "rewards/FidelityReward/std": 0.20922142267227173, "rewards/JudgeFidelityReward/mean": 0.7478867173194885, "rewards/JudgeFidelityReward/std": 0.1964324712753296, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1690 }, { "clip_ratio/high_max": 0.00142195587977767, "clip_ratio/high_mean": 0.0002659254823811352, "clip_ratio/low_mean": 8.800995419733226e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003539354307577014, "completions/clipped_ratio": 0.0, "completions/max_length": 204.66666666666666, "completions/mean_length": 76.52734375, "completions/min_length": 35.0, "epoch": 3.4173387096774195, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 1.8984375, "kl": 0.004499034117907285, "learning_rate": 2.498075587250916e-07, "loss": 0.0003530177287757397, "reward": 1.6578350067138672, "reward_std": 0.18054323891798654, "rewards/FidelityReward/mean": 0.7704098224639893, "rewards/FidelityReward/std": 0.19416402777036032, "rewards/JudgeFidelityReward/mean": 0.7781055172284445, "rewards/JudgeFidelityReward/std": 0.18653231859207153, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 1695 }, { "clip_ratio/high_max": 0.0017352533992379903, "clip_ratio/high_mean": 0.000265800126362592, "clip_ratio/low_mean": 0.0001043199299601838, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037012004759162664, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/mean_length": 76.4287109375, "completions/min_length": 31.0, "epoch": 3.4274193548387095, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.078125, "kl": 0.004123319545760751, "learning_rate": 2.469268923355976e-07, "loss": 0.00013799864100292326, "reward": 1.5928502082824707, "reward_std": 0.19573695957660675, "rewards/FidelityReward/mean": 0.7330319285392761, "rewards/FidelityReward/std": 0.2191758081316948, "rewards/JudgeFidelityReward/mean": 0.7225663661956787, "rewards/JudgeFidelityReward/std": 0.20853198319673538, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1700 }, { "clip_ratio/high_max": 0.0015507961856201292, "clip_ratio/high_mean": 0.00013626740837935358, "clip_ratio/low_mean": 8.901380351744592e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022528121480718254, "completions/clipped_ratio": 0.0, "completions/max_length": 212.66666666666666, "completions/mean_length": 78.21419270833333, "completions/min_length": 34.666666666666664, "epoch": 3.4375, "frac_reward_zero_std": 0.03125, "grad_norm": 2.171875, "kl": 0.004598684329539538, "learning_rate": 2.440574754551996e-07, "loss": 0.0002939883153885603, "reward": 1.6283719539642334, "reward_std": 0.20002581675847372, "rewards/FidelityReward/mean": 0.7522661685943604, "rewards/FidelityReward/std": 0.20281071960926056, "rewards/JudgeFidelityReward/mean": 0.7561178207397461, "rewards/JudgeFidelityReward/std": 0.18983430663744608, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.05018910765647888, "step": 1705 }, { "clip_ratio/high_max": 0.0019044183660298585, "clip_ratio/high_mean": 0.0002540824352763593, "clip_ratio/low_mean": 9.312851761933416e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003472109441645443, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/mean_length": 75.048828125, "completions/min_length": 33.5, "epoch": 3.4475806451612905, "frac_reward_zero_std": 0.09375, "grad_norm": 2.203125, "kl": 0.004488987475633621, "learning_rate": 2.411994356341203e-07, "loss": 0.00010825518984347582, "reward": 1.6003448367118835, "reward_std": 0.1844038888812065, "rewards/FidelityReward/mean": 0.7349657714366913, "rewards/FidelityReward/std": 0.20743804425001144, "rewards/JudgeFidelityReward/mean": 0.7346643805503845, "rewards/JudgeFidelityReward/std": 0.19700051099061966, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.04406425356864929, "step": 1710 }, { "clip_ratio/high_max": 0.001005462114699185, "clip_ratio/high_mean": 0.00020642628660425544, "clip_ratio/low_mean": 5.731935234507546e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002637456404045224, "completions/clipped_ratio": 0.0, "completions/max_length": 204.33333333333334, "completions/mean_length": 77.96940104166667, "completions/min_length": 33.0, "epoch": 3.4576612903225805, "frac_reward_zero_std": 0.0625, "grad_norm": 2.046875, "kl": 0.004366794694215059, "learning_rate": 2.3835289991685392e-07, "loss": 0.00011183111928403378, "reward": 1.6216646830240886, "reward_std": 0.20091334482034048, "rewards/FidelityReward/mean": 0.7488120198249817, "rewards/FidelityReward/std": 0.203736479083697, "rewards/JudgeFidelityReward/mean": 0.7502626975377401, "rewards/JudgeFidelityReward/std": 0.2045555760463079, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1715 }, { "clip_ratio/high_max": 0.0020157165359705686, "clip_ratio/high_mean": 0.00026573336217552426, "clip_ratio/low_mean": 0.0001160778250778094, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003818111843429506, "completions/clipped_ratio": 0.0, "completions/max_length": 187.5, "completions/mean_length": 74.435546875, "completions/min_length": 32.0, "epoch": 3.467741935483871, "frac_reward_zero_std": 0.109375, "grad_norm": 1.9765625, "kl": 0.004357933811843395, "learning_rate": 2.355179948365189e-07, "loss": 0.00028818876016885043, "reward": 1.618487298488617, "reward_std": 0.17925868928432465, "rewards/FidelityReward/mean": 0.7499220371246338, "rewards/FidelityReward/std": 0.2016630619764328, "rewards/JudgeFidelityReward/mean": 0.7420134544372559, "rewards/JudgeFidelityReward/std": 0.19360052794218063, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.049216821789741516, "step": 1720 }, { "clip_ratio/high_max": 0.0012087753508239984, "clip_ratio/high_mean": 0.0001635477179661393, "clip_ratio/low_mean": 7.949612918309868e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002430438529700041, "completions/clipped_ratio": 0.0, "completions/max_length": 232.66666666666666, "completions/mean_length": 75.11588541666667, "completions/min_length": 34.666666666666664, "epoch": 3.4778225806451615, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.046875, "kl": 0.00432376591488719, "learning_rate": 2.3269484640923248e-07, "loss": 0.0002777317306026816, "reward": 1.615090290705363, "reward_std": 0.18517249325911203, "rewards/FidelityReward/mean": 0.7421637773513794, "rewards/FidelityReward/std": 0.22065257529417673, "rewards/JudgeFidelityReward/mean": 0.7484572529792786, "rewards/JudgeFidelityReward/std": 0.2071930468082428, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1725 }, { "clip_ratio/high_max": 0.001845161197707057, "clip_ratio/high_mean": 0.00028127896948717537, "clip_ratio/low_mean": 0.00010815025743795558, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038942922838032247, "completions/clipped_ratio": 0.0, "completions/max_length": 214.5, "completions/mean_length": 76.8291015625, "completions/min_length": 31.5, "epoch": 3.4879032258064515, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.109375, "kl": 0.004473027214407921, "learning_rate": 2.2988358012851018e-07, "loss": 0.00029805055819451807, "reward": 1.5858429074287415, "reward_std": 0.19926616549491882, "rewards/FidelityReward/mean": 0.7213194072246552, "rewards/FidelityReward/std": 0.23833709955215454, "rewards/JudgeFidelityReward/mean": 0.733929842710495, "rewards/JudgeFidelityReward/std": 0.21804144978523254, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1730 }, { "clip_ratio/high_max": 0.0012485322542488575, "clip_ratio/high_mean": 0.0001535094663267955, "clip_ratio/low_mean": 0.00011544995941221714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026895940536633136, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 74.58333333333333, "completions/min_length": 30.666666666666668, "epoch": 3.497983870967742, "frac_reward_zero_std": 0.026041666666666668, "grad_norm": 2.109375, "kl": 0.00449347598478198, "learning_rate": 2.2708432095968655e-07, "loss": 0.0002382716163992882, "reward": 1.5987191200256348, "reward_std": 0.19895153741041818, "rewards/FidelityReward/mean": 0.730006217956543, "rewards/FidelityReward/std": 0.22441714505354562, "rewards/JudgeFidelityReward/mean": 0.7419830163319906, "rewards/JudgeFidelityReward/std": 0.2049167255560557, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1735 }, { "clip_ratio/high_max": 0.0014203283237293363, "clip_ratio/high_mean": 0.00020947320736013352, "clip_ratio/low_mean": 8.298859611386434e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029246179619804027, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 75.8779296875, "completions/min_length": 32.5, "epoch": 3.508064516129032, "frac_reward_zero_std": 0.078125, "grad_norm": 2.1875, "kl": 0.004218348767608404, "learning_rate": 2.242971933343608e-07, "loss": 0.00025222785770893095, "reward": 1.5602970123291016, "reward_std": 0.19871685653924942, "rewards/FidelityReward/mean": 0.7013913094997406, "rewards/FidelityReward/std": 0.22038023173809052, "rewards/JudgeFidelityReward/mean": 0.7197644412517548, "rewards/JudgeFidelityReward/std": 0.19888363033533096, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1740 }, { "clip_ratio/high_max": 0.0015230964869260787, "clip_ratio/high_mean": 0.00014372128061950206, "clip_ratio/low_mean": 5.5612801224924625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019933407311327754, "completions/clipped_ratio": 0.0, "completions/max_length": 257.3333333333333, "completions/mean_length": 75.94075520833333, "completions/min_length": 32.0, "epoch": 3.5181451612903225, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.21875, "kl": 0.00420582895167172, "learning_rate": 2.2152232114486552e-07, "loss": 0.0001901285140775144, "reward": 1.5913751125335693, "reward_std": 0.18817159036795297, "rewards/FidelityReward/mean": 0.7325080037117004, "rewards/FidelityReward/std": 0.20423188308874765, "rewards/JudgeFidelityReward/mean": 0.7203384041786194, "rewards/JudgeFidelityReward/std": 0.19729610780874887, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 1745 }, { "clip_ratio/high_max": 0.0019916052697226405, "clip_ratio/high_mean": 0.00026204305177088825, "clip_ratio/low_mean": 5.80999709200114e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032014303142204883, "completions/clipped_ratio": 0.0, "completions/max_length": 231.5, "completions/mean_length": 75.8125, "completions/min_length": 35.5, "epoch": 3.528225806451613, "frac_reward_zero_std": 0.0625, "grad_norm": 2.078125, "kl": 0.004228975810110569, "learning_rate": 2.1875982773875867e-07, "loss": 0.0003047794569283724, "reward": 1.6303719878196716, "reward_std": 0.1880563274025917, "rewards/FidelityReward/mean": 0.7522112131118774, "rewards/FidelityReward/std": 0.19606521725654602, "rewards/JudgeFidelityReward/mean": 0.7592513263225555, "rewards/JudgeFidelityReward/std": 0.1923079416155815, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1750 }, { "clip_ratio/high_max": 0.0009107953635975718, "clip_ratio/high_mean": 0.00013082597870379687, "clip_ratio/low_mean": 7.465548987966031e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020548144821077585, "completions/clipped_ratio": 0.0, "completions/max_length": 210.33333333333334, "completions/mean_length": 77.59114583333333, "completions/min_length": 35.333333333333336, "epoch": 3.538306451612903, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.9140625, "kl": 0.004380506929010153, "learning_rate": 2.1600983591334205e-07, "loss": 3.5942331305705014e-05, "reward": 1.621775229771932, "reward_std": 0.19098072250684103, "rewards/FidelityReward/mean": 0.7476260463396708, "rewards/FidelityReward/std": 0.1982661783695221, "rewards/JudgeFidelityReward/mean": 0.751553475856781, "rewards/JudgeFidelityReward/std": 0.1840204894542694, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 1755 }, { "clip_ratio/high_max": 0.001867937482893467, "clip_ratio/high_mean": 0.0002404340251814574, "clip_ratio/low_mean": 0.00010007524688262492, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003405092807952315, "completions/clipped_ratio": 0.0, "completions/max_length": 200.5, "completions/mean_length": 74.259765625, "completions/min_length": 34.0, "epoch": 3.5483870967741935, "frac_reward_zero_std": 0.078125, "grad_norm": 2.359375, "kl": 0.004581876192241907, "learning_rate": 2.132724679102017e-07, "loss": 0.00024314075708389282, "reward": 1.6218845844268799, "reward_std": 0.19135338813066483, "rewards/FidelityReward/mean": 0.7521811127662659, "rewards/FidelityReward/std": 0.19167187064886093, "rewards/JudgeFidelityReward/mean": 0.7442898154258728, "rewards/JudgeFidelityReward/std": 0.18463406711816788, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1760 }, { "clip_ratio/high_max": 0.0015366867184638977, "clip_ratio/high_mean": 0.00023915048222988844, "clip_ratio/low_mean": 2.9100434039719404e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002682509133592248, "completions/clipped_ratio": 0.0, "completions/max_length": 215.66666666666666, "completions/mean_length": 76.69401041666667, "completions/min_length": 30.666666666666668, "epoch": 3.558467741935484, "frac_reward_zero_std": 0.0625, "grad_norm": 2.140625, "kl": 0.004188844561576843, "learning_rate": 2.1054784540977406e-07, "loss": 0.00029171430505812166, "reward": 1.6342916091283162, "reward_std": 0.19232699275016785, "rewards/FidelityReward/mean": 0.7529008984565735, "rewards/FidelityReward/std": 0.19443650543689728, "rewards/JudgeFidelityReward/mean": 0.7627814213434855, "rewards/JudgeFidelityReward/std": 0.18864269057909647, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0021715770941227674, "clip_ratio/high_mean": 0.0004134702612645924, "clip_ratio/low_mean": 0.00018860165728256105, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006020719418302178, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 304.0, "completions/mean_length": 76.6044921875, "completions/min_length": 35.0, "epoch": 3.568548387096774, "frac_reward_zero_std": 0.078125, "grad_norm": 1.8671875, "kl": 0.004177176440134644, "learning_rate": 2.0783608952593767e-07, "loss": 0.00017518806271255016, "reward": 1.642457664012909, "reward_std": 0.18299990892410278, "rewards/FidelityReward/mean": 0.7607671022415161, "rewards/FidelityReward/std": 0.1940205991268158, "rewards/JudgeFidelityReward/mean": 0.7653343081474304, "rewards/JudgeFidelityReward/std": 0.19939829409122467, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1770 }, { "clip_ratio/high_max": 0.0014363814610987903, "clip_ratio/high_mean": 0.00021426668390631676, "clip_ratio/low_mean": 8.012341859284789e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029439011123031376, "completions/clipped_ratio": 0.0, "completions/max_length": 240.33333333333334, "completions/mean_length": 76.181640625, "completions/min_length": 33.666666666666664, "epoch": 3.5786290322580645, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.0625, "kl": 0.0041788465343415735, "learning_rate": 2.051373208006293e-07, "loss": 7.46599747799337e-05, "reward": 1.6082641283671062, "reward_std": 0.1813842554887136, "rewards/FidelityReward/mean": 0.7350605726242065, "rewards/FidelityReward/std": 0.22636019190152487, "rewards/JudgeFidelityReward/mean": 0.7470580538113912, "rewards/JudgeFidelityReward/std": 0.21543153127034506, "rewards/SelfEvolvingFormatReward/mean": 0.9993489583333334, "rewards/SelfEvolvingFormatReward/std": 0.014731391022602717, "step": 1775 }, { "clip_ratio/high_max": 0.0021750487852841617, "clip_ratio/high_mean": 0.0002757384325377643, "clip_ratio/low_mean": 0.00010288733028573915, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003786257584579289, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/mean_length": 76.5009765625, "completions/min_length": 34.5, "epoch": 3.588709677419355, "frac_reward_zero_std": 0.1171875, "grad_norm": 1.9375, "kl": 0.0045615918934345245, "learning_rate": 2.0245165919848483e-07, "loss": 0.0002855564933270216, "reward": 1.648118257522583, "reward_std": 0.1745108813047409, "rewards/FidelityReward/mean": 0.7579253613948822, "rewards/FidelityReward/std": 0.22756510972976685, "rewards/JudgeFidelityReward/mean": 0.781362384557724, "rewards/JudgeFidelityReward/std": 0.2088276892900467, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 1780 }, { "clip_ratio/high_max": 0.0010372532531619072, "clip_ratio/high_mean": 0.00012075970298610627, "clip_ratio/low_mean": 3.424573369557038e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015500544104725124, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 76.07161458333333, "completions/min_length": 33.333333333333336, "epoch": 3.598790322580645, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.96875, "kl": 0.004194236546754837, "learning_rate": 1.99779224101508e-07, "loss": 0.0002512401202693582, "reward": 1.6315494378407795, "reward_std": 0.19023104012012482, "rewards/FidelityReward/mean": 0.7554205656051636, "rewards/FidelityReward/std": 0.19774279495080313, "rewards/JudgeFidelityReward/mean": 0.7574660778045654, "rewards/JudgeFidelityReward/std": 0.19208411375681558, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06957309817274411, "step": 1785 }, { "clip_ratio/high_max": 0.0018058576388284564, "clip_ratio/high_mean": 0.00019079541380051523, "clip_ratio/low_mean": 0.00019499166519381105, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003857870819047093, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 76.1923828125, "completions/min_length": 33.5, "epoch": 3.6088709677419355, "frac_reward_zero_std": 0.140625, "grad_norm": 1.8984375, "kl": 0.004266247525811196, "learning_rate": 1.971201343037629e-07, "loss": 9.927644859999418e-05, "reward": 1.666133463382721, "reward_std": 0.1631164401769638, "rewards/FidelityReward/mean": 0.775227427482605, "rewards/FidelityReward/std": 0.18511182814836502, "rewards/JudgeFidelityReward/mean": 0.7837652564048767, "rewards/JudgeFidelityReward/std": 0.1778659075498581, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1790 }, { "clip_ratio/high_max": 0.0012931741774082183, "clip_ratio/high_mean": 0.00017289096722379328, "clip_ratio/low_mean": 5.861173267476261e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023150270571932198, "completions/clipped_ratio": 0.0, "completions/max_length": 236.33333333333334, "completions/mean_length": 76.947265625, "completions/min_length": 35.333333333333336, "epoch": 3.618951612903226, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.8046875, "kl": 0.004416386224329471, "learning_rate": 1.9447450800609293e-07, "loss": 0.0003063303418457508, "reward": 1.6521144310633342, "reward_std": 0.1765389492114385, "rewards/FidelityReward/mean": 0.7698237498601278, "rewards/FidelityReward/std": 0.19195957481861115, "rewards/JudgeFidelityReward/mean": 0.7678364316622416, "rewards/JudgeFidelityReward/std": 0.1780558874209722, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 1795 }, { "clip_ratio/high_max": 0.0017437102273106576, "clip_ratio/high_mean": 0.0002857462619431317, "clip_ratio/low_mean": 0.00018276132177561522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004685075720772147, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 76.310546875, "completions/min_length": 36.0, "epoch": 3.629032258064516, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.140625, "kl": 0.004239644668996334, "learning_rate": 1.9184246281086775e-07, "loss": 3.078985319007188e-05, "reward": 1.625545620918274, "reward_std": 0.19599320739507675, "rewards/FidelityReward/mean": 0.7495591938495636, "rewards/FidelityReward/std": 0.20692332088947296, "rewards/JudgeFidelityReward/mean": 0.753926008939743, "rewards/JudgeFidelityReward/std": 0.19118688255548477, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1800 }, { "clip_ratio/high_max": 0.0010726734763011336, "clip_ratio/high_mean": 0.00017781041096895934, "clip_ratio/low_mean": 5.932437197770923e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023713479749858378, "completions/clipped_ratio": 0.0, "completions/max_length": 197.66666666666666, "completions/mean_length": 74.35807291666667, "completions/min_length": 32.333333333333336, "epoch": 3.6391129032258065, "frac_reward_zero_std": 0.078125, "grad_norm": 1.8671875, "kl": 0.004386835545301437, "learning_rate": 1.8922411571675477e-07, "loss": 0.00020596254616975785, "reward": 1.6099517345428467, "reward_std": 0.1915816217660904, "rewards/FidelityReward/mean": 0.7389478087425232, "rewards/FidelityReward/std": 0.20866640905539194, "rewards/JudgeFidelityReward/mean": 0.746565043926239, "rewards/JudgeFidelityReward/std": 0.18967323998610178, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1805 }, { "clip_ratio/high_max": 0.0021480091381818056, "clip_ratio/high_mean": 0.00036610697861760855, "clip_ratio/low_mean": 7.195371435955166e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043806071626022457, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/mean_length": 75.83203125, "completions/min_length": 35.0, "epoch": 3.649193548387097, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.109375, "kl": 0.004297675378620624, "learning_rate": 1.8661958311351878e-07, "loss": 0.00019604042172431945, "reward": 1.615489661693573, "reward_std": 0.20033146440982819, "rewards/FidelityReward/mean": 0.7420907914638519, "rewards/FidelityReward/std": 0.19971774518489838, "rewards/JudgeFidelityReward/mean": 0.7497274577617645, "rewards/JudgeFidelityReward/std": 0.18981438130140305, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 1810 }, { "clip_ratio/high_max": 0.0011315891984850167, "clip_ratio/high_mean": 0.00019717701943591237, "clip_ratio/low_mean": 8.279780449811369e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027997481520287694, "completions/clipped_ratio": 0.0, "completions/max_length": 246.66666666666666, "completions/mean_length": 75.86328125, "completions/min_length": 32.333333333333336, "epoch": 3.659274193548387, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.296875, "kl": 0.004251774214208126, "learning_rate": 1.8402898077684803e-07, "loss": 0.00011128999758511782, "reward": 1.6397676467895508, "reward_std": 0.19482406973838806, "rewards/FidelityReward/mean": 0.7587078015009562, "rewards/FidelityReward/std": 0.20092821617921194, "rewards/JudgeFidelityReward/mean": 0.7673279444376627, "rewards/JudgeFidelityReward/std": 0.1889602392911911, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.05875233809153239, "step": 1815 }, { "clip_ratio/high_max": 0.0017148155719041824, "clip_ratio/high_mean": 0.0003082375624217093, "clip_ratio/low_mean": 0.0001501901657320559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004584277281537652, "completions/clipped_ratio": 0.0, "completions/max_length": 185.5, "completions/mean_length": 75.2978515625, "completions/min_length": 34.5, "epoch": 3.6693548387096775, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.8359375, "kl": 0.0040993423666805025, "learning_rate": 1.8145242386320757e-07, "loss": 9.825103916227818e-05, "reward": 1.5923945903778076, "reward_std": 0.1969689279794693, "rewards/FidelityReward/mean": 0.7263618409633636, "rewards/FidelityReward/std": 0.21288063377141953, "rewards/JudgeFidelityReward/mean": 0.734995037317276, "rewards/JudgeFidelityReward/std": 0.19946322590112686, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1820 }, { "clip_ratio/high_max": 0.0007792860968038439, "clip_ratio/high_mean": 0.00015536799910478293, "clip_ratio/low_mean": 7.080626673996448e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002261742716655135, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/mean_length": 75.51236979166667, "completions/min_length": 33.666666666666664, "epoch": 3.679435483870968, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.078125, "kl": 0.004312163311988115, "learning_rate": 1.7889002690472131e-07, "loss": 0.00019304066663607954, "reward": 1.575095772743225, "reward_std": 0.1950673907995224, "rewards/FidelityReward/mean": 0.7116447885831197, "rewards/FidelityReward/std": 0.21088301142056784, "rewards/JudgeFidelityReward/mean": 0.7288551727930704, "rewards/JudgeFidelityReward/std": 0.18876202901204428, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 1825 }, { "clip_ratio/high_max": 0.0016077805776149035, "clip_ratio/high_mean": 0.0002600799547508359, "clip_ratio/low_mean": 0.00012431112118065356, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003843910759314895, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 76.0908203125, "completions/min_length": 34.0, "epoch": 3.689516129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 2.09375, "kl": 0.004569537378847599, "learning_rate": 1.7634190380407959e-07, "loss": 0.00020865020342171193, "reward": 1.5724060535430908, "reward_std": 0.21729116141796112, "rewards/FidelityReward/mean": 0.7123843133449554, "rewards/FidelityReward/std": 0.2162477821111679, "rewards/JudgeFidelityReward/mean": 0.7249264419078827, "rewards/JudgeFidelityReward/std": 0.19747485220432281, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1830 }, { "clip_ratio/high_max": 0.0011846718844026327, "clip_ratio/high_mean": 0.00018793381750583648, "clip_ratio/low_mean": 4.7259921848308296e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023519374663010239, "completions/clipped_ratio": 0.0, "completions/max_length": 228.33333333333334, "completions/mean_length": 78.08984375, "completions/min_length": 33.0, "epoch": 3.6995967741935485, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.078125, "kl": 0.004494318272918463, "learning_rate": 1.738081678294771e-07, "loss": 0.00014166049659252166, "reward": 1.6326347986857097, "reward_std": 0.20328459640343985, "rewards/FidelityReward/mean": 0.7528204520543417, "rewards/FidelityReward/std": 0.20937449236710867, "rewards/JudgeFidelityReward/mean": 0.764185905456543, "rewards/JudgeFidelityReward/std": 0.1990953584512075, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 1835 }, { "clip_ratio/high_max": 0.002060055220499635, "clip_ratio/high_mean": 0.00028697601519525053, "clip_ratio/low_mean": 0.00014600888243876398, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004329848918132484, "completions/clipped_ratio": 0.0, "completions/max_length": 206.5, "completions/mean_length": 75.8974609375, "completions/min_length": 33.5, "epoch": 3.709677419354839, "frac_reward_zero_std": 0.03125, "grad_norm": 1.9921875, "kl": 0.004832510650157928, "learning_rate": 1.7128893160957754e-07, "loss": 0.00034802164882421496, "reward": 1.6075437664985657, "reward_std": 0.19864822179079056, "rewards/FidelityReward/mean": 0.7355911433696747, "rewards/FidelityReward/std": 0.2159252092242241, "rewards/JudgeFidelityReward/mean": 0.7468349039554596, "rewards/JudgeFidelityReward/std": 0.20146744698286057, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 1840 }, { "clip_ratio/high_max": 0.0011224642861634494, "clip_ratio/high_mean": 0.00016925969393923878, "clip_ratio/low_mean": 9.937597569660283e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002686356659978628, "completions/clipped_ratio": 0.0, "completions/max_length": 255.66666666666666, "completions/mean_length": 73.91080729166667, "completions/min_length": 33.333333333333336, "epoch": 3.719758064516129, "frac_reward_zero_std": 0.09895833333333333, "grad_norm": 1.9296875, "kl": 0.004434998147189617, "learning_rate": 1.687843071285065e-07, "loss": 0.0001536100637167692, "reward": 1.6231749852498372, "reward_std": 0.18086242179075876, "rewards/FidelityReward/mean": 0.7481780846913656, "rewards/FidelityReward/std": 0.19649379948774973, "rewards/JudgeFidelityReward/mean": 0.7532490491867065, "rewards/JudgeFidelityReward/std": 0.18189981083075205, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 1845 }, { "clip_ratio/high_max": 0.0014330406207591294, "clip_ratio/high_mean": 0.00019476651796139776, "clip_ratio/low_mean": 0.00011703372874762862, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031180024379864337, "completions/clipped_ratio": 0.0, "completions/max_length": 211.5, "completions/mean_length": 76.1328125, "completions/min_length": 34.0, "epoch": 3.7298387096774195, "frac_reward_zero_std": 0.078125, "grad_norm": 1.921875, "kl": 0.004522269032895565, "learning_rate": 1.6629440572087483e-07, "loss": 0.0003158600069582462, "reward": 1.676751732826233, "reward_std": 0.17677125334739685, "rewards/FidelityReward/mean": 0.7821020483970642, "rewards/FidelityReward/std": 0.1862448900938034, "rewards/JudgeFidelityReward/mean": 0.7912525534629822, "rewards/JudgeFidelityReward/std": 0.16746549308300018, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1850 }, { "clip_ratio/high_max": 0.0010856815613806247, "clip_ratio/high_mean": 0.000156941544264555, "clip_ratio/low_mean": 2.785668184515089e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001847982290200889, "completions/clipped_ratio": 0.0, "completions/max_length": 254.66666666666666, "completions/mean_length": 75.8515625, "completions/min_length": 34.0, "epoch": 3.7399193548387095, "frac_reward_zero_std": 0.0625, "grad_norm": 2.359375, "kl": 0.004270680621266365, "learning_rate": 1.6381933806682868e-07, "loss": 0.00020738160237669944, "reward": 1.630092740058899, "reward_std": 0.19056224326292673, "rewards/FidelityReward/mean": 0.7591679890950521, "rewards/FidelityReward/std": 0.19939864675203958, "rewards/JudgeFidelityReward/mean": 0.7464066743850708, "rewards/JudgeFidelityReward/std": 0.20213781793912253, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.050638811041911445, "step": 1855 }, { "clip_ratio/high_max": 0.0015239916741847993, "clip_ratio/high_mean": 0.00023028752766549588, "clip_ratio/low_mean": 0.00011087239108746872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003411599202081561, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/mean_length": 78.3046875, "completions/min_length": 34.5, "epoch": 3.75, "frac_reward_zero_std": 0.0234375, "grad_norm": 2.046875, "kl": 0.0044085832312703134, "learning_rate": 1.6135921418712955e-07, "loss": 0.00030226283706724644, "reward": 1.6592049598693848, "reward_std": 0.2013247236609459, "rewards/FidelityReward/mean": 0.7713325321674347, "rewards/FidelityReward/std": 0.19375848770141602, "rewards/JudgeFidelityReward/mean": 0.7806276381015778, "rewards/JudgeFidelityReward/std": 0.18814583867788315, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 1860 }, { "clip_ratio/high_max": 0.0011995569802820683, "clip_ratio/high_mean": 0.00015165129443630576, "clip_ratio/low_mean": 7.221028208732605e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002238615823443979, "completions/clipped_ratio": 0.0, "completions/max_length": 217.33333333333334, "completions/mean_length": 78.541015625, "completions/min_length": 34.333333333333336, "epoch": 3.7600806451612905, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.109375, "kl": 0.004336325451731682, "learning_rate": 1.5891414343826438e-07, "loss": 0.0001326317898929119, "reward": 1.6181641419728596, "reward_std": 0.19349213937918344, "rewards/FidelityReward/mean": 0.743055522441864, "rewards/FidelityReward/std": 0.21379546324412027, "rewards/JudgeFidelityReward/mean": 0.7541234890619913, "rewards/JudgeFidelityReward/std": 0.19823691248893738, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1865 }, { "clip_ratio/high_max": 0.0020475121680647137, "clip_ratio/high_mean": 0.00027459922712296246, "clip_ratio/low_mean": 0.00012143502826802432, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039603428449481727, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/mean_length": 77.650390625, "completions/min_length": 36.5, "epoch": 3.7701612903225805, "frac_reward_zero_std": 0.109375, "grad_norm": 1.90625, "kl": 0.004525269288569689, "learning_rate": 1.5648423450758403e-07, "loss": -3.816925163846463e-06, "reward": 1.614884078502655, "reward_std": 0.18777543306350708, "rewards/FidelityReward/mean": 0.7473197877407074, "rewards/FidelityReward/std": 0.20537929236888885, "rewards/JudgeFidelityReward/mean": 0.7409880459308624, "rewards/JudgeFidelityReward/std": 0.20361118018627167, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07639661431312561, "step": 1870 }, { "clip_ratio/high_max": 0.0013720347546041013, "clip_ratio/high_mean": 0.00024158803280442952, "clip_ratio/low_mean": 6.25738175585866e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003041618503630161, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 75.52213541666667, "completions/min_length": 34.333333333333336, "epoch": 3.780241935483871, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.03125, "kl": 0.004398058727383613, "learning_rate": 1.5406959540847186e-07, "loss": 0.0001973236445337534, "reward": 1.6355584065119426, "reward_std": 0.1864496966203054, "rewards/FidelityReward/mean": 0.7583272854487101, "rewards/FidelityReward/std": 0.20326581100622812, "rewards/JudgeFidelityReward/mean": 0.755764385064443, "rewards/JudgeFidelityReward/std": 0.1971494903167089, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 1875 }, { "clip_ratio/high_max": 0.0011103845201432705, "clip_ratio/high_mean": 0.0001334963337285444, "clip_ratio/low_mean": 0.00016045819502323865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002939545258414, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 76.390625, "completions/min_length": 34.0, "epoch": 3.790322580645161, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.9375, "kl": 0.004303332325071097, "learning_rate": 1.5167033347554282e-07, "loss": 0.0002213543513789773, "reward": 1.6131742596626282, "reward_std": 0.198679618537426, "rewards/FidelityReward/mean": 0.7440221607685089, "rewards/FidelityReward/std": 0.20080439746379852, "rewards/JudgeFidelityReward/mean": 0.7402573227882385, "rewards/JudgeFidelityReward/std": 0.18647734075784683, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1880 }, { "clip_ratio/high_max": 0.0009071146370843053, "clip_ratio/high_mean": 0.0001427698851330206, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001427698851330206, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/mean_length": 76.58723958333333, "completions/min_length": 34.666666666666664, "epoch": 3.8004032258064515, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.84375, "kl": 0.004461365099996328, "learning_rate": 1.4928655535987132e-07, "loss": 0.0003079236252233386, "reward": 1.6194534301757812, "reward_std": 0.19803112745285034, "rewards/FidelityReward/mean": 0.7502098282178243, "rewards/FidelityReward/std": 0.20890629788239798, "rewards/JudgeFidelityReward/mean": 0.7449975411097208, "rewards/JudgeFidelityReward/std": 0.2007472962141037, "rewards/SelfEvolvingFormatReward/mean": 0.9934895833333334, "rewards/SelfEvolvingFormatReward/std": 0.07908969124158223, "step": 1885 }, { "clip_ratio/high_max": 0.0014328116783872246, "clip_ratio/high_mean": 0.00021306067355908453, "clip_ratio/low_mean": 0.00012614592851605265, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033920659916475415, "completions/clipped_ratio": 0.0, "completions/max_length": 295.5, "completions/mean_length": 76.517578125, "completions/min_length": 34.5, "epoch": 3.810483870967742, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.90625, "kl": 0.004528817068785429, "learning_rate": 1.4691836702425174e-07, "loss": 0.00013846829533576966, "reward": 1.5959939360618591, "reward_std": 0.20441993325948715, "rewards/FidelityReward/mean": 0.732142448425293, "rewards/FidelityReward/std": 0.2236955389380455, "rewards/JudgeFidelityReward/mean": 0.7345390319824219, "rewards/JudgeFidelityReward/std": 0.21636144071817398, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.08043622970581055, "step": 1890 }, { "clip_ratio/high_max": 0.0011152725666761399, "clip_ratio/high_mean": 0.00013680708361789583, "clip_ratio/low_mean": 8.790720603428781e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022471429547294975, "completions/clipped_ratio": 0.0, "completions/max_length": 215.66666666666666, "completions/mean_length": 74.892578125, "completions/min_length": 31.666666666666668, "epoch": 3.820564516129032, "frac_reward_zero_std": 0.0625, "grad_norm": 2.125, "kl": 0.004110699612647295, "learning_rate": 1.4456587373848728e-07, "loss": 0.0002563773188740015, "reward": 1.590746561686198, "reward_std": 0.1983626385529836, "rewards/FidelityReward/mean": 0.7251677711804708, "rewards/FidelityReward/std": 0.20977951089541116, "rewards/JudgeFidelityReward/mean": 0.7331106265385946, "rewards/JudgeFidelityReward/std": 0.19950776298840842, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.025465538104375202, "step": 1895 }, { "clip_ratio/high_max": 0.0017140648793429135, "clip_ratio/high_mean": 0.00020939057576470078, "clip_ratio/low_mean": 9.58181481109932e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003052087384276092, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/mean_length": 75.345703125, "completions/min_length": 31.0, "epoch": 3.8306451612903225, "frac_reward_zero_std": 0.109375, "grad_norm": 2.03125, "kl": 0.004448689147830009, "learning_rate": 1.4222918007471036e-07, "loss": 0.0001337785040959716, "reward": 1.603415310382843, "reward_std": 0.17151565849781036, "rewards/FidelityReward/mean": 0.7362803220748901, "rewards/FidelityReward/std": 0.2400817573070526, "rewards/JudgeFidelityReward/mean": 0.7381761968135834, "rewards/JudgeFidelityReward/std": 0.22430124133825302, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 1900 }, { "clip_ratio/high_max": 0.001881246268749237, "clip_ratio/high_mean": 0.0002529114950448275, "clip_ratio/low_mean": 3.895199479302392e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002918634912930429, "completions/clipped_ratio": 0.0, "completions/max_length": 209.33333333333334, "completions/mean_length": 76.13671875, "completions/min_length": 33.666666666666664, "epoch": 3.840725806451613, "frac_reward_zero_std": 0.09375, "grad_norm": 1.9609375, "kl": 0.004466261528432369, "learning_rate": 1.3990838990273523e-07, "loss": 0.00020027244463562965, "reward": 1.5927563508351643, "reward_std": 0.1925533562898636, "rewards/FidelityReward/mean": 0.7265434861183167, "rewards/FidelityReward/std": 0.2102272311846415, "rewards/JudgeFidelityReward/mean": 0.735680898030599, "rewards/JudgeFidelityReward/std": 0.19343488415082297, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 1905 }, { "clip_ratio/high_max": 0.0017803634516894817, "clip_ratio/high_mean": 0.0001568333595059812, "clip_ratio/low_mean": 9.773711208254099e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002545704774092883, "completions/clipped_ratio": 0.0, "completions/max_length": 207.5, "completions/mean_length": 76.5341796875, "completions/min_length": 33.5, "epoch": 3.850806451612903, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.21875, "kl": 0.004312469623982906, "learning_rate": 1.376036063854401e-07, "loss": 0.00012254316825419665, "reward": 1.5867921710014343, "reward_std": 0.20654936879873276, "rewards/FidelityReward/mean": 0.7305760383605957, "rewards/FidelityReward/std": 0.20190968364477158, "rewards/JudgeFidelityReward/mean": 0.7153618335723877, "rewards/JudgeFidelityReward/std": 0.1969948709011078, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 1910 }, { "clip_ratio/high_max": 0.0012474580202251674, "clip_ratio/high_mean": 0.00018638428300619126, "clip_ratio/low_mean": 3.221911247237585e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002186033991165459, "completions/clipped_ratio": 0.0, "completions/max_length": 228.33333333333334, "completions/mean_length": 75.84765625, "completions/min_length": 32.666666666666664, "epoch": 3.8608870967741935, "frac_reward_zero_std": 0.036458333333333336, "grad_norm": 2.171875, "kl": 0.004555762559175491, "learning_rate": 1.35314931974181e-07, "loss": 0.00018065969925373793, "reward": 1.5719959735870361, "reward_std": 0.2100758602221807, "rewards/FidelityReward/mean": 0.714603324731191, "rewards/FidelityReward/std": 0.2110180656115214, "rewards/JudgeFidelityReward/mean": 0.7193425297737122, "rewards/JudgeFidelityReward/std": 0.19714943567911783, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.062273996571699776, "step": 1915 }, { "clip_ratio/high_max": 0.0018870757892727851, "clip_ratio/high_mean": 0.000248779181856662, "clip_ratio/low_mean": 0.00011639074655249715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036516992840915916, "completions/clipped_ratio": 0.0, "completions/max_length": 210.5, "completions/mean_length": 75.38671875, "completions/min_length": 34.5, "epoch": 3.870967741935484, "frac_reward_zero_std": 0.078125, "grad_norm": 2.0, "kl": 0.004270273260772228, "learning_rate": 1.3304246840423878e-07, "loss": 0.00015360430115833877, "reward": 1.6182953715324402, "reward_std": 0.19573119282722473, "rewards/FidelityReward/mean": 0.7406527698040009, "rewards/FidelityReward/std": 0.19928811490535736, "rewards/JudgeFidelityReward/mean": 0.7611445486545563, "rewards/JudgeFidelityReward/std": 0.17512956261634827, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07639661431312561, "step": 1920 }, { "clip_ratio/high_max": 0.001433981629088521, "clip_ratio/high_mean": 0.00015854540979489684, "clip_ratio/low_mean": 5.7507419842295346e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021605282090604306, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 75.74934895833333, "completions/min_length": 33.666666666666664, "epoch": 3.881048387096774, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.21875, "kl": 0.0042992545757442715, "learning_rate": 1.3078631669029561e-07, "loss": 0.0001514966133981943, "reward": 1.6127551396687825, "reward_std": 0.19303787251313528, "rewards/FidelityReward/mean": 0.738893965880076, "rewards/FidelityReward/std": 0.21338166296482086, "rewards/JudgeFidelityReward/mean": 0.749675452709198, "rewards/JudgeFidelityReward/std": 0.19866444170475006, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1925 }, { "clip_ratio/high_max": 0.0018857224145904183, "clip_ratio/high_mean": 0.000264862144831568, "clip_ratio/low_mean": 0.00014348535623867064, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004083474806975573, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/mean_length": 77.150390625, "completions/min_length": 34.5, "epoch": 3.8911290322580645, "frac_reward_zero_std": 0.0390625, "grad_norm": 2.3125, "kl": 0.004293096344918012, "learning_rate": 1.2854657712194566e-07, "loss": 0.00016289707273244857, "reward": 1.6286965012550354, "reward_std": 0.20009694248437881, "rewards/FidelityReward/mean": 0.7526104152202606, "rewards/FidelityReward/std": 0.19682910293340683, "rewards/JudgeFidelityReward/mean": 0.7541252076625824, "rewards/JudgeFidelityReward/std": 0.1890822947025299, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 1930 }, { "clip_ratio/high_max": 0.0015508708078414203, "clip_ratio/high_mean": 0.00017758933827281, "clip_ratio/low_mean": 0.0001040905190166086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002816798631101847, "completions/clipped_ratio": 0.0, "completions/max_length": 262.3333333333333, "completions/mean_length": 76.55859375, "completions/min_length": 32.0, "epoch": 3.901209677419355, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.203125, "kl": 0.004362406395375729, "learning_rate": 1.2632334925923666e-07, "loss": 0.00029396179597824813, "reward": 1.6139265696207683, "reward_std": 0.18816896776358286, "rewards/FidelityReward/mean": 0.7404965758323669, "rewards/FidelityReward/std": 0.20132641990979513, "rewards/JudgeFidelityReward/mean": 0.7501153349876404, "rewards/JudgeFidelityReward/std": 0.18707647422949472, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04627847671508789, "step": 1935 }, { "clip_ratio/high_max": 0.0020362735725939275, "clip_ratio/high_mean": 0.00026772962883114815, "clip_ratio/low_mean": 0.00011398837086744607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003817180055193603, "completions/clipped_ratio": 0.0, "completions/max_length": 209.5, "completions/mean_length": 75.2646484375, "completions/min_length": 31.5, "epoch": 3.911290322580645, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.015625, "kl": 0.004423535335808993, "learning_rate": 1.2411673192824385e-07, "loss": 0.00022539645433425904, "reward": 1.6225082874298096, "reward_std": 0.19158346951007843, "rewards/FidelityReward/mean": 0.751184731721878, "rewards/FidelityReward/std": 0.2057369500398636, "rewards/JudgeFidelityReward/mean": 0.7465532422065735, "rewards/JudgeFidelityReward/std": 0.19914916157722473, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.04406425356864929, "step": 1940 }, { "clip_ratio/high_max": 0.0010291228536516428, "clip_ratio/high_mean": 0.00015962435863912106, "clip_ratio/low_mean": 8.701246115379035e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002466368256136775, "completions/clipped_ratio": 0.0, "completions/max_length": 224.33333333333334, "completions/mean_length": 77.763671875, "completions/min_length": 34.333333333333336, "epoch": 3.9213709677419355, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.359375, "kl": 0.004695643112063408, "learning_rate": 1.2192682321667785e-07, "loss": 0.0001425493508577347, "reward": 1.6036008993784587, "reward_std": 0.20217236876487732, "rewards/FidelityReward/mean": 0.7366252342859904, "rewards/FidelityReward/std": 0.21326242884000143, "rewards/JudgeFidelityReward/mean": 0.7385085622469584, "rewards/JudgeFidelityReward/std": 0.19523900747299194, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 1945 }, { "clip_ratio/high_max": 0.0016803580801934004, "clip_ratio/high_mean": 0.0002454581728670746, "clip_ratio/low_mean": 0.00011088846367783845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003563466249033809, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/mean_length": 77.994140625, "completions/min_length": 36.5, "epoch": 3.931451612903226, "frac_reward_zero_std": 0.046875, "grad_norm": 1.953125, "kl": 0.004537448659539223, "learning_rate": 1.1975372046952454e-07, "loss": 0.00016439551254734396, "reward": 1.6481484770774841, "reward_std": 0.18741405755281448, "rewards/FidelityReward/mean": 0.7624232172966003, "rewards/FidelityReward/std": 0.21058690547943115, "rewards/JudgeFidelityReward/mean": 0.7753567397594452, "rewards/JudgeFidelityReward/std": 0.20394058525562286, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.04406425356864929, "step": 1950 }, { "clip_ratio/high_max": 0.000805082661099732, "clip_ratio/high_mean": 9.44754690863192e-05, "clip_ratio/low_mean": 6.119682802818716e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015567229129374028, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/mean_length": 75.79361979166667, "completions/min_length": 34.333333333333336, "epoch": 3.941532258064516, "frac_reward_zero_std": 0.0625, "grad_norm": 1.9140625, "kl": 0.004362776409834623, "learning_rate": 1.1759752028471642e-07, "loss": 0.00027247359976172446, "reward": 1.6204776366551716, "reward_std": 0.18989566465218863, "rewards/FidelityReward/mean": 0.7502163648605347, "rewards/FidelityReward/std": 0.19794078667958578, "rewards/JudgeFidelityReward/mean": 0.7450798749923706, "rewards/JudgeFidelityReward/std": 0.19084775944550833, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06709141532580058, "step": 1955 }, { "clip_ratio/high_max": 0.0028824090957641603, "clip_ratio/high_mean": 0.00032075961935333907, "clip_ratio/low_mean": 0.0001487038447521627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00046946346992626786, "completions/clipped_ratio": 0.0, "completions/max_length": 234.5, "completions/mean_length": 76.8701171875, "completions/min_length": 34.0, "epoch": 3.9516129032258065, "frac_reward_zero_std": 0.0859375, "grad_norm": 1.796875, "kl": 0.00458681583404541, "learning_rate": 1.154583185088408e-07, "loss": 0.0001245675841346383, "reward": 1.613266110420227, "reward_std": 0.1826377585530281, "rewards/FidelityReward/mean": 0.7389774322509766, "rewards/FidelityReward/std": 0.20754677057266235, "rewards/JudgeFidelityReward/mean": 0.7554134130477905, "rewards/JudgeFidelityReward/std": 0.19436632841825485, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.0822625607252121, "step": 1960 }, { "clip_ratio/high_max": 0.0008762651588767767, "clip_ratio/high_mean": 0.0001301254495047033, "clip_ratio/low_mean": 3.638085909187794e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016650631441734732, "completions/clipped_ratio": 0.0, "completions/max_length": 208.66666666666666, "completions/mean_length": 75.98958333333333, "completions/min_length": 35.0, "epoch": 3.961693548387097, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.03125, "kl": 0.004261226765811444, "learning_rate": 1.1333621023287732e-07, "loss": 0.00017137029208242894, "reward": 1.61529274781545, "reward_std": 0.1878791848818461, "rewards/FidelityReward/mean": 0.7432210445404053, "rewards/FidelityReward/std": 0.2010272592306137, "rewards/JudgeFidelityReward/mean": 0.748700737953186, "rewards/JudgeFidelityReward/std": 0.18371717631816864, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.062273996571699776, "step": 1965 }, { "clip_ratio/high_max": 0.001544502191245556, "clip_ratio/high_mean": 0.000182223750744015, "clip_ratio/low_mean": 4.776598434546031e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022998972563073039, "completions/clipped_ratio": 0.0, "completions/max_length": 227.5, "completions/mean_length": 76.5615234375, "completions/min_length": 36.5, "epoch": 3.971774193548387, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9453125, "kl": 0.004376919893547893, "learning_rate": 1.1123128978797253e-07, "loss": 0.00010421093320474028, "reward": 1.6059116125106812, "reward_std": 0.19631125777959824, "rewards/FidelityReward/mean": 0.7360182404518127, "rewards/FidelityReward/std": 0.21086598932743073, "rewards/JudgeFidelityReward/mean": 0.7446695566177368, "rewards/JudgeFidelityReward/std": 0.20356427878141403, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 1970 }, { "clip_ratio/high_max": 0.0011429166421294211, "clip_ratio/high_mean": 0.00013866388471797108, "clip_ratio/low_mean": 5.1243860070826486e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000189907755702734, "completions/clipped_ratio": 0.0006510416666666666, "completions/max_length": 291.0, "completions/mean_length": 77.064453125, "completions/min_length": 35.0, "epoch": 3.9818548387096775, "frac_reward_zero_std": 0.10416666666666667, "grad_norm": 1.984375, "kl": 0.004194094892591238, "learning_rate": 1.0914365074124604e-07, "loss": 0.0002580634551122785, "reward": 1.645979881286621, "reward_std": 0.17664009829362234, "rewards/FidelityReward/mean": 0.7614281177520752, "rewards/FidelityReward/std": 0.18488938609759012, "rewards/JudgeFidelityReward/mean": 0.7743119200070699, "rewards/JudgeFidelityReward/std": 0.17406892279783884, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.053477492183446884, "step": 1975 }, { "clip_ratio/high_max": 0.0017826269380748273, "clip_ratio/high_mean": 0.00022066491073928772, "clip_ratio/low_mean": 9.480588778387755e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031547079561278224, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 73.8115234375, "completions/min_length": 33.5, "epoch": 3.991935483870968, "frac_reward_zero_std": 0.1796875, "grad_norm": 1.8359375, "kl": 0.004340470023453236, "learning_rate": 1.0707338589163078e-07, "loss": 0.000330126378685236, "reward": 1.6238734722137451, "reward_std": 0.15744109451770782, "rewards/FidelityReward/mean": 0.7522661089897156, "rewards/FidelityReward/std": 0.21488786488771439, "rewards/JudgeFidelityReward/mean": 0.7451677322387695, "rewards/JudgeFidelityReward/std": 0.1988459900021553, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1980 }, { "clip_ratio/high_max": 0.001022988627664745, "clip_ratio/high_mean": 0.0002242725167889148, "clip_ratio/low_mean": 6.66328560328111e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029090536991134287, "completions/clipped_ratio": 0.0, "completions/max_length": 249.66666666666666, "completions/mean_length": 79.60481770833333, "completions/min_length": 32.0, "epoch": 4.002016129032258, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.015625, "kl": 0.004469733871519566, "learning_rate": 1.0502058726574914e-07, "loss": 0.0002078363671898842, "reward": 1.6008758147557576, "reward_std": 0.19291386504968008, "rewards/FidelityReward/mean": 0.7333622177441915, "rewards/FidelityReward/std": 0.21695821483929953, "rewards/JudgeFidelityReward/mean": 0.7395844658215841, "rewards/JudgeFidelityReward/std": 0.20481655995051065, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06709141532580058, "step": 1985 }, { "clip_ratio/high_max": 0.001525480207055807, "clip_ratio/high_mean": 0.00025009968085214495, "clip_ratio/low_mean": 6.565837393281981e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003157580620609224, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/mean_length": 76.8154296875, "completions/min_length": 37.5, "epoch": 4.012096774193548, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.09375, "kl": 0.004226531647145748, "learning_rate": 1.0298534611382148e-07, "loss": 0.0001934761879965663, "reward": 1.5938870310783386, "reward_std": 0.19848518818616867, "rewards/FidelityReward/mean": 0.7286499440670013, "rewards/FidelityReward/std": 0.20873919129371643, "rewards/JudgeFidelityReward/mean": 0.732427328824997, "rewards/JudgeFidelityReward/std": 0.20520135760307312, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 1990 }, { "clip_ratio/high_max": 0.0010653065517544746, "clip_ratio/high_mean": 0.0001345355121884495, "clip_ratio/low_mean": 0.00018311963649466633, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031765514286234974, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 76.408203125, "completions/min_length": 33.333333333333336, "epoch": 4.022177419354839, "frac_reward_zero_std": 0.046875, "grad_norm": 2.203125, "kl": 0.004672729689627886, "learning_rate": 1.009677529056096e-07, "loss": 0.0002288956195116043, "reward": 1.64350430170695, "reward_std": 0.1943971316019694, "rewards/FidelityReward/mean": 0.762401262919108, "rewards/FidelityReward/std": 0.203192338347435, "rewards/JudgeFidelityReward/mean": 0.7674143314361572, "rewards/JudgeFidelityReward/std": 0.19298016528288522, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.07174401481946309, "step": 1995 }, { "clip_ratio/high_max": 0.0017048316076397896, "clip_ratio/high_mean": 0.000261395430425182, "clip_ratio/low_mean": 0.00017609189380891622, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000437487312592566, "completions/clipped_ratio": 0.0, "completions/max_length": 213.5, "completions/mean_length": 76.1396484375, "completions/min_length": 34.0, "epoch": 4.032258064516129, "frac_reward_zero_std": 0.1171875, "grad_norm": 1.890625, "kl": 0.004294170532375574, "learning_rate": 9.89678973263962e-08, "loss": 0.00012682254891842603, "reward": 1.658136785030365, "reward_std": 0.17242266982793808, "rewards/FidelityReward/mean": 0.7771112322807312, "rewards/FidelityReward/std": 0.19600674510002136, "rewards/JudgeFidelityReward/mean": 0.7679104208946228, "rewards/JudgeFidelityReward/std": 0.1973351463675499, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2000 }, { "clip_ratio/high_max": 0.0010682083666324615, "clip_ratio/high_mean": 0.00013120417715981603, "clip_ratio/low_mean": 0.00017720228352118282, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000308406469412148, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/mean_length": 74.80859375, "completions/min_length": 33.0, "epoch": 4.042338709677419, "frac_reward_zero_std": 0.078125, "grad_norm": 2.125, "kl": 0.004431633418425918, "learning_rate": 9.69858682729976e-08, "loss": 5.476299556903541e-05, "reward": 1.6193496386210124, "reward_std": 0.18717692295710245, "rewards/FidelityReward/mean": 0.747450073560079, "rewards/FidelityReward/std": 0.20029619336128235, "rewards/JudgeFidelityReward/mean": 0.7444502115249634, "rewards/JudgeFidelityReward/std": 0.19624574979146323, "rewards/SelfEvolvingFormatReward/mean": 0.9993489583333334, "rewards/SelfEvolvingFormatReward/std": 0.014731391022602717, "step": 2005 }, { "clip_ratio/high_max": 0.0021943000378087164, "clip_ratio/high_mean": 0.00026460652006790044, "clip_ratio/low_mean": 7.295905452338047e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003375655796844512, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/mean_length": 77.330078125, "completions/min_length": 33.5, "epoch": 4.05241935483871, "frac_reward_zero_std": 0.109375, "grad_norm": 2.046875, "kl": 0.00448016170412302, "learning_rate": 9.502175384981164e-08, "loss": 9.973873384296894e-05, "reward": 1.6158220767974854, "reward_std": 0.19449644535779953, "rewards/FidelityReward/mean": 0.7374111711978912, "rewards/FidelityReward/std": 0.19740547239780426, "rewards/JudgeFidelityReward/mean": 0.7597513794898987, "rewards/JudgeFidelityReward/std": 0.18221478164196014, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 2010 }, { "clip_ratio/high_max": 0.0009551406605169177, "clip_ratio/high_mean": 0.00014866775600239635, "clip_ratio/low_mean": 9.065052727237344e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023931828327476977, "completions/clipped_ratio": 0.0, "completions/max_length": 196.66666666666666, "completions/mean_length": 74.72200520833333, "completions/min_length": 34.0, "epoch": 4.0625, "frac_reward_zero_std": 0.078125, "grad_norm": 1.921875, "kl": 0.004114143922924996, "learning_rate": 9.307564136490254e-08, "loss": 0.00017758040921762587, "reward": 1.6118155320485432, "reward_std": 0.18590607245763144, "rewards/FidelityReward/mean": 0.7412594159444174, "rewards/FidelityReward/std": 0.21192209919293722, "rewards/JudgeFidelityReward/mean": 0.743065337340037, "rewards/JudgeFidelityReward/std": 0.21167430778344473, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2015 }, { "clip_ratio/high_max": 0.0013661872129887343, "clip_ratio/high_mean": 0.0002579367021098733, "clip_ratio/low_mean": 0.00011139175039716065, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036932844086550175, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/mean_length": 75.3525390625, "completions/min_length": 35.0, "epoch": 4.07258064516129, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.265625, "kl": 0.004368662647902965, "learning_rate": 9.114761732611892e-08, "loss": 0.00013490417040884495, "reward": 1.5791715383529663, "reward_std": 0.19694602489471436, "rewards/FidelityReward/mean": 0.7147561013698578, "rewards/FidelityReward/std": 0.19339265674352646, "rewards/JudgeFidelityReward/mean": 0.731760710477829, "rewards/JudgeFidelityReward/std": 0.18016470968723297, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 2020 }, { "clip_ratio/high_max": 0.0012898062821477651, "clip_ratio/high_mean": 0.0001900595729239285, "clip_ratio/low_mean": 9.9434849107638e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002894944278523326, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 76.9140625, "completions/min_length": 34.0, "epoch": 4.082661290322581, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.234375, "kl": 0.004379147291183471, "learning_rate": 8.923776743724875e-08, "loss": 0.00013122634263709187, "reward": 1.6464619239171345, "reward_std": 0.18738944828510284, "rewards/FidelityReward/mean": 0.7629764874776205, "rewards/FidelityReward/std": 0.20726822316646576, "rewards/JudgeFidelityReward/mean": 0.771528164545695, "rewards/JudgeFidelityReward/std": 0.19876697659492493, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.054841707150141396, "step": 2025 }, { "clip_ratio/high_max": 0.002335022436454892, "clip_ratio/high_mean": 0.0002216923632659018, "clip_ratio/low_mean": 8.822616655379534e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030991851817816494, "completions/clipped_ratio": 0.0, "completions/max_length": 261.5, "completions/mean_length": 75.5888671875, "completions/min_length": 34.0, "epoch": 4.092741935483871, "frac_reward_zero_std": 0.109375, "grad_norm": 1.9140625, "kl": 0.004260590020567179, "learning_rate": 8.734617659420978e-08, "loss": 0.00027530582156032324, "reward": 1.612242877483368, "reward_std": 0.17676255851984024, "rewards/FidelityReward/mean": 0.7413524985313416, "rewards/FidelityReward/std": 0.20092611014842987, "rewards/JudgeFidelityReward/mean": 0.7466636002063751, "rewards/JudgeFidelityReward/std": 0.18831756711006165, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06941771507263184, "step": 2030 }, { "clip_ratio/high_max": 0.001386985881254077, "clip_ratio/high_mean": 0.00013487619580700993, "clip_ratio/low_mean": 3.598290859372355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017085910658352078, "completions/clipped_ratio": 0.0, "completions/max_length": 208.33333333333334, "completions/mean_length": 75.43294270833333, "completions/min_length": 33.666666666666664, "epoch": 4.102822580645161, "frac_reward_zero_std": 0.078125, "grad_norm": 1.8828125, "kl": 0.004475994128733873, "learning_rate": 8.547292888127522e-08, "loss": 9.766736766323448e-05, "reward": 1.6732274691263835, "reward_std": 0.1748856802781423, "rewards/FidelityReward/mean": 0.7790601054827372, "rewards/FidelityReward/std": 0.20122897624969482, "rewards/JudgeFidelityReward/mean": 0.7922410170237223, "rewards/JudgeFidelityReward/std": 0.18953942755858103, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 2035 }, { "clip_ratio/high_max": 0.0015697124414145946, "clip_ratio/high_mean": 0.000242658203933388, "clip_ratio/low_mean": 0.00015675835602451115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003994165570475161, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 77.0693359375, "completions/min_length": 36.0, "epoch": 4.112903225806452, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.046875, "kl": 0.0045061461627483364, "learning_rate": 8.361810756733689e-08, "loss": 0.00015069040236994624, "reward": 1.635189175605774, "reward_std": 0.18307878077030182, "rewards/FidelityReward/mean": 0.750144362449646, "rewards/FidelityReward/std": 0.20004800707101822, "rewards/JudgeFidelityReward/mean": 0.7700894773006439, "rewards/JudgeFidelityReward/std": 0.19038581103086472, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0008632990997284651, "clip_ratio/high_mean": 0.00011452395119704306, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011452395119704306, "completions/clipped_ratio": 0.0, "completions/max_length": 184.66666666666666, "completions/mean_length": 75.5390625, "completions/min_length": 33.0, "epoch": 4.122983870967742, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.03125, "kl": 0.0043548217974603174, "learning_rate": 8.17817951022033e-08, "loss": 8.257174631580711e-05, "reward": 1.5938140948613484, "reward_std": 0.19973676900068918, "rewards/FidelityReward/mean": 0.7266798615455627, "rewards/FidelityReward/std": 0.20568356414635977, "rewards/JudgeFidelityReward/mean": 0.7375236749649048, "rewards/JudgeFidelityReward/std": 0.19392590721448263, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.05635726824402809, "step": 2045 }, { "clip_ratio/high_max": 0.0014636674197390676, "clip_ratio/high_mean": 0.0001801763653929811, "clip_ratio/low_mean": 0.0001437076338334009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032388399995397776, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/mean_length": 77.2890625, "completions/min_length": 35.5, "epoch": 4.133064516129032, "frac_reward_zero_std": 0.0859375, "grad_norm": 1.90625, "kl": 0.004456135351210833, "learning_rate": 7.996407311293435e-08, "loss": 0.00013899842742830514, "reward": 1.610632598400116, "reward_std": 0.18714118003845215, "rewards/FidelityReward/mean": 0.7382184267044067, "rewards/FidelityReward/std": 0.2114994078874588, "rewards/JudgeFidelityReward/mean": 0.7487344443798065, "rewards/JudgeFidelityReward/std": 0.2058488354086876, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 2050 }, { "clip_ratio/high_max": 0.0018859650008380413, "clip_ratio/high_mean": 0.0002826157957315445, "clip_ratio/low_mean": 3.106059157289565e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003136764047667384, "completions/clipped_ratio": 0.0, "completions/max_length": 210.66666666666666, "completions/mean_length": 77.76627604166667, "completions/min_length": 33.666666666666664, "epoch": 4.143145161290323, "frac_reward_zero_std": 0.09375, "grad_norm": 2.078125, "kl": 0.004451719578355551, "learning_rate": 7.816502240021367e-08, "loss": 0.00022437432780861855, "reward": 1.6202104091644287, "reward_std": 0.1940657446781794, "rewards/FidelityReward/mean": 0.7459409832954407, "rewards/FidelityReward/std": 0.21103068192799887, "rewards/JudgeFidelityReward/mean": 0.7511431177457174, "rewards/JudgeFidelityReward/std": 0.1986528585354487, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 2055 }, { "clip_ratio/high_max": 0.001548237493261695, "clip_ratio/high_mean": 0.0002511116035748273, "clip_ratio/low_mean": 0.00018613060237839817, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004372422117739916, "completions/clipped_ratio": 0.0, "completions/max_length": 246.5, "completions/mean_length": 76.1005859375, "completions/min_length": 32.5, "epoch": 4.153225806451613, "frac_reward_zero_std": 0.046875, "grad_norm": 1.96875, "kl": 0.004328435473144055, "learning_rate": 7.638472293475601e-08, "loss": 0.0001831711968407035, "reward": 1.662703275680542, "reward_std": 0.1772255226969719, "rewards/FidelityReward/mean": 0.7796740531921387, "rewards/FidelityReward/std": 0.18996409326791763, "rewards/JudgeFidelityReward/mean": 0.7670349180698395, "rewards/JudgeFidelityReward/std": 0.1907225027680397, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2060 }, { "clip_ratio/high_max": 0.0010754312854260207, "clip_ratio/high_mean": 0.0001810855173971504, "clip_ratio/low_mean": 8.294982108054682e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002640353515744209, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/mean_length": 74.74479166666667, "completions/min_length": 33.0, "epoch": 4.163306451612903, "frac_reward_zero_std": 0.11979166666666667, "grad_norm": 2.109375, "kl": 0.004200613964349032, "learning_rate": 7.462325385375317e-08, "loss": 0.00012141899205744266, "reward": 1.6280490557352703, "reward_std": 0.17944997549057007, "rewards/FidelityReward/mean": 0.7472088932991028, "rewards/FidelityReward/std": 0.20161542296409607, "rewards/JudgeFidelityReward/mean": 0.7636333107948303, "rewards/JudgeFidelityReward/std": 0.17664937178293863, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 2065 }, { "clip_ratio/high_max": 0.0017991371685639023, "clip_ratio/high_mean": 0.00024368930025957524, "clip_ratio/low_mean": 6.20857666945085e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003057750698644668, "completions/clipped_ratio": 0.0, "completions/max_length": 190.5, "completions/mean_length": 76.7451171875, "completions/min_length": 33.0, "epoch": 4.173387096774194, "frac_reward_zero_std": 0.09375, "grad_norm": 2.140625, "kl": 0.004143463261425495, "learning_rate": 7.288069345735593e-08, "loss": 0.00011605084873735904, "reward": 1.5734190940856934, "reward_std": 0.20483741909265518, "rewards/FidelityReward/mean": 0.7163415551185608, "rewards/FidelityReward/std": 0.20979640632867813, "rewards/JudgeFidelityReward/mean": 0.7161083221435547, "rewards/JudgeFidelityReward/std": 0.20992134511470795, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 2070 }, { "clip_ratio/high_max": 0.0010127606801688671, "clip_ratio/high_mean": 0.00011718362220562995, "clip_ratio/low_mean": 7.246420136652887e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001896478235721588, "completions/clipped_ratio": 0.0, "completions/max_length": 287.6666666666667, "completions/mean_length": 76.70572916666667, "completions/min_length": 35.666666666666664, "epoch": 4.183467741935484, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 1.953125, "kl": 0.0044537817128002645, "learning_rate": 7.115711920519297e-08, "loss": 0.00014690221287310122, "reward": 1.6364631652832031, "reward_std": 0.18676521380742392, "rewards/FidelityReward/mean": 0.7548468510309855, "rewards/FidelityReward/std": 0.20611186822255453, "rewards/JudgeFidelityReward/mean": 0.7703941464424133, "rewards/JudgeFidelityReward/std": 0.1862939844528834, "rewards/SelfEvolvingFormatReward/mean": 0.9928385416666666, "rewards/SelfEvolvingFormatReward/std": 0.08037197838226955, "step": 2075 }, { "clip_ratio/high_max": 0.0017066396540030837, "clip_ratio/high_mean": 0.00022079221671447157, "clip_ratio/low_mean": 0.0001492854324169457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00037007766077294945, "completions/clipped_ratio": 0.0, "completions/max_length": 220.5, "completions/mean_length": 75.677734375, "completions/min_length": 32.0, "epoch": 4.193548387096774, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.109375, "kl": 0.004251022543758154, "learning_rate": 6.94526077129286e-08, "loss": 0.00019728078041225672, "reward": 1.6271880865097046, "reward_std": 0.19054310023784637, "rewards/FidelityReward/mean": 0.7466512024402618, "rewards/FidelityReward/std": 0.20713885873556137, "rewards/JudgeFidelityReward/mean": 0.7620501816272736, "rewards/JudgeFidelityReward/std": 0.1859828159213066, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2080 }, { "clip_ratio/high_max": 0.00165530014783144, "clip_ratio/high_mean": 0.0002460955874994397, "clip_ratio/low_mean": 4.467732360353693e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029077292419970034, "completions/clipped_ratio": 0.0, "completions/max_length": 212.33333333333334, "completions/mean_length": 74.82877604166667, "completions/min_length": 33.333333333333336, "epoch": 4.203629032258065, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.9765625, "kl": 0.004432747606188059, "learning_rate": 6.776723474885654e-08, "loss": 0.0001927702920511365, "reward": 1.5901945034662883, "reward_std": 0.19350763658682504, "rewards/FidelityReward/mean": 0.7237923940022787, "rewards/FidelityReward/std": 0.20263835291067758, "rewards/JudgeFidelityReward/mean": 0.7341063221295675, "rewards/JudgeFidelityReward/std": 0.18875634670257568, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 2085 }, { "clip_ratio/high_max": 0.00196528690867126, "clip_ratio/high_mean": 0.000186172264511697, "clip_ratio/low_mean": 0.00011485870345495641, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030103096505627034, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 75.75390625, "completions/min_length": 32.5, "epoch": 4.213709677419355, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.9296875, "kl": 0.004234050773084164, "learning_rate": 6.610107523053177e-08, "loss": 0.000217284238897264, "reward": 1.6058353185653687, "reward_std": 0.18838677555322647, "rewards/FidelityReward/mean": 0.7346596121788025, "rewards/FidelityReward/std": 0.1994512751698494, "rewards/JudgeFidelityReward/mean": 0.7443046569824219, "rewards/JudgeFidelityReward/std": 0.18290919065475464, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 2090 }, { "clip_ratio/high_max": 0.0014216050039976835, "clip_ratio/high_mean": 0.00015660799108445643, "clip_ratio/low_mean": 7.792866381350905e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023453664034605025, "completions/clipped_ratio": 0.0006510416666666666, "completions/max_length": 271.3333333333333, "completions/mean_length": 77.119140625, "completions/min_length": 33.333333333333336, "epoch": 4.223790322580645, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.984375, "kl": 0.004312271066009998, "learning_rate": 6.44542032214409e-08, "loss": 0.00010173945920541883, "reward": 1.6641346613566081, "reward_std": 0.18286090095837912, "rewards/FidelityReward/mean": 0.7754382888476054, "rewards/FidelityReward/std": 0.19656967123349509, "rewards/JudgeFidelityReward/mean": 0.779345691204071, "rewards/JudgeFidelityReward/std": 0.18775095542271933, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2095 }, { "clip_ratio/high_max": 0.0010460362304002047, "clip_ratio/high_mean": 0.00018730882438831032, "clip_ratio/low_mean": 0.00011254819910391233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029985701548866927, "completions/clipped_ratio": 0.0, "completions/max_length": 225.5, "completions/mean_length": 74.4716796875, "completions/min_length": 32.0, "epoch": 4.233870967741935, "frac_reward_zero_std": 0.1015625, "grad_norm": 1.9609375, "kl": 0.004087631776928902, "learning_rate": 6.282669192770895e-08, "loss": 0.0001672212267294526, "reward": 1.5995277166366577, "reward_std": 0.19840194284915924, "rewards/FidelityReward/mean": 0.7312908172607422, "rewards/FidelityReward/std": 0.210673525929451, "rewards/JudgeFidelityReward/mean": 0.7384269535541534, "rewards/JudgeFidelityReward/std": 0.19092251360416412, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 2100 }, { "clip_ratio/high_max": 0.001248898822814226, "clip_ratio/high_mean": 0.00018053228850476444, "clip_ratio/low_mean": 8.536754758097232e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002658998360857368, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/mean_length": 76.099609375, "completions/min_length": 35.333333333333336, "epoch": 4.243951612903226, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.9296875, "kl": 0.00441257543861866, "learning_rate": 6.121861369484633e-08, "loss": 0.00019141330849379302, "reward": 1.6088426113128662, "reward_std": 0.18932096660137177, "rewards/FidelityReward/mean": 0.7395243247350057, "rewards/FidelityReward/std": 0.20713828007380167, "rewards/JudgeFidelityReward/mean": 0.7418918410936991, "rewards/JudgeFidelityReward/std": 0.1972370743751526, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 2105 }, { "clip_ratio/high_max": 0.0017895319731906056, "clip_ratio/high_mean": 0.00020618420094251633, "clip_ratio/low_mean": 4.230513877701014e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002484893542714417, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/mean_length": 77.8134765625, "completions/min_length": 37.5, "epoch": 4.254032258064516, "frac_reward_zero_std": 0.078125, "grad_norm": 2.0625, "kl": 0.004541651997715235, "learning_rate": 5.963004000453242e-08, "loss": 0.00017580522689968346, "reward": 1.626539945602417, "reward_std": 0.18940573185682297, "rewards/FidelityReward/mean": 0.7438114881515503, "rewards/FidelityReward/std": 0.20390889048576355, "rewards/JudgeFidelityReward/mean": 0.7732692658901215, "rewards/JudgeFidelityReward/std": 0.17681121081113815, "rewards/SelfEvolvingFormatReward/mean": 0.9921875, "rewards/SelfEvolvingFormatReward/std": 0.08812850713729858, "step": 2110 }, { "clip_ratio/high_max": 0.0017022217623889446, "clip_ratio/high_mean": 0.00015267388662323355, "clip_ratio/low_mean": 5.153752863407135e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002042114152573049, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/mean_length": 77.02213541666667, "completions/min_length": 32.333333333333336, "epoch": 4.264112903225806, "frac_reward_zero_std": 0.0625, "grad_norm": 2.125, "kl": 0.004436514247208834, "learning_rate": 5.8061041471437696e-08, "loss": 0.00018947042990475894, "reward": 1.6045000553131104, "reward_std": 0.2038863797982534, "rewards/FidelityReward/mean": 0.7333805362383524, "rewards/FidelityReward/std": 0.21073735256989798, "rewards/JudgeFidelityReward/mean": 0.7448431054751078, "rewards/JudgeFidelityReward/std": 0.19828306138515472, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2115 }, { "clip_ratio/high_max": 0.002354386355727911, "clip_ratio/high_mean": 0.00033902842551469803, "clip_ratio/low_mean": 0.0001244529994437471, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004634814104065299, "completions/clipped_ratio": 0.0, "completions/max_length": 222.5, "completions/mean_length": 75.2236328125, "completions/min_length": 33.5, "epoch": 4.274193548387097, "frac_reward_zero_std": 0.1171875, "grad_norm": 2.109375, "kl": 0.00428944630548358, "learning_rate": 5.6511687840085695e-08, "loss": 0.00020097726956009866, "reward": 1.6449273824691772, "reward_std": 0.16848085820674896, "rewards/FidelityReward/mean": 0.7586049437522888, "rewards/FidelityReward/std": 0.20792318880558014, "rewards/JudgeFidelityReward/mean": 0.7745980322360992, "rewards/JudgeFidelityReward/std": 0.18421665579080582, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 2120 }, { "clip_ratio/high_max": 0.001130073145031929, "clip_ratio/high_mean": 0.00015534920385107398, "clip_ratio/low_mean": 6.672255476587452e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022207176079973577, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/mean_length": 76.10286458333333, "completions/min_length": 35.0, "epoch": 4.284274193548387, "frac_reward_zero_std": 0.0625, "grad_norm": 1.9453125, "kl": 0.004210641561076045, "learning_rate": 5.4982047981752155e-08, "loss": 0.0002659392077475786, "reward": 1.6694721778233845, "reward_std": 0.18199909230073294, "rewards/FidelityReward/mean": 0.7792465488115946, "rewards/FidelityReward/std": 0.1917042831579844, "rewards/JudgeFidelityReward/mean": 0.7843575278917948, "rewards/JudgeFidelityReward/std": 0.1909073442220688, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 2125 }, { "clip_ratio/high_max": 0.001421480835415423, "clip_ratio/high_mean": 0.00025792542146518826, "clip_ratio/low_mean": 0.0001071672872058116, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036509273340925576, "completions/clipped_ratio": 0.0009765625, "completions/max_length": 327.0, "completions/mean_length": 76.177734375, "completions/min_length": 31.0, "epoch": 4.294354838709677, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.90625, "kl": 0.004409938305616379, "learning_rate": 5.3472189891403506e-08, "loss": 0.00025842422619462015, "reward": 1.6572197675704956, "reward_std": 0.18258966505527496, "rewards/FidelityReward/mean": 0.772561639547348, "rewards/FidelityReward/std": 0.22187913209199905, "rewards/JudgeFidelityReward/mean": 0.7751757204532623, "rewards/JudgeFidelityReward/std": 0.20853505283594131, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2130 }, { "clip_ratio/high_max": 0.000954413553699851, "clip_ratio/high_mean": 0.0001402384805260226, "clip_ratio/low_mean": 5.9298481210134925e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019953696755692363, "completions/clipped_ratio": 0.0, "completions/max_length": 183.33333333333334, "completions/mean_length": 75.171875, "completions/min_length": 32.333333333333336, "epoch": 4.304435483870968, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 1.9609375, "kl": 0.004340261686593294, "learning_rate": 5.1982180684674856e-08, "loss": 0.00026184553280472755, "reward": 1.6220850149790447, "reward_std": 0.20042178531487784, "rewards/FidelityReward/mean": 0.7497738798459371, "rewards/FidelityReward/std": 0.19252600272496542, "rewards/JudgeFidelityReward/mean": 0.7498305042584738, "rewards/JudgeFidelityReward/std": 0.19089479744434357, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06957309817274411, "step": 2135 }, { "clip_ratio/high_max": 0.001927033788524568, "clip_ratio/high_mean": 0.0003108782053459436, "clip_ratio/low_mean": 0.00017455053748562931, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004854287370108068, "completions/clipped_ratio": 0.0, "completions/max_length": 286.5, "completions/mean_length": 77.251953125, "completions/min_length": 34.5, "epoch": 4.314516129032258, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.125, "kl": 0.0045956001617014405, "learning_rate": 5.051208659488626e-08, "loss": 0.0002040887251496315, "reward": 1.6324363350868225, "reward_std": 0.18064165115356445, "rewards/FidelityReward/mean": 0.7580136656761169, "rewards/FidelityReward/std": 0.21706699579954147, "rewards/JudgeFidelityReward/mean": 0.7498218417167664, "rewards/JudgeFidelityReward/std": 0.21950280666351318, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2140 }, { "clip_ratio/high_max": 0.0013383530545979737, "clip_ratio/high_mean": 0.00021433201036415994, "clip_ratio/low_mean": 5.1320019701961427e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026565203443169595, "completions/clipped_ratio": 0.0, "completions/max_length": 178.33333333333334, "completions/mean_length": 72.33984375, "completions/min_length": 31.666666666666668, "epoch": 4.324596774193548, "frac_reward_zero_std": 0.13020833333333334, "grad_norm": 2.015625, "kl": 0.004206532146781683, "learning_rate": 4.9061972970098394e-08, "loss": 0.00014082366833463312, "reward": 1.6561845143636067, "reward_std": 0.16676337520281473, "rewards/FidelityReward/mean": 0.7688944737116495, "rewards/FidelityReward/std": 0.19235980014006296, "rewards/JudgeFidelityReward/mean": 0.7765331069628397, "rewards/JudgeFidelityReward/std": 0.17641136546929678, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.025465538104375202, "step": 2145 }, { "clip_ratio/high_max": 0.002550362842157483, "clip_ratio/high_mean": 0.0002774420543573797, "clip_ratio/low_mean": 0.0001237043528817594, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040114641888067126, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/mean_length": 76.5869140625, "completions/min_length": 35.5, "epoch": 4.334677419354839, "frac_reward_zero_std": 0.0625, "grad_norm": 2.015625, "kl": 0.004527305625379086, "learning_rate": 4.763190427020819e-08, "loss": 0.00012252613669261336, "reward": 1.6308746933937073, "reward_std": 0.18669770658016205, "rewards/FidelityReward/mean": 0.7487328052520752, "rewards/FidelityReward/std": 0.2014009729027748, "rewards/JudgeFidelityReward/mean": 0.7691664695739746, "rewards/JudgeFidelityReward/std": 0.17752519994974136, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 2150 }, { "clip_ratio/high_max": 0.0009177489206194878, "clip_ratio/high_mean": 0.0001674351398833096, "clip_ratio/low_mean": 7.432725542457774e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024176238803192974, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/mean_length": 75.44010416666667, "completions/min_length": 31.333333333333332, "epoch": 4.344758064516129, "frac_reward_zero_std": 0.09895833333333333, "grad_norm": 2.09375, "kl": 0.004215614218264818, "learning_rate": 4.6221944064083195e-08, "loss": 0.0001578575000166893, "reward": 1.5993274450302124, "reward_std": 0.18436014155546823, "rewards/FidelityReward/mean": 0.732560376326243, "rewards/FidelityReward/std": 0.21070661147435507, "rewards/JudgeFidelityReward/mean": 0.7367894053459167, "rewards/JudgeFidelityReward/std": 0.2032040854295095, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 2155 }, { "clip_ratio/high_max": 0.0014693424571305513, "clip_ratio/high_mean": 0.0002216088061686605, "clip_ratio/low_mean": 7.256066528498195e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002941694634500891, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/mean_length": 75.73828125, "completions/min_length": 29.5, "epoch": 4.354838709677419, "frac_reward_zero_std": 0.1015625, "grad_norm": 2.078125, "kl": 0.00429981267079711, "learning_rate": 4.483215502673554e-08, "loss": 0.0001901344978250563, "reward": 1.6344141960144043, "reward_std": 0.17427322268486023, "rewards/FidelityReward/mean": 0.7536951899528503, "rewards/FidelityReward/std": 0.1992030218243599, "rewards/JudgeFidelityReward/mean": 0.7624145746231079, "rewards/JudgeFidelityReward/std": 0.18311656266450882, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2160 }, { "clip_ratio/high_max": 0.0013675966765731573, "clip_ratio/high_mean": 0.00022789081558585166, "clip_ratio/low_mean": 3.0467292526736854e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002583581139333546, "completions/clipped_ratio": 0.0, "completions/max_length": 211.33333333333334, "completions/mean_length": 74.57421875, "completions/min_length": 34.666666666666664, "epoch": 4.36491935483871, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 1.84375, "kl": 0.004290653858333826, "learning_rate": 4.3462598936536974e-08, "loss": 0.0001533273607492447, "reward": 1.60842764377594, "reward_std": 0.19615506629149118, "rewards/FidelityReward/mean": 0.7401986122131348, "rewards/FidelityReward/std": 0.21083234250545502, "rewards/JudgeFidelityReward/mean": 0.7397132317225138, "rewards/JudgeFidelityReward/std": 0.20042388141155243, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.04410756006836891, "step": 2165 }, { "clip_ratio/high_max": 0.0017849760130047797, "clip_ratio/high_mean": 0.0002759877825155854, "clip_ratio/low_mean": 0.00018015236710198224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045614015543833376, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 78.8232421875, "completions/min_length": 37.0, "epoch": 4.375, "frac_reward_zero_std": 0.0546875, "grad_norm": 1.875, "kl": 0.004520012531429529, "learning_rate": 4.2113336672471245e-08, "loss": 5.5818009423092006e-05, "reward": 1.621022343635559, "reward_std": 0.18920938670635223, "rewards/FidelityReward/mean": 0.7432770431041718, "rewards/FidelityReward/std": 0.20782603323459625, "rewards/JudgeFidelityReward/mean": 0.7613500356674194, "rewards/JudgeFidelityReward/std": 0.19415508955717087, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2170 }, { "clip_ratio/high_max": 0.0007347796810790897, "clip_ratio/high_mean": 0.00011287226516287774, "clip_ratio/low_mean": 6.517642468679696e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017804868402890862, "completions/clipped_ratio": 0.0, "completions/max_length": 213.33333333333334, "completions/mean_length": 75.95052083333333, "completions/min_length": 35.333333333333336, "epoch": 4.38508064516129, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.046875, "kl": 0.004169743228703737, "learning_rate": 4.078442821142919e-08, "loss": 0.00022539617493748664, "reward": 1.5906039079030354, "reward_std": 0.19004892806212106, "rewards/FidelityReward/mean": 0.7226227124532064, "rewards/FidelityReward/std": 0.2164847900470098, "rewards/JudgeFidelityReward/mean": 0.7392175793647766, "rewards/JudgeFidelityReward/std": 0.18402027090390524, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 2175 }, { "clip_ratio/high_max": 0.0013978639850392937, "clip_ratio/high_mean": 0.00022525334497913718, "clip_ratio/low_mean": 0.00013305482571013272, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00035830816486850383, "completions/clipped_ratio": 0.0, "completions/max_length": 173.5, "completions/mean_length": 74.90234375, "completions/min_length": 35.5, "epoch": 4.395161290322581, "frac_reward_zero_std": 0.09375, "grad_norm": 2.0625, "kl": 0.004268205724656582, "learning_rate": 3.947593262554205e-08, "loss": 9.935577400028705e-05, "reward": 1.6108205914497375, "reward_std": 0.18632050603628159, "rewards/FidelityReward/mean": 0.7387725114822388, "rewards/FidelityReward/std": 0.197623111307621, "rewards/JudgeFidelityReward/mean": 0.7440961301326752, "rewards/JudgeFidelityReward/std": 0.18161259591579437, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0010237023932859301, "clip_ratio/high_mean": 0.00011811817821580917, "clip_ratio/low_mean": 5.866916326340288e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017678734147921204, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 75.705078125, "completions/min_length": 34.666666666666664, "epoch": 4.405241935483871, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.234375, "kl": 0.004386240616440773, "learning_rate": 3.818790807955552e-08, "loss": 0.0002643003361299634, "reward": 1.6509801944096882, "reward_std": 0.18802579740683237, "rewards/FidelityReward/mean": 0.7694242795308431, "rewards/FidelityReward/std": 0.20251364509264627, "rewards/JudgeFidelityReward/mean": 0.7670180002848307, "rewards/JudgeFidelityReward/std": 0.1964504470427831, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 2185 }, { "clip_ratio/high_max": 0.0015660115983337163, "clip_ratio/high_mean": 0.00021831748308613895, "clip_ratio/low_mean": 0.00014249632076825946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003608138067647815, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 76.9677734375, "completions/min_length": 34.5, "epoch": 4.415322580645161, "frac_reward_zero_std": 0.078125, "grad_norm": 1.90625, "kl": 0.004302301723510027, "learning_rate": 3.692041182824485e-08, "loss": 0.0002372488146647811, "reward": 1.6367492079734802, "reward_std": 0.18295477330684662, "rewards/FidelityReward/mean": 0.7550921142101288, "rewards/FidelityReward/std": 0.20840520411729813, "rewards/JudgeFidelityReward/mean": 0.7691735625267029, "rewards/JudgeFidelityReward/std": 0.18419194221496582, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2190 }, { "clip_ratio/high_max": 0.0009847545297816396, "clip_ratio/high_mean": 0.00011796849430538714, "clip_ratio/low_mean": 2.9328447999432684e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014729694230481983, "completions/clipped_ratio": 0.0, "completions/max_length": 222.66666666666666, "completions/mean_length": 75.119140625, "completions/min_length": 34.333333333333336, "epoch": 4.425403225806452, "frac_reward_zero_std": 0.09375, "grad_norm": 2.015625, "kl": 0.004028069134801626, "learning_rate": 3.567350021386895e-08, "loss": 0.0002104469109326601, "reward": 1.5976461966832478, "reward_std": 0.1895762582619985, "rewards/FidelityReward/mean": 0.7314104437828064, "rewards/FidelityReward/std": 0.21053215861320496, "rewards/JudgeFidelityReward/mean": 0.7344247500101725, "rewards/JudgeFidelityReward/std": 0.199992502729098, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2195 }, { "clip_ratio/high_max": 0.0017578533850610256, "clip_ratio/high_mean": 0.00016352072707377375, "clip_ratio/low_mean": 0.0001368714074487798, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00030039213597774503, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/mean_length": 76.3681640625, "completions/min_length": 36.5, "epoch": 4.435483870967742, "frac_reward_zero_std": 0.0390625, "grad_norm": 1.9765625, "kl": 0.004614555835723877, "learning_rate": 3.444722866366662e-08, "loss": 0.00018884572200477122, "reward": 1.6233514547348022, "reward_std": 0.19499419629573822, "rewards/FidelityReward/mean": 0.7513507008552551, "rewards/FidelityReward/std": 0.20277181267738342, "rewards/JudgeFidelityReward/mean": 0.7469311654567719, "rewards/JudgeFidelityReward/std": 0.19251128286123276, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 2200 }, { "clip_ratio/high_max": 0.0007092287763953209, "clip_ratio/high_mean": 8.694830466993153e-05, "clip_ratio/low_mean": 8.25062015792355e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000169454503338784, "completions/clipped_ratio": 0.0, "completions/max_length": 241.33333333333334, "completions/mean_length": 75.34049479166667, "completions/min_length": 33.666666666666664, "epoch": 4.445564516129032, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.0625, "kl": 0.004239155538380146, "learning_rate": 3.32416516873924e-08, "loss": 0.00025222827680408954, "reward": 1.6143182118733723, "reward_std": 0.1867438703775406, "rewards/FidelityReward/mean": 0.7442978024482727, "rewards/FidelityReward/std": 0.2081207036972046, "rewards/JudgeFidelityReward/mean": 0.7426449855168661, "rewards/JudgeFidelityReward/std": 0.1982286125421524, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04162587722142538, "step": 2205 }, { "clip_ratio/high_max": 0.0014827301492914557, "clip_ratio/high_mean": 0.0002382981823757291, "clip_ratio/low_mean": 0.00015894701646175237, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039724520174786446, "completions/clipped_ratio": 0.0, "completions/max_length": 238.5, "completions/mean_length": 76.009765625, "completions/min_length": 34.5, "epoch": 4.455645161290323, "frac_reward_zero_std": 0.03125, "grad_norm": 2.015625, "kl": 0.004240183625370264, "learning_rate": 3.2056822874893254e-08, "loss": 0.00014703740598633885, "reward": 1.6334282159805298, "reward_std": 0.19114912301301956, "rewards/FidelityReward/mean": 0.7567564249038696, "rewards/FidelityReward/std": 0.20471206307411194, "rewards/JudgeFidelityReward/mean": 0.7552966773509979, "rewards/JudgeFidelityReward/std": 0.20550690591335297, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03121940791606903, "step": 2210 }, { "clip_ratio/high_max": 0.0006378347752615809, "clip_ratio/high_mean": 8.340949061675928e-05, "clip_ratio/low_mean": 3.6495349195320156e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011990483617410063, "completions/clipped_ratio": 0.0006510416666666666, "completions/max_length": 269.6666666666667, "completions/mean_length": 75.78450520833333, "completions/min_length": 31.666666666666668, "epoch": 4.465725806451613, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.03125, "kl": 0.004287008289247751, "learning_rate": 3.089279489372704e-08, "loss": 0.00010766780469566584, "reward": 1.599148432413737, "reward_std": 0.19755411644776663, "rewards/FidelityReward/mean": 0.7339858611424764, "rewards/FidelityReward/std": 0.21362081170082092, "rewards/JudgeFidelityReward/mean": 0.7355334560076395, "rewards/JudgeFidelityReward/std": 0.19936654965082803, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06835554415980975, "step": 2215 }, { "clip_ratio/high_max": 0.002355672512203455, "clip_ratio/high_mean": 0.00025619001826271415, "clip_ratio/low_mean": 0.0001098297827411443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036601980682462456, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/mean_length": 73.1953125, "completions/min_length": 34.0, "epoch": 4.475806451612903, "frac_reward_zero_std": 0.09375, "grad_norm": 1.9453125, "kl": 0.004253725055605173, "learning_rate": 2.9749619486820863e-08, "loss": 8.339879568666219e-05, "reward": 1.6128642559051514, "reward_std": 0.17836115509271622, "rewards/FidelityReward/mean": 0.7358356416225433, "rewards/FidelityReward/std": 0.20688496530056, "rewards/JudgeFidelityReward/mean": 0.7560103237628937, "rewards/JudgeFidelityReward/std": 0.18402817845344543, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 2220 }, { "clip_ratio/high_max": 0.001025556493550539, "clip_ratio/high_mean": 0.00015822966815903783, "clip_ratio/low_mean": 0.00011091290507465601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026914258487522604, "completions/clipped_ratio": 0.0006510416666666666, "completions/max_length": 269.0, "completions/mean_length": 77.61197916666667, "completions/min_length": 34.0, "epoch": 4.485887096774194, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 2.015625, "kl": 0.004385048802942037, "learning_rate": 2.8627347470171092e-08, "loss": 0.00023813406005501747, "reward": 1.5998478730519612, "reward_std": 0.19110600650310516, "rewards/FidelityReward/mean": 0.7294936180114746, "rewards/FidelityReward/std": 0.2235938956340154, "rewards/JudgeFidelityReward/mean": 0.7433125575383505, "rewards/JudgeFidelityReward/std": 0.21277333299318948, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2225 }, { "clip_ratio/high_max": 0.0019537480548024178, "clip_ratio/high_mean": 0.00021136534924153237, "clip_ratio/low_mean": 6.868287164252251e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002800482208840549, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/mean_length": 74.62890625, "completions/min_length": 34.5, "epoch": 4.495967741935484, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.21875, "kl": 0.004350726725533605, "learning_rate": 2.7526028730584684e-08, "loss": 0.0002185351215302944, "reward": 1.5947083234786987, "reward_std": 0.1871831864118576, "rewards/FidelityReward/mean": 0.7294973433017731, "rewards/FidelityReward/std": 0.2295611947774887, "rewards/JudgeFidelityReward/mean": 0.7333515286445618, "rewards/JudgeFidelityReward/std": 0.2136455848813057, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 2230 }, { "clip_ratio/high_max": 0.0014767630957067013, "clip_ratio/high_mean": 0.00016303436132147908, "clip_ratio/low_mean": 0.00010092736629303544, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002639617305248976, "completions/clipped_ratio": 0.0, "completions/max_length": 237.66666666666666, "completions/mean_length": 75.66927083333333, "completions/min_length": 33.333333333333336, "epoch": 4.506048387096774, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.09375, "kl": 0.004315158911049366, "learning_rate": 2.644571222346148e-08, "loss": 0.00019175302004441618, "reward": 1.5965760151545207, "reward_std": 0.1944337139527003, "rewards/FidelityReward/mean": 0.7325723171234131, "rewards/FidelityReward/std": 0.20997238655885062, "rewards/JudgeFidelityReward/mean": 0.7306114633878072, "rewards/JudgeFidelityReward/std": 0.1988206704457601, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.05027572065591812, "step": 2235 }, { "clip_ratio/high_max": 0.001280310214497149, "clip_ratio/high_mean": 0.00017045468848664314, "clip_ratio/low_mean": 0.00015457137196790428, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003250260546337813, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/mean_length": 72.7041015625, "completions/min_length": 32.0, "epoch": 4.516129032258064, "frac_reward_zero_std": 0.1328125, "grad_norm": 2.03125, "kl": 0.004383118636906147, "learning_rate": 2.538644597061812e-08, "loss": 0.0002081761136651039, "reward": 1.619686245918274, "reward_std": 0.17654020339250565, "rewards/FidelityReward/mean": 0.7455938756465912, "rewards/FidelityReward/std": 0.19855594635009766, "rewards/JudgeFidelityReward/mean": 0.7501379251480103, "rewards/JudgeFidelityReward/std": 0.1720166578888893, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 2240 }, { "clip_ratio/high_max": 0.0020561940968036652, "clip_ratio/high_mean": 0.0002226888056611642, "clip_ratio/low_mean": 9.149728866759687e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031418610014952717, "completions/clipped_ratio": 0.0, "completions/max_length": 196.66666666666666, "completions/mean_length": 75.89778645833333, "completions/min_length": 32.333333333333336, "epoch": 4.526209677419355, "frac_reward_zero_std": 0.09895833333333333, "grad_norm": 1.875, "kl": 0.004065915243700147, "learning_rate": 2.434827705815351e-08, "loss": 0.00014936974039301277, "reward": 1.613095998764038, "reward_std": 0.18028039733568826, "rewards/FidelityReward/mean": 0.7361675898234049, "rewards/FidelityReward/std": 0.20105977853139242, "rewards/JudgeFidelityReward/mean": 0.7558098236719767, "rewards/JudgeFidelityReward/std": 0.18296339611212412, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2245 }, { "clip_ratio/high_max": 0.0020087541546672583, "clip_ratio/high_mean": 0.00023651797091588377, "clip_ratio/low_mean": 0.00013156300701666623, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003680809633806348, "completions/clipped_ratio": 0.0, "completions/max_length": 221.5, "completions/mean_length": 78.03515625, "completions/min_length": 37.0, "epoch": 4.536290322580645, "frac_reward_zero_std": 0.046875, "grad_norm": 2.203125, "kl": 0.004105753963813186, "learning_rate": 2.3331251634355342e-08, "loss": 0.00016693724319338799, "reward": 1.6164546608924866, "reward_std": 0.20252878963947296, "rewards/FidelityReward/mean": 0.7429018616676331, "rewards/FidelityReward/std": 0.20266211032867432, "rewards/JudgeFidelityReward/mean": 0.7500351667404175, "rewards/JudgeFidelityReward/std": 0.18662116676568985, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.038198307156562805, "step": 2250 }, { "clip_ratio/high_max": 0.0009128889534622431, "clip_ratio/high_mean": 0.00011097460519522428, "clip_ratio/low_mean": 4.783051845151931e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015880512073636056, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/mean_length": 75.857421875, "completions/min_length": 35.333333333333336, "epoch": 4.546370967741936, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.9296875, "kl": 0.004373204242438078, "learning_rate": 2.2335414907649286e-08, "loss": 0.00023760725744068623, "reward": 1.637438416481018, "reward_std": 0.18659188350041708, "rewards/FidelityReward/mean": 0.7575337886810303, "rewards/FidelityReward/std": 0.18431042631467184, "rewards/JudgeFidelityReward/mean": 0.7643664677937826, "rewards/JudgeFidelityReward/std": 0.1662126580874125, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 2255 }, { "clip_ratio/high_max": 0.0020708392839878798, "clip_ratio/high_mean": 0.0003551461733877659, "clip_ratio/low_mean": 0.00010459931145305746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004597454913891852, "completions/clipped_ratio": 0.0, "completions/max_length": 193.5, "completions/mean_length": 76.90625, "completions/min_length": 31.5, "epoch": 4.556451612903226, "frac_reward_zero_std": 0.078125, "grad_norm": 2.046875, "kl": 0.004528816137462854, "learning_rate": 2.136081114458904e-08, "loss": 2.0352228602860123e-05, "reward": 1.6400362253189087, "reward_std": 0.17878013104200363, "rewards/FidelityReward/mean": 0.7637317478656769, "rewards/FidelityReward/std": 0.20104001462459564, "rewards/JudgeFidelityReward/mean": 0.7565151751041412, "rewards/JudgeFidelityReward/std": 0.19685690104961395, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 2260 }, { "clip_ratio/high_max": 0.001355842687189579, "clip_ratio/high_mean": 0.00019433769630268217, "clip_ratio/low_mean": 0.00014446853892877698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003388062468729913, "completions/clipped_ratio": 0.0, "completions/max_length": 241.33333333333334, "completions/mean_length": 76.01236979166667, "completions/min_length": 33.666666666666664, "epoch": 4.566532258064516, "frac_reward_zero_std": 0.046875, "grad_norm": 1.8046875, "kl": 0.004171302355825901, "learning_rate": 2.040748366788875e-08, "loss": 0.00022010919637978078, "reward": 1.6472783486048381, "reward_std": 0.18704660733540854, "rewards/FidelityReward/mean": 0.7623857458432516, "rewards/FidelityReward/std": 0.19127561648686728, "rewards/JudgeFidelityReward/mean": 0.7697852253913879, "rewards/JudgeFidelityReward/std": 0.1868008722861608, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0014125945977866649, "clip_ratio/high_mean": 0.00023243247997015715, "clip_ratio/low_mean": 8.514464861946181e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031757712713442743, "completions/clipped_ratio": 0.0, "completions/max_length": 264.5, "completions/mean_length": 78.3642578125, "completions/min_length": 37.5, "epoch": 4.576612903225806, "frac_reward_zero_std": 0.0625, "grad_norm": 2.015625, "kl": 0.004375748801976442, "learning_rate": 1.947547485449713e-08, "loss": 0.00030665365047752855, "reward": 1.5900461077690125, "reward_std": 0.19176383316516876, "rewards/FidelityReward/mean": 0.7265447974205017, "rewards/FidelityReward/std": 0.2238108068704605, "rewards/JudgeFidelityReward/mean": 0.7328619956970215, "rewards/JudgeFidelityReward/std": 0.21990548074245453, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2270 }, { "clip_ratio/high_max": 0.0011282311752438545, "clip_ratio/high_mean": 0.000153614382725209, "clip_ratio/low_mean": 6.437032716348768e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021798470988869666, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/mean_length": 76.48697916666667, "completions/min_length": 30.0, "epoch": 4.586693548387097, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.140625, "kl": 0.004465873539447785, "learning_rate": 1.856482613371402e-08, "loss": 0.00012709706788882613, "reward": 1.6428823868433635, "reward_std": 0.18609345455964407, "rewards/FidelityReward/mean": 0.7629103263219198, "rewards/FidelityReward/std": 0.20940678815046945, "rewards/JudgeFidelityReward/mean": 0.7625481287638346, "rewards/JudgeFidelityReward/std": 0.21457533041636148, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2275 }, { "clip_ratio/high_max": 0.0016940467758104205, "clip_ratio/high_mean": 0.00019985766266472638, "clip_ratio/low_mean": 7.909320629551075e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000278950878418982, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/mean_length": 75.931640625, "completions/min_length": 32.0, "epoch": 4.596774193548387, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.03125, "kl": 0.004327153973281384, "learning_rate": 1.76755779853483e-08, "loss": 0.00033680172637104986, "reward": 1.6001319289207458, "reward_std": 0.19268349558115005, "rewards/FidelityReward/mean": 0.7303123474121094, "rewards/FidelityReward/std": 0.2231368124485016, "rewards/JudgeFidelityReward/mean": 0.7454985082149506, "rewards/JudgeFidelityReward/std": 0.21542859077453613, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2280 }, { "clip_ratio/high_max": 0.0014884352218359708, "clip_ratio/high_mean": 0.00016601643292233348, "clip_ratio/low_mean": 7.188309682533145e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023789952974766492, "completions/clipped_ratio": 0.0, "completions/max_length": 192.33333333333334, "completions/mean_length": 75.169921875, "completions/min_length": 32.666666666666664, "epoch": 4.606854838709677, "frac_reward_zero_std": 0.07291666666666667, "grad_norm": 1.9609375, "kl": 0.004320419020950794, "learning_rate": 1.6807769937919046e-08, "loss": 0.0002051998395472765, "reward": 1.6552503108978271, "reward_std": 0.1814823398987452, "rewards/FidelityReward/mean": 0.7733110984166464, "rewards/FidelityReward/std": 0.19477767248948416, "rewards/JudgeFidelityReward/mean": 0.7684357166290283, "rewards/JudgeFidelityReward/std": 0.18943549692630768, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 2285 }, { "clip_ratio/high_max": 0.0014814682537689805, "clip_ratio/high_mean": 0.0001414547848980874, "clip_ratio/low_mean": 0.00010897787287831307, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025043265195563437, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/mean_length": 76.626953125, "completions/min_length": 32.0, "epoch": 4.616935483870968, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.1875, "kl": 0.004327710997313261, "learning_rate": 1.596144056689791e-08, "loss": 0.00021397953387349843, "reward": 1.6589984893798828, "reward_std": 0.17462214827537537, "rewards/FidelityReward/mean": 0.7703548669815063, "rewards/FidelityReward/std": 0.1918085440993309, "rewards/JudgeFidelityReward/mean": 0.7811935245990753, "rewards/JudgeFidelityReward/std": 0.17164857685565948, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 2290 }, { "clip_ratio/high_max": 0.0009948495076969265, "clip_ratio/high_mean": 8.406160632148385e-05, "clip_ratio/low_mean": 4.06974446377717e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012475905241444708, "completions/clipped_ratio": 0.0, "completions/max_length": 226.33333333333334, "completions/mean_length": 76.27669270833333, "completions/min_length": 32.333333333333336, "epoch": 4.627016129032258, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.046875, "kl": 0.004406649153679609, "learning_rate": 1.513662749299477e-08, "loss": 0.0002239431720227003, "reward": 1.6389744679133098, "reward_std": 0.1839041362206141, "rewards/FidelityReward/mean": 0.7586462497711182, "rewards/FidelityReward/std": 0.20202181239922842, "rewards/JudgeFidelityReward/mean": 0.7619583606719971, "rewards/JudgeFidelityReward/std": 0.18017958601315817, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 2295 }, { "clip_ratio/high_max": 0.0017785275587812067, "clip_ratio/high_mean": 0.00020701128523796796, "clip_ratio/low_mean": 0.000130300474120304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033731176517903805, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/mean_length": 75.548828125, "completions/min_length": 35.5, "epoch": 4.637096774193548, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.09375, "kl": 0.004434118513017893, "learning_rate": 1.4333367380485462e-08, "loss": 0.00023481659591197967, "reward": 1.6361274123191833, "reward_std": 0.19280442595481873, "rewards/FidelityReward/mean": 0.7554672360420227, "rewards/FidelityReward/std": 0.20224828273057938, "rewards/JudgeFidelityReward/mean": 0.7652265131473541, "rewards/JudgeFidelityReward/std": 0.17919837683439255, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 2300 }, { "clip_ratio/high_max": 0.0007390826707705855, "clip_ratio/high_mean": 0.00013030710979364813, "clip_ratio/low_mean": 2.9990216717123985e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016029732651077211, "completions/clipped_ratio": 0.0, "completions/max_length": 216.33333333333334, "completions/mean_length": 75.30989583333333, "completions/min_length": 33.666666666666664, "epoch": 4.647177419354839, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.9296875, "kl": 0.004197925142943859, "learning_rate": 1.3551695935581308e-08, "loss": 0.00013447920791804792, "reward": 1.6061625878016155, "reward_std": 0.18858807782332102, "rewards/FidelityReward/mean": 0.7341494957605997, "rewards/FidelityReward/std": 0.21020828684171042, "rewards/JudgeFidelityReward/mean": 0.745979368686676, "rewards/JudgeFidelityReward/std": 0.1964809646209081, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.04419417306780815, "step": 2305 }, { "clip_ratio/high_max": 0.0020024793688207866, "clip_ratio/high_mean": 0.00025525290984660385, "clip_ratio/low_mean": 0.0001672366459388286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042248956160619857, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/mean_length": 77.48046875, "completions/min_length": 34.0, "epoch": 4.657258064516129, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9375, "kl": 0.0041974842548370365, "learning_rate": 1.2791647904842995e-08, "loss": 0.00013363875914365053, "reward": 1.651712417602539, "reward_std": 0.1866108924150467, "rewards/FidelityReward/mean": 0.7682011127471924, "rewards/FidelityReward/std": 0.19033675640821457, "rewards/JudgeFidelityReward/mean": 0.771905392408371, "rewards/JudgeFidelityReward/std": 0.18393758684396744, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.06616134010255337, "step": 2310 }, { "clip_ratio/high_max": 0.0012991152703762054, "clip_ratio/high_mean": 0.00017195246764458715, "clip_ratio/low_mean": 6.483250035671517e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023678496945649384, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/mean_length": 76.93294270833333, "completions/min_length": 35.333333333333336, "epoch": 4.667338709677419, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.09375, "kl": 0.0044135615229606625, "learning_rate": 1.2053257073635193e-08, "loss": 0.00022810180671513082, "reward": 1.630345304807027, "reward_std": 0.19021835923194885, "rewards/FidelityReward/mean": 0.752817968527476, "rewards/FidelityReward/std": 0.20695790151755014, "rewards/JudgeFidelityReward/mean": 0.7570077180862427, "rewards/JudgeFidelityReward/std": 0.18890860676765442, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2315 }, { "clip_ratio/high_max": 0.0014722449937835335, "clip_ratio/high_mean": 0.00021218244801275433, "clip_ratio/low_mean": 0.00013564629189204424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034782873699441553, "completions/clipped_ratio": 0.0, "completions/max_length": 221.5, "completions/mean_length": 79.8603515625, "completions/min_length": 32.5, "epoch": 4.67741935483871, "frac_reward_zero_std": 0.046875, "grad_norm": 2.078125, "kl": 0.004309109505265951, "learning_rate": 1.1336556264624875e-08, "loss": 0.0001530010486021638, "reward": 1.5767950415611267, "reward_std": 0.21875117719173431, "rewards/FidelityReward/mean": 0.7081640362739563, "rewards/FidelityReward/std": 0.20678365230560303, "rewards/JudgeFidelityReward/mean": 0.7431213557720184, "rewards/JudgeFidelityReward/std": 0.17875193804502487, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07131390832364559, "step": 2320 }, { "clip_ratio/high_max": 0.0016717263963073492, "clip_ratio/high_mean": 0.00016972116427496075, "clip_ratio/low_mean": 9.3088170979172e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026280934689566494, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/mean_length": 75.49609375, "completions/min_length": 36.0, "epoch": 4.6875, "frac_reward_zero_std": 0.08333333333333333, "grad_norm": 2.0, "kl": 0.004491163417696953, "learning_rate": 1.0641577336322761e-08, "loss": 0.00018220789497718214, "reward": 1.593193769454956, "reward_std": 0.19060435891151428, "rewards/FidelityReward/mean": 0.7293131550153097, "rewards/FidelityReward/std": 0.22093738118807474, "rewards/JudgeFidelityReward/mean": 0.7316675384839376, "rewards/JudgeFidelityReward/std": 0.1948830783367157, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.05018910765647888, "step": 2325 }, { "clip_ratio/high_max": 0.002151038427837193, "clip_ratio/high_mean": 0.00020199672144372016, "clip_ratio/low_mean": 0.000163198885275051, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003651956096291542, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 79.5595703125, "completions/min_length": 35.5, "epoch": 4.69758064516129, "frac_reward_zero_std": 0.109375, "grad_norm": 2.171875, "kl": 0.00420228224247694, "learning_rate": 9.968351181666557e-09, "loss": 0.00019895939622074365, "reward": 1.6082355380058289, "reward_std": 0.18930794298648834, "rewards/FidelityReward/mean": 0.7376376986503601, "rewards/FidelityReward/std": 0.20312724262475967, "rewards/JudgeFidelityReward/mean": 0.7421721816062927, "rewards/JudgeFidelityReward/std": 0.2052415832877159, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2330 }, { "clip_ratio/high_max": 0.0011690782383084297, "clip_ratio/high_mean": 0.0001702671404927969, "clip_ratio/low_mean": 6.40546357317362e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023432178422808648, "completions/clipped_ratio": 0.0, "completions/max_length": 224.33333333333334, "completions/mean_length": 74.96875, "completions/min_length": 33.666666666666664, "epoch": 4.707661290322581, "frac_reward_zero_std": 0.020833333333333332, "grad_norm": 2.234375, "kl": 0.004781534988433122, "learning_rate": 9.31690772664806e-09, "loss": 0.00017382041551172733, "reward": 1.6246421337127686, "reward_std": 0.19713342189788818, "rewards/FidelityReward/mean": 0.7463943958282471, "rewards/FidelityReward/std": 0.2148935149113337, "rewards/JudgeFidelityReward/mean": 0.7604016860326132, "rewards/JudgeFidelityReward/std": 0.19950532913208008, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.061009867737690605, "step": 2335 }, { "clip_ratio/high_max": 0.0016743190120905639, "clip_ratio/high_mean": 0.00014980045525589958, "clip_ratio/low_mean": 9.520688618067653e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024500733707100153, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 74.1064453125, "completions/min_length": 33.0, "epoch": 4.717741935483871, "frac_reward_zero_std": 0.078125, "grad_norm": 2.203125, "kl": 0.0043690381571650505, "learning_rate": 8.68727592898294e-09, "loss": 0.00015586577355861664, "reward": 1.6097332239151, "reward_std": 0.18431120365858078, "rewards/FidelityReward/mean": 0.7337581515312195, "rewards/FidelityReward/std": 0.20460832118988037, "rewards/JudgeFidelityReward/mean": 0.7529265582561493, "rewards/JudgeFidelityReward/std": 0.1770670861005783, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2340 }, { "clip_ratio/high_max": 0.0009755150880664587, "clip_ratio/high_mean": 0.00016934353625401853, "clip_ratio/low_mean": 6.115229480201379e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023049584124237298, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/mean_length": 76.3125, "completions/min_length": 33.666666666666664, "epoch": 4.727822580645161, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 1.890625, "kl": 0.004602654837071896, "learning_rate": 8.07948377682327e-09, "loss": 0.00011206798953935503, "reward": 1.6513455708821614, "reward_std": 0.17764364182949066, "rewards/FidelityReward/mean": 0.76289830605189, "rewards/FidelityReward/std": 0.19525042672952017, "rewards/JudgeFidelityReward/mean": 0.7781965533892313, "rewards/JudgeFidelityReward/std": 0.17565601567427316, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 2345 }, { "clip_ratio/high_max": 0.0017435807501897215, "clip_ratio/high_mean": 0.00017549296899233013, "clip_ratio/low_mean": 8.46717637614347e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026016474585048853, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/mean_length": 75.9150390625, "completions/min_length": 35.5, "epoch": 4.737903225806452, "frac_reward_zero_std": 0.0859375, "grad_norm": 2.109375, "kl": 0.004497794527560472, "learning_rate": 7.49355828751358e-09, "loss": 6.560995825566351e-05, "reward": 1.6511902809143066, "reward_std": 0.18442977964878082, "rewards/FidelityReward/mean": 0.7662737667560577, "rewards/FidelityReward/std": 0.1973242238163948, "rewards/JudgeFidelityReward/mean": 0.7727627158164978, "rewards/JudgeFidelityReward/std": 0.18665000051259995, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 2350 }, { "clip_ratio/high_max": 0.0010156714357435702, "clip_ratio/high_mean": 0.00015728749567642809, "clip_ratio/low_mean": 0.00013903040962759405, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029631787911057474, "completions/clipped_ratio": 0.0, "completions/max_length": 247.66666666666666, "completions/mean_length": 75.8359375, "completions/min_length": 33.666666666666664, "epoch": 4.747983870967742, "frac_reward_zero_std": 0.046875, "grad_norm": 2.0625, "kl": 0.004325374495238066, "learning_rate": 6.929525506389921e-09, "loss": 0.00028171730227768423, "reward": 1.6456997791926067, "reward_std": 0.18517610927422842, "rewards/FidelityReward/mean": 0.7595421075820923, "rewards/FidelityReward/std": 0.205393115679423, "rewards/JudgeFidelityReward/mean": 0.7749193708101908, "rewards/JudgeFidelityReward/std": 0.19507992764314017, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2355 }, { "clip_ratio/high_max": 0.0014997193589806556, "clip_ratio/high_mean": 0.00021629080874845384, "clip_ratio/low_mean": 0.00012160041369497776, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003378912224434316, "completions/clipped_ratio": 0.0, "completions/max_length": 204.5, "completions/mean_length": 76.859375, "completions/min_length": 32.0, "epoch": 4.758064516129032, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.9375, "kl": 0.004387686401605606, "learning_rate": 6.387410505621915e-09, "loss": 0.00026064906269311904, "reward": 1.6387258768081665, "reward_std": 0.1796923652291298, "rewards/FidelityReward/mean": 0.7603088021278381, "rewards/FidelityReward/std": 0.21032831072807312, "rewards/JudgeFidelityReward/mean": 0.7597638368606567, "rewards/JudgeFidelityReward/std": 0.19319875538349152, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 2360 }, { "clip_ratio/high_max": 0.0012709290254861116, "clip_ratio/high_mean": 0.00017171052750200033, "clip_ratio/low_mean": 9.929779625963419e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027100832667201755, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/mean_length": 77.837890625, "completions/min_length": 32.333333333333336, "epoch": 4.768145161290323, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.9375, "kl": 0.004282903578132391, "learning_rate": 5.867237383098467e-09, "loss": 5.3752341773360965e-05, "reward": 1.599855860074361, "reward_std": 0.19079516331354776, "rewards/FidelityReward/mean": 0.7315008242925009, "rewards/FidelityReward/std": 0.21965560813744864, "rewards/JudgeFidelityReward/mean": 0.7393141388893127, "rewards/JudgeFidelityReward/std": 0.20752171675364176, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2365 }, { "clip_ratio/high_max": 0.0018653654493391513, "clip_ratio/high_mean": 0.00021519108850043267, "clip_ratio/low_mean": 0.00014589533675462006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036108642816543577, "completions/clipped_ratio": 0.0, "completions/max_length": 204.5, "completions/mean_length": 75.1748046875, "completions/min_length": 32.0, "epoch": 4.778225806451613, "frac_reward_zero_std": 0.0703125, "grad_norm": 2.0, "kl": 0.004190378822386265, "learning_rate": 5.369029261356572e-09, "loss": 0.00015569038223475217, "reward": 1.634135901927948, "reward_std": 0.18972192704677582, "rewards/FidelityReward/mean": 0.760304182767868, "rewards/FidelityReward/std": 0.1917603239417076, "rewards/JudgeFidelityReward/mean": 0.7476633787155151, "rewards/JudgeFidelityReward/std": 0.1899239346385002, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0010479755699634552, "clip_ratio/high_mean": 0.0001240205019712448, "clip_ratio/low_mean": 6.436861149268225e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018838911782950162, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/mean_length": 75.89192708333333, "completions/min_length": 35.333333333333336, "epoch": 4.788306451612903, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.046875, "kl": 0.004282257054001093, "learning_rate": 4.8928082865532515e-09, "loss": 0.0001454728073440492, "reward": 1.6120548248291016, "reward_std": 0.1934799700975418, "rewards/FidelityReward/mean": 0.7374627192815145, "rewards/FidelityReward/std": 0.19727643330891928, "rewards/JudgeFidelityReward/mean": 0.7517883578936259, "rewards/JudgeFidelityReward/std": 0.182780921459198, "rewards/SelfEvolvingFormatReward/mean": 0.9973958333333334, "rewards/SelfEvolvingFormatReward/std": 0.04019692912697792, "step": 2375 }, { "clip_ratio/high_max": 0.001317855645902455, "clip_ratio/high_mean": 0.00013799481675960123, "clip_ratio/low_mean": 7.960090151755139e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021759571391157805, "completions/clipped_ratio": 0.0, "completions/max_length": 232.5, "completions/mean_length": 75.9501953125, "completions/min_length": 35.0, "epoch": 4.798387096774194, "frac_reward_zero_std": 0.1171875, "grad_norm": 1.9765625, "kl": 0.00434931656345725, "learning_rate": 4.4385956274813894e-09, "loss": 0.00024683596566319463, "reward": 1.6237576007843018, "reward_std": 0.18129397183656693, "rewards/FidelityReward/mean": 0.744288295507431, "rewards/FidelityReward/std": 0.20443285256624222, "rewards/JudgeFidelityReward/mean": 0.7628448903560638, "rewards/JudgeFidelityReward/std": 0.18960031867027283, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06029539369046688, "step": 2380 }, { "clip_ratio/high_max": 0.001188496220856905, "clip_ratio/high_mean": 0.00014709955430589617, "clip_ratio/low_mean": 8.537976973457261e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002324793313164264, "completions/clipped_ratio": 0.0, "completions/max_length": 201.33333333333334, "completions/mean_length": 75.197265625, "completions/min_length": 33.333333333333336, "epoch": 4.808467741935484, "frac_reward_zero_std": 0.09375, "grad_norm": 2.015625, "kl": 0.00422343285754323, "learning_rate": 4.0064114746284905e-09, "loss": 0.00025968013796955346, "reward": 1.6123360395431519, "reward_std": 0.18576247493426004, "rewards/FidelityReward/mean": 0.7401071786880493, "rewards/FidelityReward/std": 0.21258878211180368, "rewards/JudgeFidelityReward/mean": 0.7444577018419901, "rewards/JudgeFidelityReward/std": 0.2061573714017868, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.001531186094507575, "clip_ratio/high_mean": 0.00016586291312705725, "clip_ratio/low_mean": 8.791466025286354e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025377756683155896, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 76.474609375, "completions/min_length": 31.0, "epoch": 4.818548387096774, "frac_reward_zero_std": 0.125, "grad_norm": 1.7890625, "kl": 0.004080559406429529, "learning_rate": 3.596275039279506e-09, "loss": 0.00016211812617257239, "reward": 1.6138678789138794, "reward_std": 0.1944422423839569, "rewards/FidelityReward/mean": 0.7428773641586304, "rewards/FidelityReward/std": 0.19657447189092636, "rewards/JudgeFidelityReward/mean": 0.747840404510498, "rewards/JudgeFidelityReward/std": 0.19124583154916763, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2390 }, { "clip_ratio/high_max": 0.0007165079237893224, "clip_ratio/high_mean": 0.00012816747766919434, "clip_ratio/low_mean": 5.980970163363963e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018797717057168485, "completions/clipped_ratio": 0.0, "completions/max_length": 250.66666666666666, "completions/mean_length": 74.68619791666667, "completions/min_length": 32.0, "epoch": 4.828629032258064, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 2.09375, "kl": 0.004355143662542105, "learning_rate": 3.208204552662519e-09, "loss": 0.00025244494900107385, "reward": 1.604028304417928, "reward_std": 0.1846901128689448, "rewards/FidelityReward/mean": 0.7342956860860189, "rewards/FidelityReward/std": 0.22565807898839316, "rewards/JudgeFidelityReward/mean": 0.744673470656077, "rewards/JudgeFidelityReward/std": 0.21100319921970367, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06957309817274411, "step": 2395 }, { "clip_ratio/high_max": 0.0017940256744623183, "clip_ratio/high_mean": 0.00019254732469562442, "clip_ratio/low_mean": 3.945597927668132e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023200330033432693, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/mean_length": 75.7197265625, "completions/min_length": 36.0, "epoch": 4.838709677419355, "frac_reward_zero_std": 0.078125, "grad_norm": 2.0625, "kl": 0.004433056153357029, "learning_rate": 2.8422172651385026e-09, "loss": 0.00020380469504743814, "reward": 1.6415361166000366, "reward_std": 0.17551253736019135, "rewards/FidelityReward/mean": 0.7574577629566193, "rewards/FidelityReward/std": 0.1849243864417076, "rewards/JudgeFidelityReward/mean": 0.7691330015659332, "rewards/JudgeFidelityReward/std": 0.16631601005792618, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2400 }, { "clip_ratio/high_max": 0.0006440841592848301, "clip_ratio/high_mean": 5.8944476768374444e-05, "clip_ratio/low_mean": 7.618969539180398e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013513417216017842, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/mean_length": 75.89322916666667, "completions/min_length": 32.0, "epoch": 4.848790322580645, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 2.296875, "kl": 0.0043453630991280075, "learning_rate": 2.498329445434544e-09, "loss": 0.0001855379668995738, "reward": 1.5511852502822876, "reward_std": 0.20404345790545145, "rewards/FidelityReward/mean": 0.693851113319397, "rewards/FidelityReward/std": 0.21975439290205637, "rewards/JudgeFidelityReward/mean": 0.7146682937939962, "rewards/JudgeFidelityReward/std": 0.2062630554040273, "rewards/SelfEvolvingFormatReward/mean": 1.0, "rewards/SelfEvolvingFormatReward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0011894766241312027, "clip_ratio/high_mean": 0.00012938133149873466, "clip_ratio/low_mean": 0.0001321155345067382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002614968630950898, "completions/clipped_ratio": 0.0, "completions/max_length": 197.5, "completions/mean_length": 73.95703125, "completions/min_length": 33.5, "epoch": 4.858870967741936, "frac_reward_zero_std": 0.046875, "grad_norm": 1.9765625, "kl": 0.0046053314581513405, "learning_rate": 2.176556379920702e-09, "loss": 0.00022013685666024684, "reward": 1.5785198211669922, "reward_std": 0.20686772465705872, "rewards/FidelityReward/mean": 0.7135146856307983, "rewards/FidelityReward/std": 0.2260010689496994, "rewards/JudgeFidelityReward/mean": 0.7339166104793549, "rewards/JudgeFidelityReward/std": 0.21056392043828964, "rewards/SelfEvolvingFormatReward/mean": 0.99609375, "rewards/SelfEvolvingFormatReward/std": 0.06243881583213806, "step": 2410 }, { "clip_ratio/high_max": 0.0007867559092119336, "clip_ratio/high_mean": 0.0001110290118958801, "clip_ratio/low_mean": 1.0486577230039984e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012151558767072856, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/mean_length": 77.04036458333333, "completions/min_length": 33.0, "epoch": 4.868951612903226, "frac_reward_zero_std": 0.057291666666666664, "grad_norm": 2.296875, "kl": 0.004133446794003248, "learning_rate": 1.8769123719302726e-09, "loss": 0.00016773580573499204, "reward": 1.593920111656189, "reward_std": 0.20117040475209555, "rewards/FidelityReward/mean": 0.7252888282140096, "rewards/FidelityReward/std": 0.20616135994593301, "rewards/JudgeFidelityReward/mean": 0.742470920085907, "rewards/JudgeFidelityReward/std": 0.18283976117769876, "rewards/SelfEvolvingFormatReward/mean": 0.9947916666666666, "rewards/SelfEvolvingFormatReward/std": 0.06835554415980975, "step": 2415 }, { "clip_ratio/high_max": 0.0013847316382452845, "clip_ratio/high_mean": 9.51838563196361e-05, "clip_ratio/low_mean": 3.09613453282509e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001261451980099082, "completions/clipped_ratio": 0.0, "completions/max_length": 226.5, "completions/mean_length": 75.2900390625, "completions/min_length": 30.5, "epoch": 4.879032258064516, "frac_reward_zero_std": 0.046875, "grad_norm": 2.390625, "kl": 0.004399135615676642, "learning_rate": 1.5994107411242964e-09, "loss": 0.0002650600392371416, "reward": 1.5900483131408691, "reward_std": 0.2003740593791008, "rewards/FidelityReward/mean": 0.7257078886032104, "rewards/FidelityReward/std": 0.2234923243522644, "rewards/JudgeFidelityReward/mean": 0.7335637509822845, "rewards/JudgeFidelityReward/std": 0.21049996465444565, "rewards/SelfEvolvingFormatReward/mean": 0.9951171875, "rewards/SelfEvolvingFormatReward/std": 0.049216821789741516, "step": 2420 }, { "clip_ratio/high_max": 0.0006964124273508787, "clip_ratio/high_mean": 5.05051895743236e-05, "clip_ratio/low_mean": 2.3620996944373474e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.412618724629283e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 203.66666666666666, "completions/mean_length": 75.71940104166667, "completions/min_length": 33.666666666666664, "epoch": 4.889112903225806, "frac_reward_zero_std": 0.06770833333333333, "grad_norm": 1.9375, "kl": 0.00448099235072732, "learning_rate": 1.3440638228991997e-09, "loss": 0.0002101005055010319, "reward": 1.6452195247014363, "reward_std": 0.18876604239145914, "rewards/FidelityReward/mean": 0.7646592656771342, "rewards/FidelityReward/std": 0.19928972919782004, "rewards/JudgeFidelityReward/mean": 0.7656776905059814, "rewards/JudgeFidelityReward/std": 0.18809018532435098, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.06566246723135312, "step": 2425 }, { "clip_ratio/high_max": 0.0009131880942732096, "clip_ratio/high_mean": 2.85371279460378e-05, "clip_ratio/low_mean": 8.668516238685698e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.7205644184723495e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 213.5, "completions/mean_length": 75.5986328125, "completions/min_length": 34.0, "epoch": 4.899193548387097, "frac_reward_zero_std": 0.0703125, "grad_norm": 1.984375, "kl": 0.0046795177273452285, "learning_rate": 1.1108829678385667e-09, "loss": 0.00010627892334014177, "reward": 1.6026061177253723, "reward_std": 0.19315538555383682, "rewards/FidelityReward/mean": 0.7327402830123901, "rewards/FidelityReward/std": 0.21432962268590927, "rewards/JudgeFidelityReward/mean": 0.7465676665306091, "rewards/JudgeFidelityReward/std": 0.19050489366054535, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.0822625607252121, "step": 2430 }, { "clip_ratio/high_max": 0.0003424657508730888, "clip_ratio/high_mean": 1.0702054714784026e-05, "clip_ratio/low_mean": 1.0738831770140678e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.1440886484924704e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/mean_length": 75.212890625, "completions/min_length": 33.333333333333336, "epoch": 4.909274193548387, "frac_reward_zero_std": 0.08854166666666667, "grad_norm": 1.984375, "kl": 0.004355764016509056, "learning_rate": 8.998785412088206e-10, "loss": 0.0001895488705486059, "reward": 1.6084417502085369, "reward_std": 0.1866721361875534, "rewards/FidelityReward/mean": 0.7386779387791952, "rewards/FidelityReward/std": 0.20450087388356528, "rewards/JudgeFidelityReward/mean": 0.7414806485176086, "rewards/JudgeFidelityReward/std": 0.18697253863016763, "rewards/SelfEvolvingFormatReward/mean": 0.998046875, "rewards/SelfEvolvingFormatReward/std": 0.03554432963331541, "step": 2435 }, { "clip_ratio/high_max": 0.00037037036381661894, "clip_ratio/high_mean": 2.674533170647919e-05, "clip_ratio/low_mean": 2.171610831283033e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.846144001930952e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 198.5, "completions/mean_length": 74.8779296875, "completions/min_length": 34.5, "epoch": 4.919354838709677, "frac_reward_zero_std": 0.0546875, "grad_norm": 2.203125, "kl": 0.004174936423078179, "learning_rate": 7.110599224980362e-10, "loss": 0.00014485751744359733, "reward": 1.6163172721862793, "reward_std": 0.19877901673316956, "rewards/FidelityReward/mean": 0.7462747991085052, "rewards/FidelityReward/std": 0.186854787170887, "rewards/JudgeFidelityReward/mean": 0.7410616278648376, "rewards/JudgeFidelityReward/std": 0.1788829118013382, "rewards/SelfEvolvingFormatReward/mean": 0.9990234375, "rewards/SelfEvolvingFormatReward/std": 0.022097086533904076, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.33333333333334, "completions/mean_length": 78.71744791666667, "completions/min_length": 34.333333333333336, "epoch": 4.929435483870968, "frac_reward_zero_std": 0.041666666666666664, "grad_norm": 2.140625, "kl": 0.004241044074296952, "learning_rate": 5.444355049992744e-10, "loss": 0.00014818599447607994, "reward": 1.610734502474467, "reward_std": 0.19336114823818207, "rewards/FidelityReward/mean": 0.7380477587381998, "rewards/FidelityReward/std": 0.20875958104928335, "rewards/JudgeFidelityReward/mean": 0.7466755708058676, "rewards/JudgeFidelityReward/std": 0.20327018201351166, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.029462782045205433, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.5, "completions/mean_length": 76.1357421875, "completions/min_length": 33.0, "epoch": 4.939516129032258, "frac_reward_zero_std": 0.046875, "grad_norm": 2.09375, "kl": 0.004566581919789314, "learning_rate": 4.000126954376015e-10, "loss": 0.00017638460267335176, "reward": 1.6268584728240967, "reward_std": 0.20315121114253998, "rewards/FidelityReward/mean": 0.7499594390392303, "rewards/FidelityReward/std": 0.20100069791078568, "rewards/JudgeFidelityReward/mean": 0.760634034872055, "rewards/JudgeFidelityReward/std": 0.17895179241895676, "rewards/SelfEvolvingFormatReward/mean": 0.9931640625, "rewards/SelfEvolvingFormatReward/std": 0.07595821656286716, "step": 2450 }, { "clip_ratio/high_max": 0.000303951371461153, "clip_ratio/high_mean": 9.498480358161032e-06, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.498480358161032e-06, "completions/clipped_ratio": 0.0, "completions/max_length": 209.66666666666666, "completions/mean_length": 76.98502604166667, "completions/min_length": 34.0, "epoch": 4.949596774193548, "frac_reward_zero_std": 0.0625, "grad_norm": 2.0, "kl": 0.004282972496002913, "learning_rate": 2.777979136404651e-10, "loss": 0.0001630153739824891, "reward": 1.6168125073115032, "reward_std": 0.19205083946386972, "rewards/FidelityReward/mean": 0.7422618468602499, "rewards/FidelityReward/std": 0.20786737898985544, "rewards/JudgeFidelityReward/mean": 0.753658652305603, "rewards/JudgeFidelityReward/std": 0.1867198795080185, "rewards/SelfEvolvingFormatReward/mean": 0.9954427083333334, "rewards/SelfEvolvingFormatReward/std": 0.0649204986790816, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/mean_length": 75.8427734375, "completions/min_length": 33.5, "epoch": 4.959677419354839, "frac_reward_zero_std": 0.09375, "grad_norm": 2.203125, "kl": 0.004555711802095175, "learning_rate": 1.7779659225269917e-10, "loss": 0.00019516663160175086, "reward": 1.5944036841392517, "reward_std": 0.19459538161754608, "rewards/FidelityReward/mean": 0.728424072265625, "rewards/FidelityReward/std": 0.20912756770849228, "rewards/JudgeFidelityReward/mean": 0.7407482266426086, "rewards/JudgeFidelityReward/std": 0.1873146966099739, "rewards/SelfEvolvingFormatReward/mean": 0.9912109375, "rewards/SelfEvolvingFormatReward/std": 0.09328107535839081, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.33333333333334, "completions/mean_length": 75.83723958333333, "completions/min_length": 35.0, "epoch": 4.969758064516129, "frac_reward_zero_std": 0.052083333333333336, "grad_norm": 2.109375, "kl": 0.004362636711448431, "learning_rate": 1.0001317649488417e-10, "loss": 0.00017080270918086172, "reward": 1.6078285376230876, "reward_std": 0.18867049117883047, "rewards/FidelityReward/mean": 0.7413418491681417, "rewards/FidelityReward/std": 0.22476021945476532, "rewards/JudgeFidelityReward/mean": 0.7362286249796549, "rewards/JudgeFidelityReward/std": 0.21837759017944336, "rewards/SelfEvolvingFormatReward/mean": 0.9967447916666666, "rewards/SelfEvolvingFormatReward/std": 0.054928320149580635, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/mean_length": 75.625, "completions/min_length": 34.5, "epoch": 4.979838709677419, "frac_reward_zero_std": 0.1171875, "grad_norm": 1.8203125, "kl": 0.004382760263979435, "learning_rate": 4.4451123965616456e-11, "loss": 0.00017577477265149355, "reward": 1.599212110042572, "reward_std": 0.19179576635360718, "rewards/FidelityReward/mean": 0.7317268252372742, "rewards/FidelityReward/std": 0.19482120871543884, "rewards/JudgeFidelityReward/mean": 0.7379002273082733, "rewards/JudgeFidelityReward/std": 0.19028466194868088, "rewards/SelfEvolvingFormatReward/mean": 0.9970703125, "rewards/SelfEvolvingFormatReward/std": 0.053316494449973106, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.33333333333334, "completions/mean_length": 75.255859375, "completions/min_length": 32.0, "epoch": 4.98991935483871, "frac_reward_zero_std": 0.13541666666666666, "grad_norm": 1.8515625, "kl": 0.00423154141753912, "learning_rate": 1.1112904488019826e-11, "loss": 0.00017009219154715539, "reward": 1.6435070435206096, "reward_std": 0.16975254317124686, "rewards/FidelityReward/mean": 0.7644589145978292, "rewards/FidelityReward/std": 0.1976737231016159, "rewards/JudgeFidelityReward/mean": 0.7593983809153239, "rewards/JudgeFidelityReward/std": 0.1788502832253774, "rewards/SelfEvolvingFormatReward/mean": 0.9986979166666666, "rewards/SelfEvolvingFormatReward/std": 0.02081293861071269, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.5, "completions/mean_length": 78.8271484375, "completions/min_length": 33.0, "epoch": 5.0, "frac_reward_zero_std": 0.03125, "grad_norm": 1.90625, "kl": 0.0043757672421634196, "learning_rate": 0.0, "loss": 0.00017648208886384965, "reward": 1.6436462998390198, "reward_std": 0.18628622591495514, "rewards/FidelityReward/mean": 0.7631798088550568, "rewards/FidelityReward/std": 0.22186345607042313, "rewards/JudgeFidelityReward/mean": 0.7667922973632812, "rewards/JudgeFidelityReward/std": 0.21321173012256622, "rewards/SelfEvolvingFormatReward/mean": 0.994140625, "rewards/SelfEvolvingFormatReward/std": 0.07528366148471832, "step": 2480 } ], "logging_steps": 5, "max_steps": 2480, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }