{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 455.8125, "epoch": 0.008, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0618, "reward": 4.689006567001343, "reward_std": 1.78610560297966, "rewards/mrr_reward": 0.2938988097012043, "rewards/rank_analyze_format_reward": 0.11466514505445957, "rewards/rank_answer_foramt_reward": 0.501953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 486.734375, "epoch": 0.016, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": 0.0063, "reward": 3.9557588696479797, "reward_std": 1.5732559561729431, "rewards/mrr_reward": 0.169766865670681, "rewards/rank_analyze_format_reward": 0.07681952975690365, "rewards/rank_answer_foramt_reward": 0.36328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9807952791452408, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9807952791452408, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 441.203125, "epoch": 0.024, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0565, "reward": 4.4016576409339905, "reward_std": 1.6944840550422668, "rewards/mrr_reward": 0.2554253488779068, "rewards/rank_analyze_format_reward": 0.11000172607600689, "rewards/rank_answer_foramt_reward": 0.4140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 473.03125, "epoch": 0.032, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0361, "reward": 5.027822136878967, "reward_std": 1.9866893887519836, "rewards/mrr_reward": 0.3529265820980072, "rewards/rank_analyze_format_reward": 0.15089312940835953, "rewards/rank_answer_foramt_reward": 0.5546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998236283659935, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.998236283659935, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 488.078125, "epoch": 0.04, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0246, "reward": 4.68894362449646, "reward_std": 1.7762902677059174, "rewards/mrr_reward": 0.3031250014901161, "rewards/rank_analyze_format_reward": 0.2197269294410944, "rewards/rank_answer_foramt_reward": 0.474609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9808974117040634, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.9808974117040634, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 476.78125, "epoch": 0.048, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0422, "reward": 4.237152338027954, "reward_std": 1.5055316388607025, "rewards/mrr_reward": 0.19459325820207596, "rewards/rank_analyze_format_reward": 0.14743656385689974, "rewards/rank_answer_foramt_reward": 0.46875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9994212985038757, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9837962985038757, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 456.6875, "epoch": 0.056, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.023, "reward": 4.391666889190674, "reward_std": 1.7478849291801453, "rewards/mrr_reward": 0.27126736007630825, "rewards/rank_analyze_format_reward": 0.13924695551395416, "rewards/rank_answer_foramt_reward": 0.4375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977376908063889, "rewards/rank_overall_format_reward_more": 0.78125, "rewards/rank_verify_format_reward": 0.9508626908063889, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 438.890625, "epoch": 0.064, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0161, "reward": 4.737706661224365, "reward_std": 1.8800698816776276, "rewards/mrr_reward": 0.328125, "rewards/rank_analyze_format_reward": 0.08576204627752304, "rewards/rank_answer_foramt_reward": 0.484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997436136007309, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 440.546875, "epoch": 0.072, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.026, "reward": 4.544053554534912, "reward_std": 2.140646994113922, "rewards/mrr_reward": 0.3108258917927742, "rewards/rank_analyze_format_reward": 0.09371883049607277, "rewards/rank_answer_foramt_reward": 0.44140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8125, "rewards/rank_verify_format_reward": 0.953125, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 486.078125, "epoch": 0.08, "grad_norm": 0.02036167122423649, "kl": 0.0, "learning_rate": 1.9999999684172664e-05, "loss": -0.0462, "reward": 4.728065490722656, "reward_std": 1.9379011690616608, "rewards/mrr_reward": 0.3036644458770752, "rewards/rank_analyze_format_reward": 0.2416149042546749, "rewards/rank_answer_foramt_reward": 0.4921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.828125, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 478.84375, "epoch": 0.088, "grad_norm": 0.02036167122423649, "kl": -5.602836608886719e-06, "learning_rate": 1.9999999684172664e-05, "loss": -0.0299, "reward": 4.586392045021057, "reward_std": 1.808391511440277, "rewards/mrr_reward": 0.26946303993463516, "rewards/rank_analyze_format_reward": 0.178153439424932, "rewards/rank_answer_foramt_reward": 0.52734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 455.078125, "epoch": 0.096, "grad_norm": 0.020598648115992546, "kl": -6.273388862609863e-06, "learning_rate": 1.9999998736690666e-05, "loss": -0.019, "reward": 4.161486208438873, "reward_std": 1.7841115891933441, "rewards/mrr_reward": 0.21319444477558136, "rewards/rank_analyze_format_reward": 0.09823539853096008, "rewards/rank_answer_foramt_reward": 0.396484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968380630016327, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9812130630016327, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 459.578125, "epoch": 0.104, "grad_norm": 0.021508827805519104, "kl": -4.723668098449707e-06, "learning_rate": 1.999999715755407e-05, "loss": -0.0384, "reward": 4.695295810699463, "reward_std": 1.5369611978530884, "rewards/mrr_reward": 0.3081597238779068, "rewards/rank_analyze_format_reward": 0.11086451821029186, "rewards/rank_answer_foramt_reward": 0.48046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 464.015625, "epoch": 0.112, "grad_norm": 0.022063156589865685, "kl": -5.081295967102051e-06, "learning_rate": 1.9999994946762974e-05, "loss": -0.0454, "reward": 4.200581610202789, "reward_std": 1.8469471633434296, "rewards/mrr_reward": 0.20451389625668526, "rewards/rank_analyze_format_reward": 0.16768221091479063, "rewards/rank_answer_foramt_reward": 0.41015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.984375, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 452.5, "epoch": 0.12, "grad_norm": 0.022208023816347122, "kl": -4.26173210144043e-06, "learning_rate": 1.999999210431752e-05, "loss": -0.0243, "reward": 4.085702300071716, "reward_std": 1.512882336974144, "rewards/mrr_reward": 0.17906746454536915, "rewards/rank_analyze_format_reward": 0.14385094121098518, "rewards/rank_answer_foramt_reward": 0.4375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9956032931804657, "rewards/rank_overall_format_reward_more": 0.796875, "rewards/rank_verify_format_reward": 0.9956032931804657, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 467.296875, "epoch": 0.128, "grad_norm": 0.020399712026119232, "kl": -4.0084123611450195e-06, "learning_rate": 1.9999988630217885e-05, "loss": -0.0316, "reward": 5.109304070472717, "reward_std": 1.985443890094757, "rewards/mrr_reward": 0.3867187425494194, "rewards/rank_analyze_format_reward": 0.17149577103555202, "rewards/rank_answer_foramt_reward": 0.580078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8125, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 473.90625, "epoch": 0.136, "grad_norm": 0.02226601168513298, "kl": -2.7865171432495117e-06, "learning_rate": 1.999998452446429e-05, "loss": -0.032, "reward": 4.289996266365051, "reward_std": 1.757462590932846, "rewards/mrr_reward": 0.2313119969330728, "rewards/rank_analyze_format_reward": 0.15154925920069218, "rewards/rank_answer_foramt_reward": 0.44140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 486.328125, "epoch": 0.144, "grad_norm": 0.02119840867817402, "kl": -2.086162567138672e-07, "learning_rate": 1.9999979787056998e-05, "loss": -0.0259, "reward": 4.4557565450668335, "reward_std": 1.1966679394245148, "rewards/mrr_reward": 0.22187501564621925, "rewards/rank_analyze_format_reward": 0.1767062321305275, "rewards/rank_answer_foramt_reward": 0.521484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975329041481018, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9975329041481018, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 446.53125, "epoch": 0.152, "grad_norm": 0.02239903435111046, "kl": -2.2351741790771484e-07, "learning_rate": 1.9999974417996303e-05, "loss": -0.0161, "reward": 4.088248610496521, "reward_std": 1.54827019572258, "rewards/mrr_reward": 0.18916791677474976, "rewards/rank_analyze_format_reward": 0.09350559022277594, "rewards/rank_answer_foramt_reward": 0.42578125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9959887713193893, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9803637713193893, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 467.40625, "epoch": 0.16, "grad_norm": 0.02158363349735737, "kl": 2.995133399963379e-06, "learning_rate": 1.9999968417282542e-05, "loss": -0.0394, "reward": 5.011839747428894, "reward_std": 1.7887286245822906, "rewards/mrr_reward": 0.35902776941657066, "rewards/rank_analyze_format_reward": 0.1226036436855793, "rewards/rank_answer_foramt_reward": 0.59375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 1.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 512.265625, "epoch": 0.168, "grad_norm": 0.019902769476175308, "kl": 2.086162567138672e-06, "learning_rate": 1.99999617849161e-05, "loss": -0.007, "reward": 4.999041318893433, "reward_std": 2.092874825000763, "rewards/mrr_reward": 0.33280009776353836, "rewards/rank_analyze_format_reward": 0.3474135100841522, "rewards/rank_answer_foramt_reward": 0.564453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9678308814764023, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 462.078125, "epoch": 0.176, "grad_norm": 0.0211955476552248, "kl": 6.16908073425293e-06, "learning_rate": 1.9999954520897394e-05, "loss": 0.0067, "reward": 4.904757022857666, "reward_std": 1.5794726610183716, "rewards/mrr_reward": 0.35468750447034836, "rewards/rank_analyze_format_reward": 0.09507373627275229, "rewards/rank_answer_foramt_reward": 0.611328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8125, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 466.828125, "epoch": 0.184, "grad_norm": 0.020976468920707703, "kl": 6.943941116333008e-06, "learning_rate": 1.999994662522688e-05, "loss": -0.0219, "reward": 5.450310587882996, "reward_std": 1.9311817586421967, "rewards/mrr_reward": 0.44487228244543076, "rewards/rank_analyze_format_reward": 0.20164816547185183, "rewards/rank_answer_foramt_reward": 0.66796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9670085161924362, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 467.34375, "epoch": 0.192, "grad_norm": 0.0208530742675066, "kl": 1.093745231628418e-05, "learning_rate": 1.9999938097905064e-05, "loss": -0.0345, "reward": 4.764381527900696, "reward_std": 1.8450036644935608, "rewards/mrr_reward": 0.31850818172097206, "rewards/rank_analyze_format_reward": 0.1051585366949439, "rewards/rank_answer_foramt_reward": 0.546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9972826093435287, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9816576093435287, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 477.484375, "epoch": 0.2, "grad_norm": 0.02045305259525776, "kl": 1.1593103408813477e-05, "learning_rate": 1.9999928938932473e-05, "loss": -0.0176, "reward": 4.7958372831344604, "reward_std": 1.7617928981781006, "rewards/mrr_reward": 0.2876054085791111, "rewards/rank_analyze_format_reward": 0.28027439024299383, "rewards/rank_answer_foramt_reward": 0.48828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9970238208770752, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9970238208770752, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 445.140625, "epoch": 0.208, "grad_norm": 0.02045305259525776, "kl": 1.919269561767578e-05, "learning_rate": 1.9999928938932473e-05, "loss": -0.002, "reward": 4.298715710639954, "reward_std": 1.676234632730484, "rewards/mrr_reward": 0.2303757481276989, "rewards/rank_analyze_format_reward": 0.11518567334860563, "rewards/rank_answer_foramt_reward": 0.458984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9679276347160339, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 483.71875, "epoch": 0.216, "grad_norm": 0.02045305259525776, "kl": 1.0028481483459473e-05, "learning_rate": 1.9999928938932473e-05, "loss": -0.0122, "reward": 4.261886656284332, "reward_std": 1.7420227527618408, "rewards/mrr_reward": 0.19882812350988388, "rewards/rank_analyze_format_reward": 0.19866678677499294, "rewards/rank_answer_foramt_reward": 0.44921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9837500005960464, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9990011900663376, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 494.4375, "epoch": 0.224, "grad_norm": 0.020556651055812836, "kl": 1.0758638381958008e-05, "learning_rate": 1.99999191483097e-05, "loss": -0.0292, "reward": 4.516226172447205, "reward_std": 1.9960070848464966, "rewards/mrr_reward": 0.28723958507180214, "rewards/rank_analyze_format_reward": 0.14337314292788506, "rewards/rank_answer_foramt_reward": 0.470703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9820645451545715, "rewards/rank_overall_format_reward_more": 0.7890625, "rewards/rank_verify_format_reward": 0.9820645451545715, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 477.453125, "epoch": 0.232, "grad_norm": 0.019716233015060425, "kl": 1.9982457160949707e-05, "learning_rate": 1.999990872603735e-05, "loss": -0.017, "reward": 4.805420398712158, "reward_std": 1.657298356294632, "rewards/mrr_reward": 0.3268229216337204, "rewards/rank_analyze_format_reward": 0.14590902999043465, "rewards/rank_answer_foramt_reward": 0.490234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9934926480054855, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9934926480054855, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 492.375, "epoch": 0.24, "grad_norm": 0.02322639897465706, "kl": 1.965463161468506e-05, "learning_rate": 1.999989767211609e-05, "loss": -0.0386, "reward": 4.979418992996216, "reward_std": 1.6339992135763168, "rewards/mrr_reward": 0.3164062537252903, "rewards/rank_analyze_format_reward": 0.31370767019689083, "rewards/rank_answer_foramt_reward": 0.51953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9988712668418884, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9988712668418884, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 464.875, "epoch": 0.248, "grad_norm": 0.020231744274497032, "kl": 2.7373433113098145e-05, "learning_rate": 1.9999885986546613e-05, "loss": -0.0448, "reward": 4.7086580991744995, "reward_std": 1.7371686697006226, "rewards/mrr_reward": 0.2849392406642437, "rewards/rank_analyze_format_reward": 0.16265114955604076, "rewards/rank_answer_foramt_reward": 0.4921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.984375, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 487.484375, "epoch": 0.256, "grad_norm": 0.01962853968143463, "kl": 3.975629806518555e-05, "learning_rate": 1.999987366932966e-05, "loss": -0.0411, "reward": 4.679190993309021, "reward_std": 1.5342676639556885, "rewards/mrr_reward": 0.27297867834568024, "rewards/rank_analyze_format_reward": 0.16313984990119934, "rewards/rank_answer_foramt_reward": 0.56640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 454.515625, "epoch": 0.264, "grad_norm": 0.02267865277826786, "kl": 3.771483898162842e-05, "learning_rate": 1.9999860720466007e-05, "loss": -0.0034, "reward": 4.1931135058403015, "reward_std": 1.5233525335788727, "rewards/mrr_reward": 0.19470486417412758, "rewards/rank_analyze_format_reward": 0.10926186013966799, "rewards/rank_answer_foramt_reward": 0.443359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9972426444292068, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.9972426444292068, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 461.953125, "epoch": 0.272, "grad_norm": 0.02176724746823311, "kl": 5.410611629486084e-05, "learning_rate": 1.9999847139956477e-05, "loss": -0.0314, "reward": 4.550845384597778, "reward_std": 1.958255022764206, "rewards/mrr_reward": 0.30027903243899345, "rewards/rank_analyze_format_reward": 0.0568907568231225, "rewards/rank_answer_foramt_reward": 0.529296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9677083343267441, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.9520833343267441, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 474.65625, "epoch": 0.28, "grad_norm": 0.0221868809312582, "kl": 5.4582953453063965e-05, "learning_rate": 1.9999832927801922e-05, "loss": -0.0057, "reward": 4.710769176483154, "reward_std": 1.6857908964157104, "rewards/mrr_reward": 0.3105034828186035, "rewards/rank_analyze_format_reward": 0.1985221654176712, "rewards/rank_answer_foramt_reward": 0.544921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9798430800437927, "rewards/rank_overall_format_reward_more": 0.78125, "rewards/rank_verify_format_reward": 0.9642180800437927, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 496.78125, "epoch": 0.288, "grad_norm": 0.02121078222990036, "kl": 6.13182783126831e-05, "learning_rate": 1.9999818084003243e-05, "loss": -0.0368, "reward": 5.009979605674744, "reward_std": 1.9656108021736145, "rewards/mrr_reward": 0.32831721380352974, "rewards/rank_analyze_format_reward": 0.24603652395308018, "rewards/rank_answer_foramt_reward": 0.58984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9811964929103851, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9811964929103851, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 489.40625, "epoch": 0.296, "grad_norm": 0.022762347012758255, "kl": 7.359683513641357e-05, "learning_rate": 1.999980260856137e-05, "loss": 0.0164, "reward": 4.261849403381348, "reward_std": 1.6020236611366272, "rewards/mrr_reward": 0.20416666939854622, "rewards/rank_analyze_format_reward": 0.16004161350429058, "rewards/rank_answer_foramt_reward": 0.4375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9980392158031464, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9980392158031464, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 473.59375, "epoch": 0.304, "grad_norm": 0.02300061471760273, "kl": 6.565451622009277e-05, "learning_rate": 1.9999786501477298e-05, "loss": -0.0407, "reward": 4.600297033786774, "reward_std": 1.597813993692398, "rewards/mrr_reward": 0.2838975712656975, "rewards/rank_analyze_format_reward": 0.10455834865570068, "rewards/rank_answer_foramt_reward": 0.50390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 468.828125, "epoch": 0.312, "grad_norm": 0.022656837478280067, "kl": 9.85860824584961e-05, "learning_rate": 1.9999769762752024e-05, "loss": -0.0421, "reward": 4.96368944644928, "reward_std": 1.8781414777040482, "rewards/mrr_reward": 0.33848586305975914, "rewards/rank_analyze_format_reward": 0.16822483576834202, "rewards/rank_answer_foramt_reward": 0.654296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 503.1875, "epoch": 0.32, "grad_norm": 0.023156002163887024, "kl": 0.0001109689474105835, "learning_rate": 1.999975239238662e-05, "loss": -0.0188, "reward": 5.189586162567139, "reward_std": 2.028193384408951, "rewards/mrr_reward": 0.36250000447034836, "rewards/rank_analyze_format_reward": 0.3480729628354311, "rewards/rank_answer_foramt_reward": 0.560546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.981889471411705, "rewards/rank_overall_format_reward_more": 0.8671875, "rewards/rank_verify_format_reward": 0.981889471411705, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 461.5625, "epoch": 0.328, "grad_norm": 0.021381191909313202, "kl": 0.00012908875942230225, "learning_rate": 1.999973439038218e-05, "loss": -0.0281, "reward": 4.959184765815735, "reward_std": 2.1065359711647034, "rewards/mrr_reward": 0.36927083879709244, "rewards/rank_analyze_format_reward": 0.1428538914769888, "rewards/rank_answer_foramt_reward": 0.48828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9918892979621887, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9762642979621887, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 490.0, "epoch": 0.336, "grad_norm": 0.02237197570502758, "kl": 0.00011831521987915039, "learning_rate": 1.9999715756739833e-05, "loss": -0.0379, "reward": 4.825831055641174, "reward_std": 1.8668445944786072, "rewards/mrr_reward": 0.3253224194049835, "rewards/rank_analyze_format_reward": 0.163555265404284, "rewards/rank_answer_foramt_reward": 0.568359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9900633096694946, "rewards/rank_overall_format_reward_more": 0.828125, "rewards/rank_verify_format_reward": 0.9744383096694946, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 473.34375, "epoch": 0.344, "grad_norm": 0.022024238482117653, "kl": 0.00014644861221313477, "learning_rate": 1.9999696491460764e-05, "loss": -0.0215, "reward": 4.890589237213135, "reward_std": 1.6738486886024475, "rewards/mrr_reward": 0.3286830335855484, "rewards/rank_analyze_format_reward": 0.15786650124937296, "rewards/rank_answer_foramt_reward": 0.56640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9961046874523163, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9961046874523163, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 467.234375, "epoch": 0.352, "grad_norm": 0.02275724522769451, "kl": 0.00016573071479797363, "learning_rate": 1.9999676594546187e-05, "loss": -0.0215, "reward": 5.033377289772034, "reward_std": 1.8407581448554993, "rewards/mrr_reward": 0.3557477742433548, "rewards/rank_analyze_format_reward": 0.14718732610344887, "rewards/rank_answer_foramt_reward": 0.59765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 487.625, "epoch": 0.36, "grad_norm": 0.023730719462037086, "kl": 0.00015407800674438477, "learning_rate": 1.999965606599736e-05, "loss": -0.0031, "reward": 5.316616773605347, "reward_std": 1.5850826501846313, "rewards/mrr_reward": 0.4290550574660301, "rewards/rank_analyze_format_reward": 0.08148389589041471, "rewards/rank_answer_foramt_reward": 0.697265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9928547292947769, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9772297292947769, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 509.9375, "epoch": 0.368, "grad_norm": 0.021739846095442772, "kl": 0.00018548965454101562, "learning_rate": 1.999963490581558e-05, "loss": -0.0254, "reward": 5.217623829841614, "reward_std": 1.4084790647029877, "rewards/mrr_reward": 0.33927951753139496, "rewards/rank_analyze_format_reward": 0.3564212815836072, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998135969042778, "rewards/rank_overall_format_reward_more": 0.84375, "rewards/rank_verify_format_reward": 0.966885969042778, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 473.734375, "epoch": 0.376, "grad_norm": 0.023394783958792686, "kl": 0.00021630525588989258, "learning_rate": 1.9999613114002184e-05, "loss": -0.0309, "reward": 4.08813738822937, "reward_std": 1.2790243327617645, "rewards/mrr_reward": 0.14723462983965874, "rewards/rank_analyze_format_reward": 0.15144313033670187, "rewards/rank_answer_foramt_reward": 0.431640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9971200972795486, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9971200972795486, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 472.421875, "epoch": 0.384, "grad_norm": 0.027028290554881096, "kl": 0.00026175379753112793, "learning_rate": 1.9999590690558545e-05, "loss": -0.054, "reward": 5.350240349769592, "reward_std": 1.9697438478469849, "rewards/mrr_reward": 0.42695312947034836, "rewards/rank_analyze_format_reward": 0.21482349652796984, "rewards/rank_answer_foramt_reward": 0.625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9989583343267441, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.9677083343267441, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 476.34375, "epoch": 0.392, "grad_norm": 0.021585488691926003, "kl": 0.0002930760383605957, "learning_rate": 1.9999567635486086e-05, "loss": -0.0243, "reward": 4.152051568031311, "reward_std": 1.6824184954166412, "rewards/mrr_reward": 0.18816964142024517, "rewards/rank_analyze_format_reward": 0.12541021592915058, "rewards/rank_answer_foramt_reward": 0.39453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983095824718475, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9983095824718475, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 483.484375, "epoch": 0.4, "grad_norm": 0.022128406912088394, "kl": 0.00023129582405090332, "learning_rate": 1.9999543948786258e-05, "loss": -0.0018, "reward": 4.990848183631897, "reward_std": 1.9261715412139893, "rewards/mrr_reward": 0.3342633992433548, "rewards/rank_analyze_format_reward": 0.1260274900123477, "rewards/rank_answer_foramt_reward": 0.609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 492.078125, "epoch": 0.408, "grad_norm": 0.023543158546090126, "kl": 0.0002911984920501709, "learning_rate": 1.9999519630460554e-05, "loss": -0.0076, "reward": 5.144826769828796, "reward_std": 1.6632727682590485, "rewards/mrr_reward": 0.3661458343267441, "rewards/rank_analyze_format_reward": 0.16852473467588425, "rewards/rank_answer_foramt_reward": 0.59765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 1.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 466.515625, "epoch": 0.416, "grad_norm": 0.024417538195848465, "kl": 0.0004246234893798828, "learning_rate": 1.999949468051052e-05, "loss": -0.0313, "reward": 5.0145174860954285, "reward_std": 1.8828826546669006, "rewards/mrr_reward": 0.38802083767950535, "rewards/rank_analyze_format_reward": 0.10110596101731062, "rewards/rank_answer_foramt_reward": 0.556640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8359375, "rewards/rank_verify_format_reward": 0.96875, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 494.75, "epoch": 0.424, "grad_norm": 0.024848150089383125, "kl": 0.0002892911434173584, "learning_rate": 1.9999469098937726e-05, "loss": -0.0361, "reward": 4.832870543003082, "reward_std": 1.565253883600235, "rewards/mrr_reward": 0.2958891298621893, "rewards/rank_analyze_format_reward": 0.1942360121756792, "rewards/rank_answer_foramt_reward": 0.611328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.984375, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 504.5625, "epoch": 0.432, "grad_norm": 0.02211805246770382, "kl": 0.00029155611991882324, "learning_rate": 1.9999442885743785e-05, "loss": -0.016, "reward": 4.681830644607544, "reward_std": 1.6615483164787292, "rewards/mrr_reward": 0.28389756940305233, "rewards/rank_analyze_format_reward": 0.1718399478122592, "rewards/rank_answer_foramt_reward": 0.568359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9811454266309738, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9655204266309738, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 471.765625, "epoch": 0.44, "grad_norm": 0.02444814145565033, "kl": 0.0004519224166870117, "learning_rate": 1.9999416040930354e-05, "loss": -0.0462, "reward": 5.167219042778015, "reward_std": 1.9449047446250916, "rewards/mrr_reward": 0.3921875059604645, "rewards/rank_analyze_format_reward": 0.1719050519168377, "rewards/rank_answer_foramt_reward": 0.513671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9955085963010788, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9955085963010788, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 501.21875, "epoch": 0.448, "grad_norm": 0.024404721334576607, "kl": 0.00047457218170166016, "learning_rate": 1.9999388564499135e-05, "loss": -0.047, "reward": 5.111963272094727, "reward_std": 1.9699311256408691, "rewards/mrr_reward": 0.340104166418314, "rewards/rank_analyze_format_reward": 0.30795731022953987, "rewards/rank_answer_foramt_reward": 0.650390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9669117629528046, "rewards/rank_overall_format_reward_more": 0.859375, "rewards/rank_verify_format_reward": 0.9669117629528046, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 481.421875, "epoch": 0.456, "grad_norm": 0.024884849786758423, "kl": 0.0005426406860351562, "learning_rate": 1.999936045645186e-05, "loss": -0.0116, "reward": 4.459952890872955, "reward_std": 1.6162844747304916, "rewards/mrr_reward": 0.24435143917798996, "rewards/rank_analyze_format_reward": 0.10262943152338266, "rewards/rank_answer_foramt_reward": 0.5234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985526353120804, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9673026353120804, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 476.625, "epoch": 0.464, "grad_norm": 0.02534506469964981, "kl": 0.0007425546646118164, "learning_rate": 1.9999331716790303e-05, "loss": -0.0169, "reward": 4.837222576141357, "reward_std": 1.9827671647071838, "rewards/mrr_reward": 0.33585068956017494, "rewards/rank_analyze_format_reward": 0.20996354706585407, "rewards/rank_answer_foramt_reward": 0.470703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9807952791452408, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9807952791452408, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 530.046875, "epoch": 0.472, "grad_norm": 0.022839965298771858, "kl": 0.0004405379295349121, "learning_rate": 1.9999302345516278e-05, "loss": -0.0295, "reward": 5.279780864715576, "reward_std": 1.9629344046115875, "rewards/mrr_reward": 0.36336806416511536, "rewards/rank_analyze_format_reward": 0.2832249477505684, "rewards/rank_answer_foramt_reward": 0.654296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 516.65625, "epoch": 0.48, "grad_norm": 0.0263381227850914, "kl": 0.0005091428756713867, "learning_rate": 1.9999272342631644e-05, "loss": -0.0381, "reward": 6.471034526824951, "reward_std": 1.9417240023612976, "rewards/mrr_reward": 0.6197172403335571, "rewards/rank_analyze_format_reward": 0.26364994794130325, "rewards/rank_answer_foramt_reward": 0.791015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 1.0, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 480.421875, "epoch": 0.488, "grad_norm": 0.02566557377576828, "kl": 0.0005975961685180664, "learning_rate": 1.9999241708138296e-05, "loss": -0.0056, "reward": 5.077809810638428, "reward_std": 1.307851292192936, "rewards/mrr_reward": 0.35500991344451904, "rewards/rank_analyze_format_reward": 0.10372397117316723, "rewards/rank_answer_foramt_reward": 0.6328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9957729876041412, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9957729876041412, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 472.875, "epoch": 0.496, "grad_norm": 0.027827711775898933, "kl": 0.000952601432800293, "learning_rate": 1.9999210442038164e-05, "loss": -0.0339, "reward": 4.869051575660706, "reward_std": 1.8942435383796692, "rewards/mrr_reward": 0.3203125074505806, "rewards/rank_analyze_format_reward": 0.16581160761415958, "rewards/rank_answer_foramt_reward": 0.548828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9678308814764023, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 485.609375, "epoch": 0.504, "grad_norm": 0.024270422756671906, "kl": 0.000729680061340332, "learning_rate": 1.9999178544333228e-05, "loss": 0.0064, "reward": 5.877958178520203, "reward_std": 1.8244962692260742, "rewards/mrr_reward": 0.5174479112029076, "rewards/rank_analyze_format_reward": 0.19235198944807053, "rewards/rank_answer_foramt_reward": 0.736328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 515.46875, "epoch": 0.512, "grad_norm": 0.022133484482765198, "kl": 0.0008175373077392578, "learning_rate": 1.9999146015025503e-05, "loss": 0.0092, "reward": 5.555278539657593, "reward_std": 1.9869469702243805, "rewards/mrr_reward": 0.45848215371370316, "rewards/rank_analyze_format_reward": 0.22101733088493347, "rewards/rank_answer_foramt_reward": 0.666015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9913771450519562, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.9601271450519562, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 511.625, "epoch": 0.52, "grad_norm": 0.024551959708333015, "kl": 0.0007832050323486328, "learning_rate": 1.999911285411704e-05, "loss": -0.0049, "reward": 5.41889089345932, "reward_std": 1.9643912464380264, "rewards/mrr_reward": 0.43281250074505806, "rewards/rank_analyze_format_reward": 0.2138027586042881, "rewards/rank_answer_foramt_reward": 0.62890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9927783608436584, "rewards/rank_overall_format_reward_more": 0.875, "rewards/rank_verify_format_reward": 0.9771533608436584, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 528.5, "epoch": 0.528, "grad_norm": 0.02290545031428337, "kl": 0.0008490085601806641, "learning_rate": 1.9999079061609933e-05, "loss": -0.021, "reward": 4.910151720046997, "reward_std": 1.064635694026947, "rewards/mrr_reward": 0.2832217253744602, "rewards/rank_analyze_format_reward": 0.2712905704975128, "rewards/rank_answer_foramt_reward": 0.5859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 514.546875, "epoch": 0.536, "grad_norm": 0.024759415537118912, "kl": 0.0009107589721679688, "learning_rate": 1.999904463750632e-05, "loss": 0.0076, "reward": 4.854610323905945, "reward_std": 1.8393707275390625, "rewards/mrr_reward": 0.30915798619389534, "rewards/rank_analyze_format_reward": 0.23711884673684835, "rewards/rank_answer_foramt_reward": 0.576171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.8203125, "rewards/rank_verify_format_reward": 0.984375, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 499.859375, "epoch": 0.544, "grad_norm": 0.024759415537118912, "kl": 0.0008776187896728516, "learning_rate": 1.999904463750632e-05, "loss": -0.0246, "reward": 5.42217218875885, "reward_std": 1.3200950622558594, "rewards/mrr_reward": 0.42010788805782795, "rewards/rank_analyze_format_reward": 0.19172357022762299, "rewards/rank_answer_foramt_reward": 0.654296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.998641312122345, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.983016312122345, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 496.140625, "epoch": 0.552, "grad_norm": 0.02841918356716633, "kl": 0.0010838508605957031, "learning_rate": 1.999900958180838e-05, "loss": -0.0281, "reward": 5.81439483165741, "reward_std": 1.740799367427826, "rewards/mrr_reward": 0.5312500074505806, "rewards/rank_analyze_format_reward": 0.1757229631766677, "rewards/rank_answer_foramt_reward": 0.677734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.8828125, "rewards/rank_verify_format_reward": 0.96875, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 466.078125, "epoch": 0.56, "grad_norm": 0.02891196869313717, "kl": 0.001157999038696289, "learning_rate": 1.9998973894518318e-05, "loss": -0.0123, "reward": 5.705892205238342, "reward_std": 2.0529025495052338, "rewards/mrr_reward": 0.4973958432674408, "rewards/rank_analyze_format_reward": 0.15696396678686142, "rewards/rank_answer_foramt_reward": 0.638671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9993990361690521, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9993990361690521, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 492.296875, "epoch": 0.568, "grad_norm": 0.024859309196472168, "kl": 0.0010205507278442383, "learning_rate": 1.999893757563839e-05, "loss": 0.0114, "reward": 5.464065313339233, "reward_std": 1.7677285969257355, "rewards/mrr_reward": 0.446893610060215, "rewards/rank_analyze_format_reward": 0.10556165501475334, "rewards/rank_answer_foramt_reward": 0.68359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 531.109375, "epoch": 0.576, "grad_norm": 0.026754125952720642, "kl": 0.001056671142578125, "learning_rate": 1.9998900625170897e-05, "loss": -0.0067, "reward": 6.407280087471008, "reward_std": 1.8228637278079987, "rewards/mrr_reward": 0.5859375298023224, "rewards/rank_analyze_format_reward": 0.30571743845939636, "rewards/rank_answer_foramt_reward": 0.828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 1.0, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 508.484375, "epoch": 0.584, "grad_norm": 0.028836144134402275, "kl": 0.00142669677734375, "learning_rate": 1.9998863043118163e-05, "loss": -0.0076, "reward": 4.505983591079712, "reward_std": 1.231943815946579, "rewards/mrr_reward": 0.2062872126698494, "rewards/rank_analyze_format_reward": 0.20817857421934605, "rewards/rank_answer_foramt_reward": 0.56640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.984375, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 511.453125, "epoch": 0.592, "grad_norm": 0.025052759796380997, "kl": 0.0013051033020019531, "learning_rate": 1.999882482948257e-05, "loss": -0.0097, "reward": 5.300284147262573, "reward_std": 1.6650860607624054, "rewards/mrr_reward": 0.38593750447034836, "rewards/rank_analyze_format_reward": 0.15106541197746992, "rewards/rank_answer_foramt_reward": 0.66015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 504.625, "epoch": 0.6, "grad_norm": 0.026283830404281616, "kl": 0.0021805763244628906, "learning_rate": 1.999878598426653e-05, "loss": -0.0317, "reward": 5.158125400543213, "reward_std": 1.4547997415065765, "rewards/mrr_reward": 0.3474578373134136, "rewards/rank_analyze_format_reward": 0.2175126150250435, "rewards/rank_answer_foramt_reward": 0.58203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 516.265625, "epoch": 0.608, "grad_norm": 0.027127476409077644, "kl": 0.001390218734741211, "learning_rate": 1.9998746507472493e-05, "loss": -0.0426, "reward": 5.807446002960205, "reward_std": 1.929233893752098, "rewards/mrr_reward": 0.4895833358168602, "rewards/rank_analyze_format_reward": 0.3203737363219261, "rewards/rank_answer_foramt_reward": 0.689453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9821428656578064, "rewards/rank_overall_format_reward_more": 0.890625, "rewards/rank_verify_format_reward": 0.9665178656578064, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 515.015625, "epoch": 0.616, "grad_norm": 0.026926733553409576, "kl": 0.001764059066772461, "learning_rate": 1.999870639910296e-05, "loss": -0.0223, "reward": 5.370245575904846, "reward_std": 1.9943826496601105, "rewards/mrr_reward": 0.3968749977648258, "rewards/rank_analyze_format_reward": 0.2607038579881191, "rewards/rank_answer_foramt_reward": 0.607421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9924661070108414, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9924661070108414, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 481.015625, "epoch": 0.624, "grad_norm": 0.027938006445765495, "kl": 0.0017654895782470703, "learning_rate": 1.9998665659160453e-05, "loss": -0.0188, "reward": 5.412413477897644, "reward_std": 1.896736979484558, "rewards/mrr_reward": 0.41354167461395264, "rewards/rank_analyze_format_reward": 0.24247420020401478, "rewards/rank_answer_foramt_reward": 0.63671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9942144006490707, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9785894006490707, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 499.15625, "epoch": 0.632, "grad_norm": 0.024667983874678612, "kl": 0.0013856887817382812, "learning_rate": 1.999862428764756e-05, "loss": -0.0076, "reward": 6.024145722389221, "reward_std": 1.524814635515213, "rewards/mrr_reward": 0.5302269533276558, "rewards/rank_analyze_format_reward": 0.23562652617692947, "rewards/rank_answer_foramt_reward": 0.794921875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9680059552192688, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 501.484375, "epoch": 0.64, "grad_norm": 0.028410576283931732, "kl": 0.0016107559204101562, "learning_rate": 1.9998582284566878e-05, "loss": 0.0072, "reward": 5.220240831375122, "reward_std": 1.5586610436439514, "rewards/mrr_reward": 0.35975322872400284, "rewards/rank_analyze_format_reward": 0.2044668523594737, "rewards/rank_answer_foramt_reward": 0.640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9954117387533188, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9954117387533188, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 541.9375, "epoch": 0.648, "grad_norm": 0.024989139288663864, "kl": 0.002213001251220703, "learning_rate": 1.999853964992107e-05, "loss": -0.0076, "reward": 5.271288990974426, "reward_std": 1.666042000055313, "rewards/mrr_reward": 0.3229972794651985, "rewards/rank_analyze_format_reward": 0.38286497443914413, "rewards/rank_answer_foramt_reward": 0.65625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 446.703125, "epoch": 0.656, "grad_norm": 0.03217592090368271, "kl": 0.0023698806762695312, "learning_rate": 1.9998496383712828e-05, "loss": -0.0122, "reward": 5.724093914031982, "reward_std": 1.503628522157669, "rewards/mrr_reward": 0.4970238097012043, "rewards/rank_analyze_format_reward": 0.05476433038711548, "rewards/rank_answer_foramt_reward": 0.75, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968671798706055, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9968671798706055, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 511.421875, "epoch": 0.664, "grad_norm": 0.026632074266672134, "kl": 0.001974821090698242, "learning_rate": 1.999845248594489e-05, "loss": -0.0378, "reward": 5.284371018409729, "reward_std": 1.7509951293468475, "rewards/mrr_reward": 0.37621527537703514, "rewards/rank_analyze_format_reward": 0.1659046746790409, "rewards/rank_answer_foramt_reward": 0.693359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9952791333198547, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9796541333198547, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 497.40625, "epoch": 0.672, "grad_norm": 0.028008421882987022, "kl": 0.002154827117919922, "learning_rate": 1.9998407956620017e-05, "loss": -0.0174, "reward": 5.500829696655273, "reward_std": 1.7818693816661835, "rewards/mrr_reward": 0.46015624701976776, "rewards/rank_analyze_format_reward": 0.18122385442256927, "rewards/rank_answer_foramt_reward": 0.69140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9836309552192688, "rewards/rank_overall_format_reward_more": 0.8515625, "rewards/rank_verify_format_reward": 0.9523809552192688, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 504.109375, "epoch": 0.68, "grad_norm": 0.02929595857858658, "kl": 0.0015263557434082031, "learning_rate": 1.9998362795741027e-05, "loss": -0.0149, "reward": 4.848661541938782, "reward_std": 1.5195987075567245, "rewards/mrr_reward": 0.27783359214663506, "rewards/rank_analyze_format_reward": 0.19695308804512024, "rewards/rank_answer_foramt_reward": 0.658203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9801479876041412, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9645229876041412, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 520.03125, "epoch": 0.688, "grad_norm": 0.028656957671046257, "kl": 0.0018339157104492188, "learning_rate": 1.9998317003310775e-05, "loss": 0.0018, "reward": 6.04072630405426, "reward_std": 1.6347778737545013, "rewards/mrr_reward": 0.5263826847076416, "rewards/rank_analyze_format_reward": 0.2711330959573388, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.96875, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 514.953125, "epoch": 0.696, "grad_norm": 0.0294534619897604, "kl": 0.0030837059020996094, "learning_rate": 1.9998270579332154e-05, "loss": -0.0213, "reward": 5.602773904800415, "reward_std": 1.9321411848068237, "rewards/mrr_reward": 0.45494791865348816, "rewards/rank_analyze_format_reward": 0.22951603773981333, "rewards/rank_answer_foramt_reward": 0.65234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.9818111509084702, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 517.953125, "epoch": 0.704, "grad_norm": 0.02795729972422123, "kl": 0.0021820068359375, "learning_rate": 1.9998223523808092e-05, "loss": -0.005, "reward": 5.259730100631714, "reward_std": 1.7037486732006073, "rewards/mrr_reward": 0.384002972394228, "rewards/rank_analyze_format_reward": 0.18732355255633593, "rewards/rank_answer_foramt_reward": 0.642578125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976895451545715, "rewards/rank_overall_format_reward_more": 0.8984375, "rewards/rank_verify_format_reward": 0.9976895451545715, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 572.234375, "epoch": 0.712, "grad_norm": 0.025803212076425552, "kl": 0.0023365020751953125, "learning_rate": 1.9998175836741564e-05, "loss": -0.0233, "reward": 5.643940687179565, "reward_std": 2.12572318315506, "rewards/mrr_reward": 0.41141493432223797, "rewards/rank_analyze_format_reward": 0.43535757809877396, "rewards/rank_answer_foramt_reward": 0.65234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9826335161924362, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 518.875, "epoch": 0.72, "grad_norm": 0.027814343571662903, "kl": 0.0020236968994140625, "learning_rate": 1.999812751813558e-05, "loss": -0.051, "reward": 5.96042013168335, "reward_std": 1.2770089283585548, "rewards/mrr_reward": 0.4757130518555641, "rewards/rank_analyze_format_reward": 0.3149571679532528, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 495.8125, "epoch": 0.728, "grad_norm": 0.031177863478660583, "kl": 0.002357959747314453, "learning_rate": 1.9998078567993197e-05, "loss": -0.0346, "reward": 5.881357431411743, "reward_std": 1.7492572218179703, "rewards/mrr_reward": 0.5347842201590538, "rewards/rank_analyze_format_reward": 0.09312985371798277, "rewards/rank_answer_foramt_reward": 0.7265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964202791452408, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9807952791452408, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 511.15625, "epoch": 0.736, "grad_norm": 0.028669161722064018, "kl": 0.002231597900390625, "learning_rate": 1.9998028986317504e-05, "loss": -0.0145, "reward": 5.656317114830017, "reward_std": 1.6166883707046509, "rewards/mrr_reward": 0.44843751192092896, "rewards/rank_analyze_format_reward": 0.20554364286363125, "rewards/rank_answer_foramt_reward": 0.76953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.90625, "rewards/rank_verify_format_reward": 0.9828085899353027, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 522.484375, "epoch": 0.744, "grad_norm": 0.027386236935853958, "kl": 0.0021200180053710938, "learning_rate": 1.999797877311163e-05, "loss": -0.0246, "reward": 5.928924918174744, "reward_std": 1.48914834856987, "rewards/mrr_reward": 0.4644097238779068, "rewards/rank_analyze_format_reward": 0.3623017445206642, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.921875, "rewards/rank_verify_format_reward": 0.984375, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 497.1875, "epoch": 0.752, "grad_norm": 0.0282078068703413, "kl": 0.003062725067138672, "learning_rate": 1.9997927928378753e-05, "loss": 0.0186, "reward": 6.396650433540344, "reward_std": 1.9676957428455353, "rewards/mrr_reward": 0.6083333343267441, "rewards/rank_analyze_format_reward": 0.25528283044695854, "rewards/rank_answer_foramt_reward": 0.76171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965953528881073, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9965953528881073, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 517.390625, "epoch": 0.76, "grad_norm": 0.03063831850886345, "kl": 0.002585887908935547, "learning_rate": 1.999787645212208e-05, "loss": -0.0102, "reward": 6.281728744506836, "reward_std": 1.7576136887073517, "rewards/mrr_reward": 0.5565538108348846, "rewards/rank_analyze_format_reward": 0.30067696794867516, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 522.171875, "epoch": 0.768, "grad_norm": 0.02829769253730774, "kl": 0.0035347938537597656, "learning_rate": 1.999782434434486e-05, "loss": 0.0108, "reward": 5.318088173866272, "reward_std": 1.6170280575752258, "rewards/mrr_reward": 0.3567398265004158, "rewards/rank_analyze_format_reward": 0.24012142419815063, "rewards/rank_answer_foramt_reward": 0.708984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9827302694320679, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 537.703125, "epoch": 0.776, "grad_norm": 0.026142382994294167, "kl": 0.002566814422607422, "learning_rate": 1.999777160505039e-05, "loss": -0.0223, "reward": 5.818326234817505, "reward_std": 1.489253669977188, "rewards/mrr_reward": 0.45615699887275696, "rewards/rank_analyze_format_reward": 0.23597807995975018, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9872584789991379, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9872584789991379, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 513.171875, "epoch": 0.784, "grad_norm": 0.03225073963403702, "kl": 0.0032796859741210938, "learning_rate": 1.9997718234242e-05, "loss": -0.0376, "reward": 5.64834451675415, "reward_std": 1.8088513016700745, "rewards/mrr_reward": 0.4361979216337204, "rewards/rank_analyze_format_reward": 0.2531745582818985, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9931579083204269, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9931579083204269, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 513.203125, "epoch": 0.792, "grad_norm": 0.028324192389845848, "kl": 0.002949237823486328, "learning_rate": 1.999766423192306e-05, "loss": -0.0073, "reward": 5.801540851593018, "reward_std": 1.364225059747696, "rewards/mrr_reward": 0.45811013877391815, "rewards/rank_analyze_format_reward": 0.2562095895409584, "rewards/rank_answer_foramt_reward": 0.736328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 522.625, "epoch": 0.8, "grad_norm": 0.03095146454870701, "kl": 0.0033349990844726562, "learning_rate": 1.9997609598096982e-05, "loss": -0.0571, "reward": 5.498512506484985, "reward_std": 1.5701228380203247, "rewards/mrr_reward": 0.38402778655290604, "rewards/rank_analyze_format_reward": 0.31815899908542633, "rewards/rank_answer_foramt_reward": 0.720703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9891133904457092, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9734883904457092, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 505.75, "epoch": 0.808, "grad_norm": 0.030804021283984184, "kl": 0.003856658935546875, "learning_rate": 1.9997554332767214e-05, "loss": -0.0226, "reward": 6.090959072113037, "reward_std": 1.7257481813430786, "rewards/mrr_reward": 0.5503038242459297, "rewards/rank_analyze_format_reward": 0.20266878511756659, "rewards/rank_answer_foramt_reward": 0.748046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9929515719413757, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9929515719413757, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 486.828125, "epoch": 0.816, "grad_norm": 0.03461439907550812, "kl": 0.0033884048461914062, "learning_rate": 1.9997498435937254e-05, "loss": -0.0362, "reward": 5.366485238075256, "reward_std": 1.3259476721286774, "rewards/mrr_reward": 0.3723524361848831, "rewards/rank_analyze_format_reward": 0.1968497335910797, "rewards/rank_answer_foramt_reward": 0.732421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 533.84375, "epoch": 0.824, "grad_norm": 0.028838761150836945, "kl": 0.0028543472290039062, "learning_rate": 1.9997441907610624e-05, "loss": -0.0262, "reward": 5.746440768241882, "reward_std": 1.2651481330394745, "rewards/mrr_reward": 0.41945064067840576, "rewards/rank_analyze_format_reward": 0.30614617466926575, "rewards/rank_answer_foramt_reward": 0.828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 534.390625, "epoch": 0.832, "grad_norm": 0.03118244931101799, "kl": 0.0032520294189453125, "learning_rate": 1.9997384747790903e-05, "loss": -0.0115, "reward": 5.606603145599365, "reward_std": 1.3689128905534744, "rewards/mrr_reward": 0.4183097779750824, "rewards/rank_analyze_format_reward": 0.1789928413927555, "rewards/rank_answer_foramt_reward": 0.841796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9836309552192688, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 541.140625, "epoch": 0.84, "grad_norm": 0.0289431344717741, "kl": 0.004002094268798828, "learning_rate": 1.9997326956481693e-05, "loss": 0.0299, "reward": 5.412080824375153, "reward_std": 1.5776411294937134, "rewards/mrr_reward": 0.40980901941657066, "rewards/rank_analyze_format_reward": 0.2225375398993492, "rewards/rank_answer_foramt_reward": 0.6484375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9704661071300507, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9704661071300507, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 546.625, "epoch": 0.848, "grad_norm": 0.02917851321399212, "kl": 0.003693103790283203, "learning_rate": 1.999726853368665e-05, "loss": -0.0132, "reward": 6.237439870834351, "reward_std": 1.6906473636627197, "rewards/mrr_reward": 0.5492559522390366, "rewards/rank_analyze_format_reward": 0.32810740265995264, "rewards/rank_answer_foramt_reward": 0.744140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9958027005195618, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9958027005195618, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 552.84375, "epoch": 0.856, "grad_norm": 0.02737216092646122, "kl": 0.0034399032592773438, "learning_rate": 1.9997209479409464e-05, "loss": -0.0087, "reward": 5.876426458358765, "reward_std": 1.4228278696537018, "rewards/mrr_reward": 0.4435453861951828, "rewards/rank_analyze_format_reward": 0.38482026010751724, "rewards/rank_answer_foramt_reward": 0.79296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9895716160535812, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.9895716160535812, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 508.390625, "epoch": 0.864, "grad_norm": 0.028911437839269638, "kl": 0.0033540725708007812, "learning_rate": 1.9997149793653862e-05, "loss": -0.0094, "reward": 6.699026107788086, "reward_std": 1.3455817177891731, "rewards/mrr_reward": 0.6710069477558136, "rewards/rank_analyze_format_reward": 0.18851793929934502, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 486.1875, "epoch": 0.872, "grad_norm": 0.03616398200392723, "kl": 0.004034519195556641, "learning_rate": 1.9997089476423617e-05, "loss": 0.0287, "reward": 6.017909646034241, "reward_std": 1.8136086165904999, "rewards/mrr_reward": 0.5158420130610466, "rewards/rank_analyze_format_reward": 0.26905644312500954, "rewards/rank_answer_foramt_reward": 0.74609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9853207767009735, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9853207767009735, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 513.421875, "epoch": 0.88, "grad_norm": 0.029927760362625122, "kl": 0.003938198089599609, "learning_rate": 1.999702852772254e-05, "loss": 0.0003, "reward": 5.642710447311401, "reward_std": 1.466000735759735, "rewards/mrr_reward": 0.3956349194049835, "rewards/rank_analyze_format_reward": 0.3304584436118603, "rewards/rank_answer_foramt_reward": 0.814453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.981067106127739, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.981067106127739, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 527.78125, "epoch": 0.888, "grad_norm": 0.03488059714436531, "kl": 0.0036568641662597656, "learning_rate": 1.9996966947554476e-05, "loss": -0.0217, "reward": 6.343585729598999, "reward_std": 1.7120259702205658, "rewards/mrr_reward": 0.5808779820799828, "rewards/rank_analyze_format_reward": 0.3235646188259125, "rewards/rank_answer_foramt_reward": 0.75, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.996692106127739, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.981067106127739, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 508.9375, "epoch": 0.896, "grad_norm": 0.0328449085354805, "kl": 0.0038404464721679688, "learning_rate": 1.9996904735923325e-05, "loss": -0.0289, "reward": 6.122893452644348, "reward_std": 1.4696560502052307, "rewards/mrr_reward": 0.5161458402872086, "rewards/rank_analyze_format_reward": 0.31539197266101837, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9915762841701508, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9915762841701508, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 541.578125, "epoch": 0.904, "grad_norm": 0.031444139778614044, "kl": 0.0041828155517578125, "learning_rate": 1.9996841892833e-05, "loss": -0.0134, "reward": 6.206910610198975, "reward_std": 1.6045927107334137, "rewards/mrr_reward": 0.5138206705451012, "rewards/rank_analyze_format_reward": 0.36523886024951935, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9947571158409119, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9947571158409119, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 511.28125, "epoch": 0.912, "grad_norm": 0.034729719161987305, "kl": 0.004141807556152344, "learning_rate": 1.9996778418287486e-05, "loss": 0.0052, "reward": 5.282190442085266, "reward_std": 1.5597249567508698, "rewards/mrr_reward": 0.35017360746860504, "rewards/rank_analyze_format_reward": 0.25228141248226166, "rewards/rank_answer_foramt_reward": 0.685546875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 531.9375, "epoch": 0.92, "grad_norm": 0.03240945562720299, "kl": 0.003923892974853516, "learning_rate": 1.9996714312290784e-05, "loss": -0.0297, "reward": 5.8050724267959595, "reward_std": 1.4556776583194733, "rewards/mrr_reward": 0.4047433137893677, "rewards/rank_analyze_format_reward": 0.3966461531817913, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992187470197678, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9992187470197678, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 545.4375, "epoch": 0.928, "grad_norm": 0.033669959753751755, "kl": 0.0044651031494140625, "learning_rate": 1.9996649574846948e-05, "loss": -0.0214, "reward": 6.237725496292114, "reward_std": 1.6311175972223282, "rewards/mrr_reward": 0.5039434656500816, "rewards/rank_analyze_format_reward": 0.40749866887927055, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 541.984375, "epoch": 0.936, "grad_norm": 0.03145231306552887, "kl": 0.004774570465087891, "learning_rate": 1.9996584205960063e-05, "loss": -0.0014, "reward": 5.562940955162048, "reward_std": 1.5884797871112823, "rewards/mrr_reward": 0.40140748769044876, "rewards/rank_analyze_format_reward": 0.33169008791446686, "rewards/rank_answer_foramt_reward": 0.689453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9954276382923126, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9798026382923126, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 527.875, "epoch": 0.944, "grad_norm": 0.035210635513067245, "kl": 0.0044097900390625, "learning_rate": 1.999651820563426e-05, "loss": -0.0421, "reward": 5.673676252365112, "reward_std": 1.3604719787836075, "rewards/mrr_reward": 0.3857142850756645, "rewards/rank_analyze_format_reward": 0.39724233001470566, "rewards/rank_answer_foramt_reward": 0.7890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9956946671009064, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9800696671009064, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 534.984375, "epoch": 0.952, "grad_norm": 0.03147454559803009, "kl": 0.0076541900634765625, "learning_rate": 1.999645157387371e-05, "loss": -0.0133, "reward": 6.33061408996582, "reward_std": 1.298683062195778, "rewards/mrr_reward": 0.553689256310463, "rewards/rank_analyze_format_reward": 0.3629737161099911, "rewards/rank_answer_foramt_reward": 0.787109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 551.703125, "epoch": 0.96, "grad_norm": 0.031304825097322464, "kl": 0.004342079162597656, "learning_rate": 1.9996384310682615e-05, "loss": -0.0365, "reward": 5.393260598182678, "reward_std": 1.5107265412807465, "rewards/mrr_reward": 0.31743552163243294, "rewards/rank_analyze_format_reward": 0.3929348886013031, "rewards/rank_answer_foramt_reward": 0.763671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 501.890625, "epoch": 0.968, "grad_norm": 0.037323277443647385, "kl": 0.004292488098144531, "learning_rate": 1.999631641606523e-05, "loss": -0.0058, "reward": 6.189559578895569, "reward_std": 1.2323561608791351, "rewards/mrr_reward": 0.5643229335546494, "rewards/rank_analyze_format_reward": 0.11282643768936396, "rewards/rank_answer_foramt_reward": 0.87109375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9819862246513367, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9976112246513367, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 545.25, "epoch": 0.976, "grad_norm": 0.034240033477544785, "kl": 0.005130767822265625, "learning_rate": 1.9996247890025845e-05, "loss": -0.0263, "reward": 5.799539566040039, "reward_std": 1.6965691149234772, "rewards/mrr_reward": 0.4192212335765362, "rewards/rank_analyze_format_reward": 0.41699668765068054, "rewards/rank_answer_foramt_reward": 0.748046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 522.9375, "epoch": 0.984, "grad_norm": 0.03087456338107586, "kl": 0.004132270812988281, "learning_rate": 1.9996178732568784e-05, "loss": -0.0128, "reward": 5.433954238891602, "reward_std": 1.3873755782842636, "rewards/mrr_reward": 0.35366444662213326, "rewards/rank_analyze_format_reward": 0.3041442818939686, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9296875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 548.90625, "epoch": 0.992, "grad_norm": 0.03247498720884323, "kl": 0.004190921783447266, "learning_rate": 1.9996108943698412e-05, "loss": -0.02, "reward": 6.039711356163025, "reward_std": 1.745398223400116, "rewards/mrr_reward": 0.49064359068870544, "rewards/rank_analyze_format_reward": 0.3508656769990921, "rewards/rank_answer_foramt_reward": 0.734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9959480613470078, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9959480613470078, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 502.75, "epoch": 1.0, "grad_norm": 0.031171714887022972, "kl": 0.00469207763671875, "learning_rate": 1.9996038523419148e-05, "loss": -0.0226, "reward": 5.955172896385193, "reward_std": 1.2524618208408356, "rewards/mrr_reward": 0.460627481341362, "rewards/rank_analyze_format_reward": 0.33911067247390747, "rewards/rank_answer_foramt_reward": 0.806640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 549.53125, "epoch": 1.008, "grad_norm": 0.03288634493947029, "kl": 0.00435638427734375, "learning_rate": 1.9995967471735433e-05, "loss": -0.0184, "reward": 6.16729462146759, "reward_std": 1.4908590912818909, "rewards/mrr_reward": 0.5096540227532387, "rewards/rank_analyze_format_reward": 0.39567676931619644, "rewards/rank_answer_foramt_reward": 0.775390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 533.65625, "epoch": 1.016, "grad_norm": 0.03470674157142639, "kl": 0.005002021789550781, "learning_rate": 1.9995895788651753e-05, "loss": -0.0254, "reward": 6.5735520124435425, "reward_std": 1.469813510775566, "rewards/mrr_reward": 0.621657982468605, "rewards/rank_analyze_format_reward": 0.31275077164173126, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 546.34375, "epoch": 1.024, "grad_norm": 0.03493339568376541, "kl": 0.004825592041015625, "learning_rate": 1.9995823474172644e-05, "loss": -0.0097, "reward": 5.744642496109009, "reward_std": 1.9137973487377167, "rewards/mrr_reward": 0.4302021265029907, "rewards/rank_analyze_format_reward": 0.3747362494468689, "rewards/rank_answer_foramt_reward": 0.6953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9925176054239273, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9925176054239273, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 522.65625, "epoch": 1.032, "grad_norm": 0.03346191346645355, "kl": 0.0044116973876953125, "learning_rate": 1.9995750528302668e-05, "loss": -0.0069, "reward": 6.34830904006958, "reward_std": 1.5635737180709839, "rewards/mrr_reward": 0.5352182611823082, "rewards/rank_analyze_format_reward": 0.3498992621898651, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 518.90625, "epoch": 1.04, "grad_norm": 0.027642706409096718, "kl": 0.0034575462341308594, "learning_rate": 1.999567695104643e-05, "loss": -0.0083, "reward": 6.863049745559692, "reward_std": 0.995637645944953, "rewards/mrr_reward": 0.6565104126930237, "rewards/rank_analyze_format_reward": 0.3502893391996622, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 546.265625, "epoch": 1.048, "grad_norm": 0.03320496901869774, "kl": 0.00482177734375, "learning_rate": 1.9995602742408584e-05, "loss": -0.0297, "reward": 5.561194658279419, "reward_std": 1.0441379398107529, "rewards/mrr_reward": 0.3508804552257061, "rewards/rank_analyze_format_reward": 0.3411516472697258, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 538.40625, "epoch": 1.056, "grad_norm": 0.03059810772538185, "kl": 0.004889488220214844, "learning_rate": 1.9995527902393814e-05, "loss": -0.031, "reward": 6.096472501754761, "reward_std": 1.3641002774238586, "rewards/mrr_reward": 0.4807477816939354, "rewards/rank_analyze_format_reward": 0.329423014074564, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 530.125, "epoch": 1.064, "grad_norm": 0.03329962119460106, "kl": 0.004602909088134766, "learning_rate": 1.9995452431006844e-05, "loss": -0.0196, "reward": 5.331088542938232, "reward_std": 0.830422654747963, "rewards/mrr_reward": 0.3011222556233406, "rewards/rank_analyze_format_reward": 0.27261858060956, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 546.765625, "epoch": 1.072, "grad_norm": 0.03147532418370247, "kl": 0.004273891448974609, "learning_rate": 1.999537632825245e-05, "loss": -0.0228, "reward": 5.826651930809021, "reward_std": 1.0173790007829666, "rewards/mrr_reward": 0.4183593764901161, "rewards/rank_analyze_format_reward": 0.37596380710601807, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997023805975914, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.965773805975914, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 540.265625, "epoch": 1.08, "grad_norm": 0.03314289450645447, "kl": 0.0045490264892578125, "learning_rate": 1.9995299594135434e-05, "loss": -0.0181, "reward": 6.364633798599243, "reward_std": 1.2888767421245575, "rewards/mrr_reward": 0.5286644101142883, "rewards/rank_analyze_format_reward": 0.40576110780239105, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 537.515625, "epoch": 1.088, "grad_norm": 0.03256648778915405, "kl": 0.005957603454589844, "learning_rate": 1.999522222866064e-05, "loss": -0.0253, "reward": 6.410487055778503, "reward_std": 1.080582246184349, "rewards/mrr_reward": 0.5400917902588844, "rewards/rank_analyze_format_reward": 0.3477761074900627, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 565.78125, "epoch": 1.096, "grad_norm": 0.03460180386900902, "kl": 0.012660980224609375, "learning_rate": 1.999514423183296e-05, "loss": -0.0144, "reward": 6.023637771606445, "reward_std": 1.5978916585445404, "rewards/mrr_reward": 0.44875992834568024, "rewards/rank_analyze_format_reward": 0.44508665800094604, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 515.0, "epoch": 1.104, "grad_norm": 0.030293360352516174, "kl": 0.005436897277832031, "learning_rate": 1.9995065603657317e-05, "loss": -0.0128, "reward": 6.0460041761398315, "reward_std": 1.0893033295869827, "rewards/mrr_reward": 0.48198164254426956, "rewards/rank_analyze_format_reward": 0.2604257594794035, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 560.9375, "epoch": 1.112, "grad_norm": 0.034913912415504456, "kl": 0.0055980682373046875, "learning_rate": 1.999498634413868e-05, "loss": -0.009, "reward": 6.033616900444031, "reward_std": 1.7777451276779175, "rewards/mrr_reward": 0.4813368245959282, "rewards/rank_analyze_format_reward": 0.3849602974951267, "rewards/rank_answer_foramt_reward": 0.74609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964202791452408, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9964202791452408, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 519.21875, "epoch": 1.12, "grad_norm": 0.03727564588189125, "kl": 0.004870414733886719, "learning_rate": 1.9994906453282055e-05, "loss": -0.0243, "reward": 6.689180135726929, "reward_std": 1.181299865245819, "rewards/mrr_reward": 0.6408420205116272, "rewards/rank_analyze_format_reward": 0.2992168888449669, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 556.578125, "epoch": 1.1280000000000001, "grad_norm": 0.035206038504838943, "kl": 0.006304740905761719, "learning_rate": 1.9994825931092486e-05, "loss": -0.0367, "reward": 6.255677342414856, "reward_std": 1.9543142914772034, "rewards/mrr_reward": 0.5096974149346352, "rewards/rank_analyze_format_reward": 0.4908938556909561, "rewards/rank_answer_foramt_reward": 0.75390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9938564151525497, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9938564151525497, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 531.640625, "epoch": 1.1360000000000001, "grad_norm": 0.031349923461675644, "kl": 0.005738258361816406, "learning_rate": 1.9994744777575064e-05, "loss": 0.0027, "reward": 6.044549226760864, "reward_std": 1.1382797956466675, "rewards/mrr_reward": 0.48732637614011765, "rewards/rank_analyze_format_reward": 0.32322094589471817, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9817143976688385, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9817143976688385, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 566.484375, "epoch": 1.144, "grad_norm": 0.03165869414806366, "kl": 0.00574493408203125, "learning_rate": 1.999466299273491e-05, "loss": 0.0042, "reward": 6.553520321846008, "reward_std": 1.627190262079239, "rewards/mrr_reward": 0.5967448204755783, "rewards/rank_analyze_format_reward": 0.44682280719280243, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9907185882329941, "rewards/rank_overall_format_reward_more": 0.9375, "rewards/rank_verify_format_reward": 0.9438435882329941, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 531.203125, "epoch": 1.152, "grad_norm": 0.03519825637340546, "kl": 0.0061855316162109375, "learning_rate": 1.9994580576577193e-05, "loss": -0.0129, "reward": 5.729455947875977, "reward_std": 1.3913188576698303, "rewards/mrr_reward": 0.41431671380996704, "rewards/rank_analyze_format_reward": 0.32957829907536507, "rewards/rank_answer_foramt_reward": 0.76171875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 524.3125, "epoch": 1.16, "grad_norm": 0.03151266649365425, "kl": 0.006320953369140625, "learning_rate": 1.9994497529107118e-05, "loss": -0.0148, "reward": 6.072369456291199, "reward_std": 1.2223908305168152, "rewards/mrr_reward": 0.4943700544536114, "rewards/rank_analyze_format_reward": 0.23118487000465393, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 536.890625, "epoch": 1.168, "grad_norm": 0.03530154004693031, "kl": 0.005878448486328125, "learning_rate": 1.999441385032993e-05, "loss": -0.0308, "reward": 6.625038385391235, "reward_std": 1.2249456346035004, "rewards/mrr_reward": 0.6044022962450981, "rewards/rank_analyze_format_reward": 0.34935425966978073, "rewards/rank_answer_foramt_reward": 0.880859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964202791452408, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9964202791452408, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 520.6875, "epoch": 1.176, "grad_norm": 0.035086773335933685, "kl": 0.007404327392578125, "learning_rate": 1.9994329540250918e-05, "loss": -0.0321, "reward": 6.542271018028259, "reward_std": 1.3827645033597946, "rewards/mrr_reward": 0.5946986712515354, "rewards/rank_analyze_format_reward": 0.292152963578701, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 533.8125, "epoch": 1.184, "grad_norm": 0.032674577087163925, "kl": 0.00643157958984375, "learning_rate": 1.99942445988754e-05, "loss": -0.033, "reward": 6.091751337051392, "reward_std": 1.2941071689128876, "rewards/mrr_reward": 0.479445680975914, "rewards/rank_analyze_format_reward": 0.35560934245586395, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 551.0625, "epoch": 1.192, "grad_norm": 0.03282872214913368, "kl": 0.005850791931152344, "learning_rate": 1.999415902620875e-05, "loss": -0.025, "reward": 6.667526364326477, "reward_std": 1.0924562439322472, "rewards/mrr_reward": 0.604253463447094, "rewards/rank_analyze_format_reward": 0.3833247348666191, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 564.640625, "epoch": 1.2, "grad_norm": 0.032991521060466766, "kl": 0.0060176849365234375, "learning_rate": 1.999407282225637e-05, "loss": 0.0052, "reward": 5.798678278923035, "reward_std": 1.1788080930709839, "rewards/mrr_reward": 0.402951393276453, "rewards/rank_analyze_format_reward": 0.38478153944015503, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9938189834356308, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9781939834356308, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 575.6875, "epoch": 1.208, "grad_norm": 0.02944212593138218, "kl": 0.005663871765136719, "learning_rate": 1.9993985987023703e-05, "loss": -0.0115, "reward": 6.4621899127960205, "reward_std": 1.2538374364376068, "rewards/mrr_reward": 0.5166728720068932, "rewards/rank_analyze_format_reward": 0.4970608651638031, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 583.15625, "epoch": 1.216, "grad_norm": 0.03144199773669243, "kl": 0.005504608154296875, "learning_rate": 1.9993898520516233e-05, "loss": 0.0178, "reward": 7.210927963256836, "reward_std": 1.4847297072410583, "rewards/mrr_reward": 0.7067708373069763, "rewards/rank_analyze_format_reward": 0.5710361748933792, "rewards/rank_answer_foramt_reward": 0.845703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 569.125, "epoch": 1.224, "grad_norm": 0.03476106375455856, "kl": 0.006566047668457031, "learning_rate": 1.9993810422739496e-05, "loss": -0.0255, "reward": 5.501855969429016, "reward_std": 1.1074179112911224, "rewards/mrr_reward": 0.2869729772210121, "rewards/rank_analyze_format_reward": 0.5554128363728523, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9969318807125092, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9969318807125092, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 559.140625, "epoch": 1.232, "grad_norm": 0.0336473323404789, "kl": 0.006374359130859375, "learning_rate": 1.999372169369904e-05, "loss": -0.0304, "reward": 7.210146188735962, "reward_std": 1.3585944771766663, "rewards/mrr_reward": 0.7406249940395355, "rewards/rank_analyze_format_reward": 0.4076874777674675, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 565.015625, "epoch": 1.24, "grad_norm": 0.03243670612573624, "kl": 0.0064258575439453125, "learning_rate": 1.999363233340048e-05, "loss": 0.0124, "reward": 6.775290489196777, "reward_std": 1.6960014998912811, "rewards/mrr_reward": 0.6491319388151169, "rewards/rank_analyze_format_reward": 0.3369657965376973, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 567.671875, "epoch": 1.248, "grad_norm": 0.03587043285369873, "kl": 0.008923530578613281, "learning_rate": 1.9993542341849462e-05, "loss": -0.0172, "reward": 6.484335541725159, "reward_std": 1.4846598207950592, "rewards/mrr_reward": 0.545331098139286, "rewards/rank_analyze_format_reward": 0.5122500844299793, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9959664940834045, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9959664940834045, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 541.015625, "epoch": 1.256, "grad_norm": 0.03247671574354172, "kl": 0.006220817565917969, "learning_rate": 1.9993451719051663e-05, "loss": -0.0057, "reward": 6.91133987903595, "reward_std": 1.076777160167694, "rewards/mrr_reward": 0.6208333373069763, "rewards/rank_analyze_format_reward": 0.4648074358701706, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 536.0, "epoch": 1.264, "grad_norm": 0.033995021134614944, "kl": 0.006260871887207031, "learning_rate": 1.999336046501281e-05, "loss": -0.0107, "reward": 6.491420269012451, "reward_std": 1.1248966604471207, "rewards/mrr_reward": 0.5537760369479656, "rewards/rank_analyze_format_reward": 0.38733571022748947, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 537.09375, "epoch": 1.272, "grad_norm": 0.03796149790287018, "kl": 0.008052825927734375, "learning_rate": 1.999326857973867e-05, "loss": -0.0482, "reward": 7.359132528305054, "reward_std": 1.4535967111587524, "rewards/mrr_reward": 0.7499999850988388, "rewards/rank_analyze_format_reward": 0.4489763230085373, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 564.671875, "epoch": 1.28, "grad_norm": 0.036241207271814346, "kl": 0.0068416595458984375, "learning_rate": 1.9993176063235046e-05, "loss": -0.0176, "reward": 7.0445317029953, "reward_std": 1.626471757888794, "rewards/mrr_reward": 0.662822425365448, "rewards/rank_analyze_format_reward": 0.5297309085726738, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 579.453125, "epoch": 1.288, "grad_norm": 0.037075724452733994, "kl": 0.006366729736328125, "learning_rate": 1.9993082915507776e-05, "loss": -0.0144, "reward": 6.376061797142029, "reward_std": 1.2733474969863892, "rewards/mrr_reward": 0.5289496555924416, "rewards/rank_analyze_format_reward": 0.42237265408039093, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 577.296875, "epoch": 1.296, "grad_norm": 0.03217633441090584, "kl": 0.006763458251953125, "learning_rate": 1.999298913656275e-05, "loss": -0.0085, "reward": 6.5947242975234985, "reward_std": 1.2779672592878342, "rewards/mrr_reward": 0.5658544301986694, "rewards/rank_analyze_format_reward": 0.4991602599620819, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 527.328125, "epoch": 1.304, "grad_norm": 0.03425245359539986, "kl": 0.007500648498535156, "learning_rate": 1.9992894726405894e-05, "loss": -0.0124, "reward": 6.557482957839966, "reward_std": 1.4475017786026, "rewards/mrr_reward": 0.5980902686715126, "rewards/rank_analyze_format_reward": 0.2979341112077236, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 564.59375, "epoch": 1.312, "grad_norm": 0.03478895127773285, "kl": 0.007842063903808594, "learning_rate": 1.9992799685043165e-05, "loss": -0.0553, "reward": 6.300098657608032, "reward_std": 1.0886222496628761, "rewards/mrr_reward": 0.48148561269044876, "rewards/rank_analyze_format_reward": 0.5164258703589439, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 568.953125, "epoch": 1.32, "grad_norm": 0.0446629598736763, "kl": 0.006804466247558594, "learning_rate": 1.999270401248057e-05, "loss": -0.0217, "reward": 6.716991662979126, "reward_std": 1.4165300726890564, "rewards/mrr_reward": 0.5955481305718422, "rewards/rank_analyze_format_reward": 0.43624673783779144, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 584.984375, "epoch": 1.328, "grad_norm": 0.03580395132303238, "kl": 0.008753776550292969, "learning_rate": 1.999260770872415e-05, "loss": 0.0004, "reward": 5.964340448379517, "reward_std": 1.1115762144327164, "rewards/mrr_reward": 0.3914806619286537, "rewards/rank_analyze_format_reward": 0.5389279127120972, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 571.453125, "epoch": 1.336, "grad_norm": 0.03663269430398941, "kl": 0.007559776306152344, "learning_rate": 1.999251077377999e-05, "loss": -0.0458, "reward": 6.374191999435425, "reward_std": 1.2914250791072845, "rewards/mrr_reward": 0.4921874962747097, "rewards/rank_analyze_format_reward": 0.5749406069517136, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 573.96875, "epoch": 1.3439999999999999, "grad_norm": 0.033142536878585815, "kl": 0.0076465606689453125, "learning_rate": 1.999241320765421e-05, "loss": -0.0188, "reward": 6.285339713096619, "reward_std": 1.369349867105484, "rewards/mrr_reward": 0.4851934462785721, "rewards/rank_analyze_format_reward": 0.479331374168396, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 547.984375, "epoch": 1.3519999999999999, "grad_norm": 0.03498915210366249, "kl": 0.008561134338378906, "learning_rate": 1.9992315010352978e-05, "loss": -0.0274, "reward": 6.904844880104065, "reward_std": 1.2574369013309479, "rewards/mrr_reward": 0.6432291716337204, "rewards/rank_analyze_format_reward": 0.44520963728427887, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 576.765625, "epoch": 1.3599999999999999, "grad_norm": 0.03831981122493744, "kl": 0.010663986206054688, "learning_rate": 1.9992216181882492e-05, "loss": -0.0089, "reward": 6.317743182182312, "reward_std": 1.1524057537317276, "rewards/mrr_reward": 0.4661892428994179, "rewards/rank_analyze_format_reward": 0.5786355137825012, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977221935987473, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9977221935987473, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 549.5625, "epoch": 1.3679999999999999, "grad_norm": 0.03543487936258316, "kl": 0.008349418640136719, "learning_rate": 1.9992116722248997e-05, "loss": 0.009, "reward": 6.215874433517456, "reward_std": 1.5494773089885712, "rewards/mrr_reward": 0.5115203410387039, "rewards/rank_analyze_format_reward": 0.3291758671402931, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 563.40625, "epoch": 1.376, "grad_norm": 0.03732339292764664, "kl": 0.00754547119140625, "learning_rate": 1.9992016631458774e-05, "loss": -0.0044, "reward": 6.333064913749695, "reward_std": 1.5073265135288239, "rewards/mrr_reward": 0.5336371585726738, "rewards/rank_analyze_format_reward": 0.373980063945055, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9962525367736816, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9962525367736816, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 565.109375, "epoch": 1.384, "grad_norm": 0.03858262300491333, "kl": 0.0084381103515625, "learning_rate": 1.9991915909518146e-05, "loss": -0.0484, "reward": 6.43630588054657, "reward_std": 1.1248457580804825, "rewards/mrr_reward": 0.5476128421723843, "rewards/rank_analyze_format_reward": 0.5014840885996819, "rewards/rank_answer_foramt_reward": 0.779296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 603.625, "epoch": 1.392, "grad_norm": 0.031945351511240005, "kl": 0.008295059204101562, "learning_rate": 1.9991814556433475e-05, "loss": -0.0415, "reward": 6.396109580993652, "reward_std": 1.257490947842598, "rewards/mrr_reward": 0.5233692973852158, "rewards/rank_analyze_format_reward": 0.5277140513062477, "rewards/rank_answer_foramt_reward": 0.830078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9919514656066895, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9919514656066895, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 547.390625, "epoch": 1.4, "grad_norm": 0.03459803760051727, "kl": 0.009899139404296875, "learning_rate": 1.9991712572211163e-05, "loss": -0.0283, "reward": 6.693902850151062, "reward_std": 1.5040415227413177, "rewards/mrr_reward": 0.6218750029802322, "rewards/rank_analyze_format_reward": 0.38910815864801407, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997514471411705, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.997514471411705, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 596.953125, "epoch": 1.408, "grad_norm": 0.03564726933836937, "kl": 0.008817672729492188, "learning_rate": 1.999160995685765e-05, "loss": 0.0041, "reward": 6.393037676811218, "reward_std": 1.5305506885051727, "rewards/mrr_reward": 0.4989583343267441, "rewards/rank_analyze_format_reward": 0.5883020609617233, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 610.8125, "epoch": 1.416, "grad_norm": 0.0317256897687912, "kl": 0.007786750793457031, "learning_rate": 1.9991506710379424e-05, "loss": -0.0038, "reward": 6.927413702011108, "reward_std": 1.1728498041629791, "rewards/mrr_reward": 0.5874070003628731, "rewards/rank_analyze_format_reward": 0.7316593676805496, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 614.015625, "epoch": 1.424, "grad_norm": 0.03363263979554176, "kl": 0.006999015808105469, "learning_rate": 1.9991402832783e-05, "loss": -0.0222, "reward": 6.334396123886108, "reward_std": 1.1969702541828156, "rewards/mrr_reward": 0.49082961305975914, "rewards/rank_analyze_format_reward": 0.532570406794548, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9671052694320679, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 639.0625, "epoch": 1.432, "grad_norm": 0.0324893482029438, "kl": 0.0071563720703125, "learning_rate": 1.9991298324074942e-05, "loss": -0.0215, "reward": 6.33887255191803, "reward_std": 1.0472588911652565, "rewards/mrr_reward": 0.4644531235098839, "rewards/rank_analyze_format_reward": 0.6554184406995773, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9817143976688385, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.996271014213562, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 597.53125, "epoch": 1.44, "grad_norm": 0.038431741297245026, "kl": 0.0085906982421875, "learning_rate": 1.999119318426185e-05, "loss": -0.0425, "reward": 5.978406071662903, "reward_std": 1.4375847578048706, "rewards/mrr_reward": 0.37400173395872116, "rewards/rank_analyze_format_reward": 0.654976025223732, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9937897026538849, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9937897026538849, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 644.21875, "epoch": 1.448, "grad_norm": 0.03540867194533348, "kl": 0.008702278137207031, "learning_rate": 1.9991087413350367e-05, "loss": 0.0273, "reward": 7.00466001033783, "reward_std": 1.6190518736839294, "rewards/mrr_reward": 0.6167968884110451, "rewards/rank_analyze_format_reward": 0.7251099199056625, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.012996495701372623, "rewards/rank_initial_format_reward": 0.9817143976688385, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9817143976688385, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 575.96875, "epoch": 1.456, "grad_norm": 0.03581292927265167, "kl": 0.009832382202148438, "learning_rate": 1.9990981011347172e-05, "loss": -0.0048, "reward": 5.947044134140015, "reward_std": 0.9751862585544586, "rewards/mrr_reward": 0.3696366660296917, "rewards/rank_analyze_format_reward": 0.6148670166730881, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 584.296875, "epoch": 1.464, "grad_norm": 0.0367310456931591, "kl": 0.008490562438964844, "learning_rate": 1.999087397825899e-05, "loss": -0.0219, "reward": 6.547907114028931, "reward_std": 0.9392938762903214, "rewards/mrr_reward": 0.5391679182648659, "rewards/rank_analyze_format_reward": 0.5218650847673416, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 584.265625, "epoch": 1.472, "grad_norm": 0.036771222949028015, "kl": 0.00975799560546875, "learning_rate": 1.9990766314092575e-05, "loss": 0.0093, "reward": 7.504821062088013, "reward_std": 1.017032966017723, "rewards/mrr_reward": 0.7345609813928604, "rewards/rank_analyze_format_reward": 0.6810796558856964, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 570.015625, "epoch": 1.48, "grad_norm": 0.035612449049949646, "kl": 0.009767532348632812, "learning_rate": 1.9990658018854737e-05, "loss": -0.0192, "reward": 6.572237730026245, "reward_std": 1.1024248152971268, "rewards/mrr_reward": 0.5541418492794037, "rewards/rank_analyze_format_reward": 0.48548950254917145, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9809887856245041, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 610.625, "epoch": 1.488, "grad_norm": 0.03268010914325714, "kl": 0.007953643798828125, "learning_rate": 1.9990549092552307e-05, "loss": -0.0163, "reward": 7.752923250198364, "reward_std": 1.1804132461547852, "rewards/mrr_reward": 0.7671007066965103, "rewards/rank_analyze_format_reward": 0.7255359292030334, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 618.6875, "epoch": 1.496, "grad_norm": 0.033477578312158585, "kl": 0.010374069213867188, "learning_rate": 1.999043953519217e-05, "loss": -0.0446, "reward": 6.951627135276794, "reward_std": 1.142410233616829, "rewards/mrr_reward": 0.5984498858451843, "rewards/rank_analyze_format_reward": 0.6071917712688446, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977788031101227, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9977788031101227, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 591.640625, "epoch": 1.504, "grad_norm": 0.03295287489891052, "kl": 0.008535385131835938, "learning_rate": 1.999032934678125e-05, "loss": -0.0228, "reward": 6.217561841011047, "reward_std": 0.885568305850029, "rewards/mrr_reward": 0.4311321973800659, "rewards/rank_analyze_format_reward": 0.5961254388093948, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 595.359375, "epoch": 1.512, "grad_norm": 0.03543277829885483, "kl": 0.008122444152832031, "learning_rate": 1.99902185273265e-05, "loss": -0.0164, "reward": 6.661153793334961, "reward_std": 0.7280477955937386, "rewards/mrr_reward": 0.5164248645305634, "rewards/rank_analyze_format_reward": 0.623047724366188, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9969455003738403, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9969455003738403, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 626.109375, "epoch": 1.52, "grad_norm": 0.037641286849975586, "kl": 0.008847236633300781, "learning_rate": 1.999010707683492e-05, "loss": -0.0658, "reward": 6.347493886947632, "reward_std": 0.9116277098655701, "rewards/mrr_reward": 0.43297991901636124, "rewards/rank_analyze_format_reward": 0.666047140955925, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 610.25, "epoch": 1.528, "grad_norm": 0.03490091487765312, "kl": 0.009138107299804688, "learning_rate": 1.998999499531356e-05, "loss": -0.0516, "reward": 7.269640564918518, "reward_std": 0.6211766228079796, "rewards/mrr_reward": 0.6727616675198078, "rewards/rank_analyze_format_reward": 0.6237920597195625, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.011442550458014011, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 581.203125, "epoch": 1.536, "grad_norm": 0.03623748943209648, "kl": 0.010578155517578125, "learning_rate": 1.9989882282769485e-05, "loss": -0.0328, "reward": 6.117859721183777, "reward_std": 1.3281791657209396, "rewards/mrr_reward": 0.4266369119286537, "rewards/rank_analyze_format_reward": 0.5988272428512573, "rewards/rank_answer_foramt_reward": 0.83203125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9980392158031464, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9980392158031464, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 631.859375, "epoch": 1.544, "grad_norm": 0.03717571124434471, "kl": 0.012277603149414062, "learning_rate": 1.9989768939209826e-05, "loss": -0.0291, "reward": 6.472392678260803, "reward_std": 1.0950042307376862, "rewards/mrr_reward": 0.4958333298563957, "rewards/rank_analyze_format_reward": 0.7097622603178024, "rewards/rank_answer_foramt_reward": 0.833984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 1.0, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 603.59375, "epoch": 1.552, "grad_norm": 0.031889960169792175, "kl": 0.0109710693359375, "learning_rate": 1.9989654964641737e-05, "loss": -0.0297, "reward": 6.880647420883179, "reward_std": 0.8547341153025627, "rewards/mrr_reward": 0.580071933567524, "rewards/rank_analyze_format_reward": 0.6654053032398224, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 599.234375, "epoch": 1.56, "grad_norm": 0.036006029695272446, "kl": 0.012359619140625, "learning_rate": 1.998954035907242e-05, "loss": -0.0148, "reward": 6.577338814735413, "reward_std": 1.2951306998729706, "rewards/mrr_reward": 0.5316840335726738, "rewards/rank_analyze_format_reward": 0.5455324053764343, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 631.953125, "epoch": 1.568, "grad_norm": 0.030870715156197548, "kl": 0.0106658935546875, "learning_rate": 1.9989425122509113e-05, "loss": -0.0305, "reward": 6.851738214492798, "reward_std": 0.7111386805772781, "rewards/mrr_reward": 0.5270833075046539, "rewards/rank_analyze_format_reward": 0.784420520067215, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 620.390625, "epoch": 1.576, "grad_norm": 0.03541827201843262, "kl": 0.011089324951171875, "learning_rate": 1.9989309254959096e-05, "loss": -0.0172, "reward": 7.087416887283325, "reward_std": 1.3555363416671753, "rewards/mrr_reward": 0.6345486119389534, "rewards/rank_analyze_format_reward": 0.7166584730148315, "rewards/rank_answer_foramt_reward": 0.873046875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992897808551788, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9836647808551788, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 621.65625, "epoch": 1.584, "grad_norm": 0.0363469123840332, "kl": 0.011362075805664062, "learning_rate": 1.998919275642968e-05, "loss": 0.0444, "reward": 6.63647723197937, "reward_std": 1.5355401635169983, "rewards/mrr_reward": 0.537413202226162, "rewards/rank_analyze_format_reward": 0.7207763195037842, "rewards/rank_answer_foramt_reward": 0.81640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 619.609375, "epoch": 1.592, "grad_norm": 0.0350794680416584, "kl": 0.010587692260742188, "learning_rate": 1.9989075626928237e-05, "loss": -0.0324, "reward": 7.593704700469971, "reward_std": 1.2587448060512543, "rewards/mrr_reward": 0.7476562410593033, "rewards/rank_analyze_format_reward": 0.6766816079616547, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 641.125, "epoch": 1.6, "grad_norm": 0.03762039542198181, "kl": 0.011552810668945312, "learning_rate": 1.9988957866462155e-05, "loss": 0.0012, "reward": 6.556584358215332, "reward_std": 0.7802992425858974, "rewards/mrr_reward": 0.4782552234828472, "rewards/rank_analyze_format_reward": 0.7131441533565521, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 615.515625, "epoch": 1.608, "grad_norm": 0.03529913350939751, "kl": 0.011404037475585938, "learning_rate": 1.998883947503888e-05, "loss": -0.0285, "reward": 6.747278928756714, "reward_std": 0.8986479938030243, "rewards/mrr_reward": 0.5536458566784859, "rewards/rank_analyze_format_reward": 0.6645828187465668, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975329041481018, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9975329041481018, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 636.9375, "epoch": 1.616, "grad_norm": 0.03680592030286789, "kl": 0.011167526245117188, "learning_rate": 1.9988720452665885e-05, "loss": -0.0142, "reward": 7.523893117904663, "reward_std": 1.5109763741493225, "rewards/mrr_reward": 0.7254092246294022, "rewards/rank_analyze_format_reward": 0.6940822452306747, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.013573232106864452, "rewards/rank_initial_format_reward": 0.9973393976688385, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9973393976688385, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 654.671875, "epoch": 1.624, "grad_norm": 0.031202631071209908, "kl": 0.011119842529296875, "learning_rate": 1.9988600799350685e-05, "loss": -0.011, "reward": 7.5892653465271, "reward_std": 0.8718039393424988, "rewards/mrr_reward": 0.7219122052192688, "rewards/rank_analyze_format_reward": 0.7621632516384125, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 624.46875, "epoch": 1.6320000000000001, "grad_norm": 0.03513794392347336, "kl": 0.011323928833007812, "learning_rate": 1.998848051510085e-05, "loss": -0.0116, "reward": 7.873760461807251, "reward_std": 0.9668747493997216, "rewards/mrr_reward": 0.8035590276122093, "rewards/rank_analyze_format_reward": 0.738672748208046, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 616.203125, "epoch": 1.6400000000000001, "grad_norm": 0.03514819219708443, "kl": 0.012950897216796875, "learning_rate": 1.9988359599923964e-05, "loss": -0.0071, "reward": 6.787094712257385, "reward_std": 1.260214388370514, "rewards/mrr_reward": 0.5561384037137032, "rewards/rank_analyze_format_reward": 0.7126235961914062, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 650.375, "epoch": 1.6480000000000001, "grad_norm": 0.03228963539004326, "kl": 0.012664794921875, "learning_rate": 1.9988238053827677e-05, "loss": -0.0375, "reward": 7.256770491600037, "reward_std": 0.48313772678375244, "rewards/mrr_reward": 0.6615699455142021, "rewards/rank_analyze_format_reward": 0.6667077392339706, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 667.546875, "epoch": 1.6560000000000001, "grad_norm": 0.03247380256652832, "kl": 0.011510848999023438, "learning_rate": 1.9988115876819654e-05, "loss": -0.0066, "reward": 7.226160883903503, "reward_std": 0.7291913609951735, "rewards/mrr_reward": 0.6235677003860474, "rewards/rank_analyze_format_reward": 0.7994760870933533, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 617.203125, "epoch": 1.6640000000000001, "grad_norm": 0.03485206514596939, "kl": 0.012271881103515625, "learning_rate": 1.9987993068907624e-05, "loss": -0.0256, "reward": 6.819635629653931, "reward_std": 1.4911159574985504, "rewards/mrr_reward": 0.5687500089406967, "rewards/rank_analyze_format_reward": 0.6510217636823654, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.014248084276914597, "rewards/rank_initial_format_reward": 0.9963235259056091, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9963235259056091, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 640.609375, "epoch": 1.6720000000000002, "grad_norm": 0.03412061929702759, "kl": 0.011951446533203125, "learning_rate": 1.9987869630099333e-05, "loss": -0.0183, "reward": 7.066570281982422, "reward_std": 1.0215441137552261, "rewards/mrr_reward": 0.6142113208770752, "rewards/rank_analyze_format_reward": 0.7064512819051743, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9985119104385376, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 618.9375, "epoch": 1.6800000000000002, "grad_norm": 0.035708099603652954, "kl": 0.011415481567382812, "learning_rate": 1.998774556040259e-05, "loss": 0.0207, "reward": 7.148289203643799, "reward_std": 0.40048687532544136, "rewards/mrr_reward": 0.6233135014772415, "rewards/rank_analyze_format_reward": 0.6780079305171967, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 642.90625, "epoch": 1.688, "grad_norm": 0.03594611957669258, "kl": 0.012083053588867188, "learning_rate": 1.9987620859825225e-05, "loss": 0.007, "reward": 7.130272626876831, "reward_std": 1.0038132444024086, "rewards/mrr_reward": 0.5943328440189362, "rewards/rank_analyze_format_reward": 0.8232535421848297, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 632.3125, "epoch": 1.696, "grad_norm": 0.03679274767637253, "kl": 0.012105941772460938, "learning_rate": 1.9987495528375115e-05, "loss": 0.0071, "reward": 7.324402451515198, "reward_std": 1.0858530811965466, "rewards/mrr_reward": 0.6619791686534882, "rewards/rank_analyze_format_reward": 0.7507043033838272, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 633.578125, "epoch": 1.704, "grad_norm": 0.032590728253126144, "kl": 0.011951446533203125, "learning_rate": 1.998736956606018e-05, "loss": -0.0204, "reward": 7.353400826454163, "reward_std": 1.2198131084442139, "rewards/mrr_reward": 0.6886904761195183, "rewards/rank_analyze_format_reward": 0.7275451272726059, "rewards/rank_answer_foramt_reward": 0.92578125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 648.125, "epoch": 1.712, "grad_norm": 0.03516772761940956, "kl": 0.011865615844726562, "learning_rate": 1.9987242972888368e-05, "loss": -0.0256, "reward": 6.390246629714966, "reward_std": 1.2206433862447739, "rewards/mrr_reward": 0.4318266250193119, "rewards/rank_analyze_format_reward": 0.744662880897522, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 626.515625, "epoch": 1.72, "grad_norm": 0.034917186945676804, "kl": 0.010568618774414062, "learning_rate": 1.9987115748867685e-05, "loss": -0.0075, "reward": 7.013459086418152, "reward_std": 1.1758202761411667, "rewards/mrr_reward": 0.6146267428994179, "rewards/rank_analyze_format_reward": 0.6679250225424767, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 645.75, "epoch": 1.728, "grad_norm": 0.03526122495532036, "kl": 0.011056900024414062, "learning_rate": 1.9986987894006164e-05, "loss": -0.0348, "reward": 7.004386067390442, "reward_std": 1.013509213924408, "rewards/mrr_reward": 0.6190104186534882, "rewards/rank_analyze_format_reward": 0.6921346038579941, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.014774133451282978, "rewards/rank_initial_format_reward": 0.9898194670677185, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9898194670677185, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 587.296875, "epoch": 1.736, "grad_norm": 0.03770997375249863, "kl": 0.013156890869140625, "learning_rate": 1.9986859408311878e-05, "loss": -0.0243, "reward": 7.723721385002136, "reward_std": 1.2577708065509796, "rewards/mrr_reward": 0.8122829794883728, "rewards/rank_analyze_format_reward": 0.5527143776416779, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 625.390625, "epoch": 1.744, "grad_norm": 0.03594063222408295, "kl": 0.01361083984375, "learning_rate": 1.9986730291792945e-05, "loss": -0.0125, "reward": 6.763970732688904, "reward_std": 1.1345993727445602, "rewards/mrr_reward": 0.5692894533276558, "rewards/rank_analyze_format_reward": 0.6561181470751762, "rewards/rank_answer_foramt_reward": 0.857421875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 595.8125, "epoch": 1.752, "grad_norm": 0.03884103149175644, "kl": 0.012781143188476562, "learning_rate": 1.9986600544457524e-05, "loss": -0.0204, "reward": 6.09786331653595, "reward_std": 1.128006488084793, "rewards/mrr_reward": 0.4502604268491268, "rewards/rank_analyze_format_reward": 0.48041532188653946, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 611.0625, "epoch": 1.76, "grad_norm": 0.03623896837234497, "kl": 0.01174163818359375, "learning_rate": 1.9986470166313805e-05, "loss": 0.0022, "reward": 6.999427080154419, "reward_std": 0.6746486648917198, "rewards/mrr_reward": 0.608004704117775, "rewards/rank_analyze_format_reward": 0.6940975040197372, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991554021835327, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9835304021835327, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 640.984375, "epoch": 1.768, "grad_norm": 0.03681391850113869, "kl": 0.01168060302734375, "learning_rate": 1.9986339157370026e-05, "loss": 0.0156, "reward": 6.224501371383667, "reward_std": 1.1424128413200378, "rewards/mrr_reward": 0.4014260917901993, "rewards/rank_analyze_format_reward": 0.7514945864677429, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 675.609375, "epoch": 1.776, "grad_norm": 0.035715728998184204, "kl": 0.011419296264648438, "learning_rate": 1.9986207517634466e-05, "loss": -0.0075, "reward": 6.838769316673279, "reward_std": 1.138169839978218, "rewards/mrr_reward": 0.5326946973800659, "rewards/rank_analyze_format_reward": 0.8114955276250839, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9951225072145462, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9951225072145462, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 650.0625, "epoch": 1.784, "grad_norm": 0.03682604804635048, "kl": 0.012578964233398438, "learning_rate": 1.998607524711543e-05, "loss": -0.024, "reward": 6.9665446281433105, "reward_std": 1.341919094324112, "rewards/mrr_reward": 0.5829861015081406, "rewards/rank_analyze_format_reward": 0.8279595226049423, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9453125, "rewards/rank_verify_format_reward": 0.984375, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 664.046875, "epoch": 1.792, "grad_norm": 0.03405497223138809, "kl": 0.012273788452148438, "learning_rate": 1.9985942345821285e-05, "loss": 0.0101, "reward": 7.542881608009338, "reward_std": 0.9405869543552399, "rewards/mrr_reward": 0.70331721752882, "rewards/rank_analyze_format_reward": 0.8331284523010254, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 641.34375, "epoch": 1.8, "grad_norm": 0.03588543459773064, "kl": 0.010999679565429688, "learning_rate": 1.998580881376042e-05, "loss": 0.0182, "reward": 7.186712980270386, "reward_std": 1.0480735301971436, "rewards/mrr_reward": 0.6471106112003326, "rewards/rank_analyze_format_reward": 0.7620245963335037, "rewards/rank_answer_foramt_reward": 0.869140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 679.96875, "epoch": 1.808, "grad_norm": 0.03298197686672211, "kl": 0.011339187622070312, "learning_rate": 1.9985674650941265e-05, "loss": -0.0075, "reward": 6.580728888511658, "reward_std": 1.171303242444992, "rewards/mrr_reward": 0.49358879029750824, "rewards/rank_analyze_format_reward": 0.7704363465309143, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.984375, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 641.984375, "epoch": 1.8159999999999998, "grad_norm": 0.032649777829647064, "kl": 0.011322021484375, "learning_rate": 1.9985539857372303e-05, "loss": -0.0173, "reward": 6.867309093475342, "reward_std": 0.8678697645664215, "rewards/mrr_reward": 0.557161457836628, "rewards/rank_analyze_format_reward": 0.736319363117218, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 644.921875, "epoch": 1.8239999999999998, "grad_norm": 0.038027409464120865, "kl": 0.011888504028320312, "learning_rate": 1.998540443306204e-05, "loss": 0.0094, "reward": 6.406673431396484, "reward_std": 1.36880823969841, "rewards/mrr_reward": 0.47701510787010193, "rewards/rank_analyze_format_reward": 0.7093206197023392, "rewards/rank_answer_foramt_reward": 0.84765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 691.5625, "epoch": 1.8319999999999999, "grad_norm": 0.03739263862371445, "kl": 0.016162872314453125, "learning_rate": 1.998526837801904e-05, "loss": -0.0163, "reward": 6.16663670539856, "reward_std": 0.7895801216363907, "rewards/mrr_reward": 0.36532738618552685, "rewards/rank_analyze_format_reward": 0.8101400434970856, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.013089364394545555, "rewards/rank_initial_format_reward": 0.9976895451545715, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9820645451545715, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 646.640625, "epoch": 1.8399999999999999, "grad_norm": 0.03776266425848007, "kl": 0.010849952697753906, "learning_rate": 1.9985131692251887e-05, "loss": 0.0068, "reward": 6.760786771774292, "reward_std": 1.123057559132576, "rewards/mrr_reward": 0.5368923768401146, "rewards/rank_analyze_format_reward": 0.7293006330728531, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985989332199097, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9985989332199097, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 613.015625, "epoch": 1.8479999999999999, "grad_norm": 0.03792120888829231, "kl": 0.01201629638671875, "learning_rate": 1.9984994375769222e-05, "loss": -0.0071, "reward": 7.100589036941528, "reward_std": 1.1812313869595528, "rewards/mrr_reward": 0.6353298723697662, "rewards/rank_analyze_format_reward": 0.6587639302015305, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 645.921875, "epoch": 1.8559999999999999, "grad_norm": 0.034471407532691956, "kl": 0.012517929077148438, "learning_rate": 1.9984856428579717e-05, "loss": -0.0154, "reward": 7.1253886222839355, "reward_std": 0.9221947491168976, "rewards/mrr_reward": 0.6032862067222595, "rewards/rank_analyze_format_reward": 0.8059941083192825, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 621.453125, "epoch": 1.8639999999999999, "grad_norm": 0.03487012907862663, "kl": 0.010175704956054688, "learning_rate": 1.998471785069208e-05, "loss": -0.0252, "reward": 7.108256816864014, "reward_std": 1.0599358081817627, "rewards/mrr_reward": 0.6225880309939384, "rewards/rank_analyze_format_reward": 0.7038420140743256, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 614.71875, "epoch": 1.8719999999999999, "grad_norm": 0.03678525239229202, "kl": 0.011201858520507812, "learning_rate": 1.9984578642115072e-05, "loss": -0.0072, "reward": 7.174077749252319, "reward_std": 1.0892403870821, "rewards/mrr_reward": 0.6339843720197678, "rewards/rank_analyze_format_reward": 0.7592339366674423, "rewards/rank_answer_foramt_reward": 0.92578125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 626.484375, "epoch": 1.88, "grad_norm": 0.03547394275665283, "kl": 0.012744903564453125, "learning_rate": 1.998443880285748e-05, "loss": -0.0371, "reward": 7.188539266586304, "reward_std": 1.558995470404625, "rewards/mrr_reward": 0.6575520932674408, "rewards/rank_analyze_format_reward": 0.7216200232505798, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9828085899353027, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9828085899353027, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 612.84375, "epoch": 1.888, "grad_norm": 0.03983930125832558, "kl": 0.011608123779296875, "learning_rate": 1.9984298332928142e-05, "loss": -0.0087, "reward": 7.840075254440308, "reward_std": 1.4074196517467499, "rewards/mrr_reward": 0.8069010525941849, "rewards/rank_analyze_format_reward": 0.7511429786682129, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.96875, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 608.953125, "epoch": 1.896, "grad_norm": 0.03890369087457657, "kl": 0.012737274169921875, "learning_rate": 1.9984157232335926e-05, "loss": -0.0036, "reward": 6.91395902633667, "reward_std": 1.4606387615203857, "rewards/mrr_reward": 0.5853298753499985, "rewards/rank_analyze_format_reward": 0.6925735026597977, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966736733913422, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9966736733913422, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 626.671875, "epoch": 1.904, "grad_norm": 0.03019222430884838, "kl": 0.010616302490234375, "learning_rate": 1.998401550108975e-05, "loss": -0.0175, "reward": 7.32897675037384, "reward_std": 0.9665245488286018, "rewards/mrr_reward": 0.676432304084301, "rewards/rank_analyze_format_reward": 0.7130914330482483, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 636.8125, "epoch": 1.912, "grad_norm": 0.03469119966030121, "kl": 0.011953353881835938, "learning_rate": 1.9983873139198565e-05, "loss": 0.0037, "reward": 6.612988352775574, "reward_std": 1.0446814224123955, "rewards/mrr_reward": 0.47405755519866943, "rewards/rank_analyze_format_reward": 0.7889089584350586, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 621.203125, "epoch": 1.92, "grad_norm": 0.03232351318001747, "kl": 0.0106048583984375, "learning_rate": 1.9983730146671363e-05, "loss": -0.0148, "reward": 6.731534361839294, "reward_std": 1.2606956362724304, "rewards/mrr_reward": 0.5494357720017433, "rewards/rank_analyze_format_reward": 0.6778992190957069, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 606.515625, "epoch": 1.928, "grad_norm": 0.03420478478074074, "kl": 0.01219940185546875, "learning_rate": 1.9983586523517175e-05, "loss": -0.0438, "reward": 7.590452075004578, "reward_std": 1.6388859748840332, "rewards/mrr_reward": 0.7669270932674408, "rewards/rank_analyze_format_reward": 0.6672752201557159, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.96875, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 636.84375, "epoch": 1.936, "grad_norm": 0.03276892751455307, "kl": 0.01093292236328125, "learning_rate": 1.9983442269745073e-05, "loss": -0.0257, "reward": 6.300868988037109, "reward_std": 0.995959609746933, "rewards/mrr_reward": 0.4575396776199341, "rewards/rank_analyze_format_reward": 0.6771672368049622, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9954044073820114, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9954044073820114, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 617.359375, "epoch": 1.944, "grad_norm": 0.03749570995569229, "kl": 0.010999679565429688, "learning_rate": 1.9983297385364166e-05, "loss": -0.0007, "reward": 7.169430136680603, "reward_std": 1.120530128479004, "rewards/mrr_reward": 0.6705729141831398, "rewards/rank_analyze_format_reward": 0.6551071107387543, "rewards/rank_answer_foramt_reward": 0.86328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 613.453125, "epoch": 1.952, "grad_norm": 0.04473373666405678, "kl": 0.01145172119140625, "learning_rate": 1.9983151870383614e-05, "loss": -0.0107, "reward": 6.484450101852417, "reward_std": 1.0988103747367859, "rewards/mrr_reward": 0.46861979365348816, "rewards/rank_analyze_format_reward": 0.764109417796135, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9932432472705841, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9776182472705841, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 621.328125, "epoch": 1.96, "grad_norm": 0.03367482125759125, "kl": 0.013071060180664062, "learning_rate": 1.99830057248126e-05, "loss": -0.0296, "reward": 6.685883641242981, "reward_std": 0.9533030688762665, "rewards/mrr_reward": 0.5285590291023254, "rewards/rank_analyze_format_reward": 0.6315773874521255, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 655.34375, "epoch": 1.968, "grad_norm": 0.034726452082395554, "kl": 0.010492324829101562, "learning_rate": 1.9982858948660363e-05, "loss": -0.0181, "reward": 6.672136902809143, "reward_std": 1.0319916605949402, "rewards/mrr_reward": 0.48452381789684296, "rewards/rank_analyze_format_reward": 0.7848227173089981, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 612.4375, "epoch": 1.976, "grad_norm": 0.03818318620324135, "kl": 0.00988006591796875, "learning_rate": 1.9982711541936167e-05, "loss": -0.0117, "reward": 7.333935976028442, "reward_std": 1.081397719681263, "rewards/mrr_reward": 0.6711309552192688, "rewards/rank_analyze_format_reward": 0.7507446557283401, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 619.75, "epoch": 1.984, "grad_norm": 0.09436666965484619, "kl": 0.037281036376953125, "learning_rate": 1.9982563504649327e-05, "loss": -0.0099, "reward": 7.042810320854187, "reward_std": 1.4771567583084106, "rewards/mrr_reward": 0.6272321417927742, "rewards/rank_analyze_format_reward": 0.7505638301372528, "rewards/rank_answer_foramt_reward": 0.80859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 593.03125, "epoch": 1.992, "grad_norm": 0.040008366107940674, "kl": 0.011211395263671875, "learning_rate": 1.998241483680919e-05, "loss": 0.0073, "reward": 6.97391951084137, "reward_std": 1.2818303257226944, "rewards/mrr_reward": 0.5991319715976715, "rewards/rank_analyze_format_reward": 0.7252494841814041, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9827118366956711, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 644.59375, "epoch": 2.0, "grad_norm": 0.03942989930510521, "kl": 0.011959075927734375, "learning_rate": 1.9982265538425157e-05, "loss": 0.0371, "reward": 6.234715461730957, "reward_std": 1.436354637145996, "rewards/mrr_reward": 0.47746776789426804, "rewards/rank_analyze_format_reward": 0.5933748111128807, "rewards/rank_answer_foramt_reward": 0.818359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9800696671009064, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 0.9799154698848724, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 636.46875, "epoch": 2.008, "grad_norm": 0.03840762376785278, "kl": 0.01082611083984375, "learning_rate": 1.9982115609506648e-05, "loss": -0.0149, "reward": 7.465001344680786, "reward_std": 1.3534227311611176, "rewards/mrr_reward": 0.701078861951828, "rewards/rank_analyze_format_reward": 0.73264279961586, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 623.4375, "epoch": 2.016, "grad_norm": 0.03643304482102394, "kl": 0.0111846923828125, "learning_rate": 1.9981965050063134e-05, "loss": 0.0095, "reward": 6.563894629478455, "reward_std": 1.0918782949447632, "rewards/mrr_reward": 0.49427083879709244, "rewards/rank_analyze_format_reward": 0.6970945447683334, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9975927919149399, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 623.140625, "epoch": 2.024, "grad_norm": 0.03906136751174927, "kl": 0.011167526245117188, "learning_rate": 1.998181386010413e-05, "loss": 0.0076, "reward": 7.883460879325867, "reward_std": 0.9759941548109055, "rewards/mrr_reward": 0.7747395783662796, "rewards/rank_analyze_format_reward": 0.8154443502426147, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 589.5, "epoch": 2.032, "grad_norm": 0.035734061151742935, "kl": 0.014560699462890625, "learning_rate": 1.9981662039639182e-05, "loss": -0.0189, "reward": 7.1975014209747314, "reward_std": 1.0746060460805893, "rewards/mrr_reward": 0.6796006858348846, "rewards/rank_analyze_format_reward": 0.5943331569433212, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 608.125, "epoch": 2.04, "grad_norm": 0.035253558307886124, "kl": 0.011186599731445312, "learning_rate": 1.9981509588677883e-05, "loss": -0.0403, "reward": 6.368244171142578, "reward_std": 0.9275897480547428, "rewards/mrr_reward": 0.43844248354434967, "rewards/rank_analyze_format_reward": 0.7043182849884033, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 593.515625, "epoch": 2.048, "grad_norm": 0.03972550854086876, "kl": 0.012750625610351562, "learning_rate": 1.9981356507229862e-05, "loss": -0.0269, "reward": 6.800292491912842, "reward_std": 1.1689245849847794, "rewards/mrr_reward": 0.5714161694049835, "rewards/rank_analyze_format_reward": 0.627255916595459, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9964202791452408, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9964202791452408, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 593.203125, "epoch": 2.056, "grad_norm": 0.03649269416928291, "kl": 0.009584426879882812, "learning_rate": 1.9981202795304787e-05, "loss": -0.0051, "reward": 7.230230689048767, "reward_std": 1.2953073680400848, "rewards/mrr_reward": 0.6908172070980072, "rewards/rank_analyze_format_reward": 0.6150907501578331, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 640.5, "epoch": 2.064, "grad_norm": 0.03631464019417763, "kl": 0.010379791259765625, "learning_rate": 1.9981048452912364e-05, "loss": 0.0223, "reward": 6.423146486282349, "reward_std": 1.1042785942554474, "rewards/mrr_reward": 0.46945685893297195, "rewards/rank_analyze_format_reward": 0.7502825409173965, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9140625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 601.90625, "epoch": 2.072, "grad_norm": 0.03482425957918167, "kl": 0.011255264282226562, "learning_rate": 1.998089348006235e-05, "loss": -0.0123, "reward": 6.214681625366211, "reward_std": 1.3232944011688232, "rewards/mrr_reward": 0.4206349328160286, "rewards/rank_analyze_format_reward": 0.6760562360286713, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 607.171875, "epoch": 2.08, "grad_norm": 0.03515848517417908, "kl": 0.008946418762207031, "learning_rate": 1.998073787676453e-05, "loss": -0.0182, "reward": 6.849403977394104, "reward_std": 1.1705361306667328, "rewards/mrr_reward": 0.5722842365503311, "rewards/rank_analyze_format_reward": 0.6880913898348808, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976112246513367, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9976112246513367, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 625.765625, "epoch": 2.088, "grad_norm": 0.033137645572423935, "kl": 0.010486602783203125, "learning_rate": 1.9980581643028732e-05, "loss": -0.0257, "reward": 6.725158452987671, "reward_std": 0.918092668056488, "rewards/mrr_reward": 0.5186383947730064, "rewards/rank_analyze_format_reward": 0.7032241895794868, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 596.21875, "epoch": 2.096, "grad_norm": 0.03581111505627632, "kl": 0.011474609375, "learning_rate": 1.9980424778864825e-05, "loss": -0.028, "reward": 6.540898442268372, "reward_std": 1.0225854963064194, "rewards/mrr_reward": 0.4951760917901993, "rewards/rank_analyze_format_reward": 0.618479423224926, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 587.578125, "epoch": 2.104, "grad_norm": 0.033393606543540955, "kl": 0.009927749633789062, "learning_rate": 1.9980267284282718e-05, "loss": -0.0212, "reward": 7.45247495174408, "reward_std": 0.4263784661889076, "rewards/mrr_reward": 0.7192708477377892, "rewards/rank_analyze_format_reward": 0.5848489105701447, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 607.234375, "epoch": 2.112, "grad_norm": 0.034759897738695145, "kl": 0.008701324462890625, "learning_rate": 1.998010915929236e-05, "loss": -0.0146, "reward": 7.091454982757568, "reward_std": 0.9185773134231567, "rewards/mrr_reward": 0.6087363660335541, "rewards/rank_analyze_format_reward": 0.6858063042163849, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 601.015625, "epoch": 2.12, "grad_norm": 0.03577468544244766, "kl": 0.01145172119140625, "learning_rate": 1.9979950403903732e-05, "loss": -0.0014, "reward": 6.77937126159668, "reward_std": 1.279131755232811, "rewards/mrr_reward": 0.563430055975914, "rewards/rank_analyze_format_reward": 0.6385087594389915, "rewards/rank_answer_foramt_reward": 0.8984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 559.375, "epoch": 2.128, "grad_norm": 0.03831040486693382, "kl": 0.010589599609375, "learning_rate": 1.9979791018126874e-05, "loss": -0.0106, "reward": 6.678526520729065, "reward_std": 1.4866646826267242, "rewards/mrr_reward": 0.5484995096921921, "rewards/rank_analyze_format_reward": 0.595856636762619, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 583.3125, "epoch": 2.136, "grad_norm": 0.0412379652261734, "kl": 0.010957717895507812, "learning_rate": 1.9979631001971848e-05, "loss": -0.0116, "reward": 7.416189789772034, "reward_std": 1.0926668643951416, "rewards/mrr_reward": 0.7192708253860474, "rewards/rank_analyze_format_reward": 0.6211378127336502, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 609.828125, "epoch": 2.144, "grad_norm": 0.03350284695625305, "kl": 0.008955001831054688, "learning_rate": 1.9979470355448756e-05, "loss": -0.0158, "reward": 7.620032906532288, "reward_std": 0.6238258853554726, "rewards/mrr_reward": 0.7218749970197678, "rewards/rank_analyze_format_reward": 0.7774548083543777, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 592.359375, "epoch": 2.152, "grad_norm": 0.03770367428660393, "kl": 0.011816024780273438, "learning_rate": 1.9979309078567756e-05, "loss": -0.0043, "reward": 6.694323897361755, "reward_std": 1.3028307557106018, "rewards/mrr_reward": 0.5659226104617119, "rewards/rank_analyze_format_reward": 0.5534504503011703, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 606.828125, "epoch": 2.16, "grad_norm": 0.03764275088906288, "kl": 0.009786605834960938, "learning_rate": 1.9979147171339022e-05, "loss": -0.019, "reward": 6.99415135383606, "reward_std": 1.3437075316905975, "rewards/mrr_reward": 0.6053075417876244, "rewards/rank_analyze_format_reward": 0.6764369979500771, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 595.4375, "epoch": 2.168, "grad_norm": 0.03837438300251961, "kl": 0.011205673217773438, "learning_rate": 1.9978984633772795e-05, "loss": -0.0289, "reward": 5.901566505432129, "reward_std": 0.9236202016472816, "rewards/mrr_reward": 0.35381324775516987, "rewards/rank_analyze_format_reward": 0.59522345662117, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 575.375, "epoch": 2.176, "grad_norm": 0.04059956222772598, "kl": 0.011159896850585938, "learning_rate": 1.9978821465879332e-05, "loss": -0.0362, "reward": 6.7173460721969604, "reward_std": 0.7962133586406708, "rewards/mrr_reward": 0.5370783656835556, "rewards/rank_analyze_format_reward": 0.6237201392650604, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 570.328125, "epoch": 2.184, "grad_norm": 0.03902539238333702, "kl": 0.0102081298828125, "learning_rate": 1.9978657667668945e-05, "loss": -0.032, "reward": 6.786892771720886, "reward_std": 1.4763158559799194, "rewards/mrr_reward": 0.6156250163912773, "rewards/rank_analyze_format_reward": 0.49029337987303734, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 632.28125, "epoch": 2.192, "grad_norm": 0.035940494388341904, "kl": 0.012010574340820312, "learning_rate": 1.9978493239151976e-05, "loss": -0.0052, "reward": 7.241865515708923, "reward_std": 1.5207486748695374, "rewards/mrr_reward": 0.6480902805924416, "rewards/rank_analyze_format_reward": 0.7944418787956238, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968671798706055, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9968671798706055, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 616.765625, "epoch": 2.2, "grad_norm": 0.03509166091680527, "kl": 0.013257980346679688, "learning_rate": 1.997832818033881e-05, "loss": 0.0139, "reward": 6.9878867864608765, "reward_std": 1.2263060361146927, "rewards/mrr_reward": 0.592051088809967, "rewards/rank_analyze_format_reward": 0.7836297750473022, "rewards/rank_answer_foramt_reward": 0.884765625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 574.640625, "epoch": 2.208, "grad_norm": 0.03720112144947052, "kl": 0.013032913208007812, "learning_rate": 1.9978162491239882e-05, "loss": -0.0178, "reward": 7.190923571586609, "reward_std": 1.171968013048172, "rewards/mrr_reward": 0.640625, "rewards/rank_analyze_format_reward": 0.6909236311912537, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 613.5625, "epoch": 2.216, "grad_norm": 0.04091575741767883, "kl": 0.012386322021484375, "learning_rate": 1.997799617186565e-05, "loss": -0.003, "reward": 6.570623397827148, "reward_std": 1.0336104482412338, "rewards/mrr_reward": 0.48072298616170883, "rewards/rank_analyze_format_reward": 0.7270140051841736, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9994212985038757, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9994212985038757, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 622.53125, "epoch": 2.224, "grad_norm": 0.03800741583108902, "kl": 0.01259613037109375, "learning_rate": 1.9977829222226622e-05, "loss": -0.0266, "reward": 6.372930645942688, "reward_std": 0.8913363832980394, "rewards/mrr_reward": 0.46861979365348816, "rewards/rank_analyze_format_reward": 0.731744721531868, "rewards/rank_answer_foramt_reward": 0.802734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9976112246513367, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9819862246513367, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 608.8125, "epoch": 2.232, "grad_norm": 0.0358574353158474, "kl": 0.012134552001953125, "learning_rate": 1.9977661642333344e-05, "loss": -0.0335, "reward": 6.156337261199951, "reward_std": 1.1192015409469604, "rewards/mrr_reward": 0.4033792242407799, "rewards/rank_analyze_format_reward": 0.7136551886796951, "rewards/rank_answer_foramt_reward": 0.859375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966137856245041, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9966137856245041, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 590.6875, "epoch": 2.24, "grad_norm": 0.03665808215737343, "kl": 0.012432098388671875, "learning_rate": 1.99774934321964e-05, "loss": -0.0148, "reward": 7.189491271972656, "reward_std": 1.3065388202667236, "rewards/mrr_reward": 0.682291679084301, "rewards/rank_analyze_format_reward": 0.6308017671108246, "rewards/rank_answer_foramt_reward": 0.8828125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9967927634716034, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9811677634716034, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 632.296875, "epoch": 2.248, "grad_norm": 0.039501260966062546, "kl": 0.010639190673828125, "learning_rate": 1.9977324591826415e-05, "loss": -0.0105, "reward": 6.4820040464401245, "reward_std": 1.1038605086505413, "rewards/mrr_reward": 0.45491691678762436, "rewards/rank_analyze_format_reward": 0.7659522593021393, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.012876884080469608, "rewards/rank_initial_format_reward": 0.9974177181720734, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9974177181720734, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 621.796875, "epoch": 2.2560000000000002, "grad_norm": 0.040637850761413574, "kl": 0.012548446655273438, "learning_rate": 1.9977155121234056e-05, "loss": 0.008, "reward": 6.498598098754883, "reward_std": 1.4399305284023285, "rewards/mrr_reward": 0.4924045279622078, "rewards/rank_analyze_format_reward": 0.7026933282613754, "rewards/rank_answer_foramt_reward": 0.8359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 622.609375, "epoch": 2.2640000000000002, "grad_norm": 0.039998337626457214, "kl": 0.01102447509765625, "learning_rate": 1.9976985020430022e-05, "loss": 0.0019, "reward": 6.484407901763916, "reward_std": 0.9918918311595917, "rewards/mrr_reward": 0.4627170190215111, "rewards/rank_analyze_format_reward": 0.7155710011720657, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 634.28125, "epoch": 2.2720000000000002, "grad_norm": 0.034088097512722015, "kl": 0.0094451904296875, "learning_rate": 1.9976814289425066e-05, "loss": 0.0066, "reward": 6.765654683113098, "reward_std": 1.0432685762643814, "rewards/mrr_reward": 0.5332837402820587, "rewards/rank_analyze_format_reward": 0.6904967427253723, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 620.1875, "epoch": 2.2800000000000002, "grad_norm": 0.03567035123705864, "kl": 0.015941619873046875, "learning_rate": 1.9976642928229965e-05, "loss": -0.0143, "reward": 7.0589940547943115, "reward_std": 0.7747539728879929, "rewards/mrr_reward": 0.5898003429174423, "rewards/rank_analyze_format_reward": 0.7460509389638901, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 640.59375, "epoch": 2.288, "grad_norm": 0.033182136714458466, "kl": 0.009305953979492188, "learning_rate": 1.997647093685555e-05, "loss": 0.0029, "reward": 7.651683449745178, "reward_std": 0.4577641859650612, "rewards/mrr_reward": 0.7307725697755814, "rewards/rank_analyze_format_reward": 0.7285931408405304, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 589.5, "epoch": 2.296, "grad_norm": 0.03666054829955101, "kl": 0.010992050170898438, "learning_rate": 1.9976298315312675e-05, "loss": -0.0206, "reward": 7.6038994789123535, "reward_std": 1.4697020053863525, "rewards/mrr_reward": 0.7263020724058151, "rewards/rank_analyze_format_reward": 0.7533785998821259, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 636.65625, "epoch": 2.304, "grad_norm": 0.03196245804429054, "kl": 0.009922027587890625, "learning_rate": 1.9976125063612254e-05, "loss": -0.0084, "reward": 7.176369905471802, "reward_std": 1.0738315135240555, "rewards/mrr_reward": 0.6143229156732559, "rewards/rank_analyze_format_reward": 0.7952503263950348, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 575.3125, "epoch": 2.312, "grad_norm": 0.036641672253608704, "kl": 0.01151275634765625, "learning_rate": 1.9975951181765226e-05, "loss": -0.0135, "reward": 6.732638239860535, "reward_std": 1.1722622215747833, "rewards/mrr_reward": 0.5541604608297348, "rewards/rank_analyze_format_reward": 0.6565065011382103, "rewards/rank_answer_foramt_reward": 0.861328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 620.125, "epoch": 2.32, "grad_norm": 0.03352293372154236, "kl": 0.00873565673828125, "learning_rate": 1.9975776669782572e-05, "loss": -0.0098, "reward": 7.056705951690674, "reward_std": 0.74837876111269, "rewards/mrr_reward": 0.5602182596921921, "rewards/rank_analyze_format_reward": 0.8049077540636063, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.012763278558850288, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 594.203125, "epoch": 2.328, "grad_norm": 0.03776485472917557, "kl": 0.011941909790039062, "learning_rate": 1.997560152767532e-05, "loss": -0.011, "reward": 7.487109661102295, "reward_std": 0.8209907524287701, "rewards/mrr_reward": 0.7063492089509964, "rewards/rank_analyze_format_reward": 0.7513267993927002, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 610.453125, "epoch": 2.336, "grad_norm": 0.037363357841968536, "kl": 0.013795852661132812, "learning_rate": 1.997542575545453e-05, "loss": 0.0103, "reward": 7.0443562269210815, "reward_std": 1.2127674743533134, "rewards/mrr_reward": 0.5873635932803154, "rewards/rank_analyze_format_reward": 0.7670525759458542, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 590.578125, "epoch": 2.344, "grad_norm": 0.03627763316035271, "kl": 0.01107025146484375, "learning_rate": 1.9975249353131304e-05, "loss": -0.0153, "reward": 7.811681151390076, "reward_std": 1.2126767039299011, "rewards/mrr_reward": 0.8069444298744202, "rewards/rank_analyze_format_reward": 0.6737470030784607, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 590.75, "epoch": 2.352, "grad_norm": 0.03678389638662338, "kl": 0.009225845336914062, "learning_rate": 1.9975072320716785e-05, "loss": -0.0396, "reward": 6.60707688331604, "reward_std": 1.2315413057804108, "rewards/mrr_reward": 0.5236669182777405, "rewards/rank_analyze_format_reward": 0.584445059299469, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 576.390625, "epoch": 2.36, "grad_norm": 0.03965243697166443, "kl": 0.013790130615234375, "learning_rate": 1.997489465822216e-05, "loss": -0.0106, "reward": 7.775085091590881, "reward_std": 1.3139366656541824, "rewards/mrr_reward": 0.8050967454910278, "rewards/rank_analyze_format_reward": 0.6439119428396225, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9944556355476379, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9944556355476379, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 606.703125, "epoch": 2.368, "grad_norm": 0.039732079952955246, "kl": 0.011026382446289062, "learning_rate": 1.9974716365658646e-05, "loss": -0.0467, "reward": 7.427183151245117, "reward_std": 1.2437842339277267, "rewards/mrr_reward": 0.7122395783662796, "rewards/rank_analyze_format_reward": 0.7224476039409637, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 605.25, "epoch": 2.376, "grad_norm": 0.03692779690027237, "kl": 0.010181427001953125, "learning_rate": 1.9974537443037504e-05, "loss": -0.0119, "reward": 7.6130610704422, "reward_std": 1.0890810042619705, "rewards/mrr_reward": 0.7197916656732559, "rewards/rank_analyze_format_reward": 0.8107158541679382, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977221935987473, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9977221935987473, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 611.5625, "epoch": 2.384, "grad_norm": 0.039538830518722534, "kl": 0.013807296752929688, "learning_rate": 1.9974357890370038e-05, "loss": -0.008, "reward": 6.635961890220642, "reward_std": 0.7657184079289436, "rewards/mrr_reward": 0.48133058845996857, "rewards/rank_analyze_format_reward": 0.7749776542186737, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 606.328125, "epoch": 2.392, "grad_norm": 0.03752804920077324, "kl": 0.013395309448242188, "learning_rate": 1.9974177707667594e-05, "loss": 0.0098, "reward": 7.015731453895569, "reward_std": 1.1001620888710022, "rewards/mrr_reward": 0.6202257052063942, "rewards/rank_analyze_format_reward": 0.6712347567081451, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9962500035762787, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9806250035762787, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 593.0, "epoch": 2.4, "grad_norm": 0.039174940437078476, "kl": 0.011379241943359375, "learning_rate": 1.9973996894941545e-05, "loss": -0.0011, "reward": 7.0397127866744995, "reward_std": 1.0055639445781708, "rewards/mrr_reward": 0.5911644473671913, "rewards/rank_analyze_format_reward": 0.7411527559161186, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 590.3125, "epoch": 2.408, "grad_norm": 0.0378861129283905, "kl": 0.011119842529296875, "learning_rate": 1.9973815452203314e-05, "loss": 0.0056, "reward": 7.447056770324707, "reward_std": 1.2125954329967499, "rewards/mrr_reward": 0.7122395783662796, "rewards/rank_analyze_format_reward": 0.669488713145256, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9994612038135529, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9994612038135529, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 565.15625, "epoch": 2.416, "grad_norm": 0.03746671974658966, "kl": 0.011993408203125, "learning_rate": 1.997363337946437e-05, "loss": -0.0198, "reward": 6.575040936470032, "reward_std": 0.9133451133966446, "rewards/mrr_reward": 0.5259300693869591, "rewards/rank_analyze_format_reward": 0.5709301829338074, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 641.546875, "epoch": 2.424, "grad_norm": 0.03554888442158699, "kl": 0.010702133178710938, "learning_rate": 1.9973450676736205e-05, "loss": -0.0074, "reward": 7.236762523651123, "reward_std": 0.604234242811799, "rewards/mrr_reward": 0.6168154701590538, "rewards/rank_analyze_format_reward": 0.8082548528909683, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 614.53125, "epoch": 2.432, "grad_norm": 0.03649809956550598, "kl": 0.011503219604492188, "learning_rate": 1.997326734403036e-05, "loss": -0.0239, "reward": 6.725122928619385, "reward_std": 1.2124179899692535, "rewards/mrr_reward": 0.5331907123327255, "rewards/rank_analyze_format_reward": 0.7333841472864151, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 595.609375, "epoch": 2.44, "grad_norm": 0.03357018902897835, "kl": 0.011198043823242188, "learning_rate": 1.997308338135842e-05, "loss": -0.0394, "reward": 7.099708437919617, "reward_std": 1.0707662254571915, "rewards/mrr_reward": 0.617491327226162, "rewards/rank_analyze_format_reward": 0.6802160441875458, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 639.46875, "epoch": 2.448, "grad_norm": 0.04073842614889145, "kl": 0.01114654541015625, "learning_rate": 1.9972898788732e-05, "loss": -0.0205, "reward": 6.205634713172913, "reward_std": 1.0768165290355682, "rewards/mrr_reward": 0.40212054550647736, "rewards/rank_analyze_format_reward": 0.6713712811470032, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 592.875, "epoch": 2.456, "grad_norm": 0.038296766579151154, "kl": 0.0117034912109375, "learning_rate": 1.9972713566162763e-05, "loss": -0.0115, "reward": 6.65511429309845, "reward_std": 0.8909335732460022, "rewards/mrr_reward": 0.5184585936367512, "rewards/rank_analyze_format_reward": 0.6747215688228607, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 571.96875, "epoch": 2.464, "grad_norm": 0.03361259400844574, "kl": 0.010142326354980469, "learning_rate": 1.997252771366241e-05, "loss": -0.0059, "reward": 7.825888633728027, "reward_std": 0.7059714342467487, "rewards/mrr_reward": 0.8350446447730064, "rewards/rank_analyze_format_reward": 0.4857100807130337, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 606.9375, "epoch": 2.472, "grad_norm": 0.03526763245463371, "kl": 0.0130767822265625, "learning_rate": 1.9972341231242675e-05, "loss": -0.0398, "reward": 6.988335967063904, "reward_std": 0.7815524078905582, "rewards/mrr_reward": 0.5860863253474236, "rewards/rank_analyze_format_reward": 0.722115769982338, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 599.140625, "epoch": 2.48, "grad_norm": 0.03733767569065094, "kl": 0.012132644653320312, "learning_rate": 1.9972154118915344e-05, "loss": -0.0251, "reward": 7.347846150398254, "reward_std": 1.1197139769792557, "rewards/mrr_reward": 0.6794270873069763, "rewards/rank_analyze_format_reward": 0.7024035751819611, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 592.796875, "epoch": 2.488, "grad_norm": 0.037743665277957916, "kl": 0.010995864868164062, "learning_rate": 1.997196637669223e-05, "loss": -0.0057, "reward": 7.16385281085968, "reward_std": 0.9465463161468506, "rewards/mrr_reward": 0.614341527223587, "rewards/rank_analyze_format_reward": 0.7628190815448761, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 628.25, "epoch": 2.496, "grad_norm": 0.03614401817321777, "kl": 0.010850906372070312, "learning_rate": 1.99717780045852e-05, "loss": -0.0312, "reward": 7.732061147689819, "reward_std": 0.6288701333105564, "rewards/mrr_reward": 0.7590463757514954, "rewards/rank_analyze_format_reward": 0.7517846375703812, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 605.609375, "epoch": 2.504, "grad_norm": 0.035900671035051346, "kl": 0.010019302368164062, "learning_rate": 1.997158900260614e-05, "loss": 0.001, "reward": 7.1635472774505615, "reward_std": 1.0679296404123306, "rewards/mrr_reward": 0.6484374925494194, "rewards/rank_analyze_format_reward": 0.6771043539047241, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 596.1875, "epoch": 2.512, "grad_norm": 0.041989874094724655, "kl": 0.015058517456054688, "learning_rate": 1.9971399370767e-05, "loss": -0.0166, "reward": 6.863955616950989, "reward_std": 0.7592495381832123, "rewards/mrr_reward": 0.565854400396347, "rewards/rank_analyze_format_reward": 0.6161628141999245, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 599.0, "epoch": 2.52, "grad_norm": 0.041266754269599915, "kl": 0.013032913208007812, "learning_rate": 1.9971209109079752e-05, "loss": -0.0229, "reward": 7.460736155509949, "reward_std": 1.0799484848976135, "rewards/mrr_reward": 0.7114583253860474, "rewards/rank_analyze_format_reward": 0.6832623034715652, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 646.28125, "epoch": 2.528, "grad_norm": 0.03333236649632454, "kl": 0.009759902954101562, "learning_rate": 1.9971018217556416e-05, "loss": -0.0106, "reward": 6.682798147201538, "reward_std": 0.5989858657121658, "rewards/mrr_reward": 0.4994109719991684, "rewards/rank_analyze_format_reward": 0.7437479048967361, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 631.4375, "epoch": 2.536, "grad_norm": 0.0350511260330677, "kl": 0.010562896728515625, "learning_rate": 1.997082669620905e-05, "loss": -0.0302, "reward": 6.6315062046051025, "reward_std": 1.0686845779418945, "rewards/mrr_reward": 0.4913690462708473, "rewards/rank_analyze_format_reward": 0.7528755962848663, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927770137787, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9975927770137787, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 583.140625, "epoch": 2.544, "grad_norm": 0.03914601355791092, "kl": 0.013078689575195312, "learning_rate": 1.997063454504975e-05, "loss": -0.0055, "reward": 6.575037002563477, "reward_std": 1.3988100588321686, "rewards/mrr_reward": 0.514732152223587, "rewards/rank_analyze_format_reward": 0.6743116676807404, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 618.71875, "epoch": 2.552, "grad_norm": 0.0380953773856163, "kl": 0.013456344604492188, "learning_rate": 1.9970441764090654e-05, "loss": -0.0518, "reward": 7.295857548713684, "reward_std": 1.004029467701912, "rewards/mrr_reward": 0.6721354275941849, "rewards/rank_analyze_format_reward": 0.7370236366987228, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9878805130720139, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9878805130720139, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 613.4375, "epoch": 2.56, "grad_norm": 0.040197595953941345, "kl": 0.013912200927734375, "learning_rate": 1.9970248353343943e-05, "loss": -0.0075, "reward": 6.5366517305374146, "reward_std": 1.0288221687078476, "rewards/mrr_reward": 0.4604600891470909, "rewards/rank_analyze_format_reward": 0.7944208830595016, "rewards/rank_answer_foramt_reward": 0.900390625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 649.296875, "epoch": 2.568, "grad_norm": 0.038607921451330185, "kl": 0.013824462890625, "learning_rate": 1.997005431282183e-05, "loss": 0.0172, "reward": 7.0922359228134155, "reward_std": 1.0854482501745224, "rewards/mrr_reward": 0.603298619389534, "rewards/rank_analyze_format_reward": 0.7901396751403809, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 641.296875, "epoch": 2.576, "grad_norm": 0.03311832994222641, "kl": 0.011646270751953125, "learning_rate": 1.996985964253657e-05, "loss": -0.0369, "reward": 6.7459012269973755, "reward_std": 0.9181017801165581, "rewards/mrr_reward": 0.5021019279956818, "rewards/rank_analyze_format_reward": 0.7544548064470291, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 627.34375, "epoch": 2.584, "grad_norm": 0.037428632378578186, "kl": 0.012990951538085938, "learning_rate": 1.996966434250046e-05, "loss": -0.0228, "reward": 7.209717512130737, "reward_std": 1.1640962213277817, "rewards/mrr_reward": 0.6627604216337204, "rewards/rank_analyze_format_reward": 0.7012539207935333, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.953125, "rewards/rank_verify_format_reward": 1.0, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 586.28125, "epoch": 2.592, "grad_norm": 0.035629965364933014, "kl": 0.011270523071289062, "learning_rate": 1.996946841272584e-05, "loss": -0.0126, "reward": 6.940586090087891, "reward_std": 1.4230458736419678, "rewards/mrr_reward": 0.5958519503474236, "rewards/rank_analyze_format_reward": 0.6704594492912292, "rewards/rank_answer_foramt_reward": 0.88671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 625.984375, "epoch": 2.6, "grad_norm": 0.032520923763513565, "kl": 0.011152267456054688, "learning_rate": 1.9969271853225083e-05, "loss": -0.0061, "reward": 7.102632761001587, "reward_std": 0.8966164737939835, "rewards/mrr_reward": 0.6060329973697662, "rewards/rank_analyze_format_reward": 0.717139944434166, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 620.765625, "epoch": 2.608, "grad_norm": 0.03493724763393402, "kl": 0.011966705322265625, "learning_rate": 1.9969074664010605e-05, "loss": -0.0149, "reward": 6.612971305847168, "reward_std": 0.9198006242513657, "rewards/mrr_reward": 0.479879729449749, "rewards/rank_analyze_format_reward": 0.7714625149965286, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 610.78125, "epoch": 2.616, "grad_norm": 0.03426508232951164, "kl": 0.010488510131835938, "learning_rate": 1.9968876845094864e-05, "loss": -0.0116, "reward": 7.175417423248291, "reward_std": 0.7358394265174866, "rewards/mrr_reward": 0.6250000074505806, "rewards/rank_analyze_format_reward": 0.764837920665741, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 612.21875, "epoch": 2.624, "grad_norm": 0.03677660971879959, "kl": 0.01381683349609375, "learning_rate": 1.996867839649035e-05, "loss": -0.0066, "reward": 7.328829765319824, "reward_std": 0.97315713763237, "rewards/mrr_reward": 0.6791852787137032, "rewards/rank_analyze_format_reward": 0.7214639633893967, "rewards/rank_answer_foramt_reward": 0.9140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 587.625, "epoch": 2.632, "grad_norm": 0.03748798742890358, "kl": 0.01129150390625, "learning_rate": 1.9968479318209603e-05, "loss": 0.0107, "reward": 7.366376042366028, "reward_std": 0.7245956286787987, "rewards/mrr_reward": 0.6915550529956818, "rewards/rank_analyze_format_reward": 0.6485605537891388, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 603.265625, "epoch": 2.64, "grad_norm": 0.038382068276405334, "kl": 0.014995574951171875, "learning_rate": 1.9968279610265194e-05, "loss": -0.0244, "reward": 7.351204872131348, "reward_std": 1.0787476003170013, "rewards/mrr_reward": 0.6888020783662796, "rewards/rank_analyze_format_reward": 0.7559229284524918, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9825367629528046, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 617.171875, "epoch": 2.648, "grad_norm": 0.036839522421360016, "kl": 0.011720657348632812, "learning_rate": 1.9968079272669744e-05, "loss": 0.0057, "reward": 6.830013751983643, "reward_std": 1.1275426745414734, "rewards/mrr_reward": 0.5580295100808144, "rewards/rank_analyze_format_reward": 0.7516124844551086, "rewards/rank_answer_foramt_reward": 0.849609375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 620.265625, "epoch": 2.656, "grad_norm": 0.03564363345503807, "kl": 0.012371063232421875, "learning_rate": 1.9967878305435902e-05, "loss": -0.0231, "reward": 7.337198257446289, "reward_std": 0.7928859405219555, "rewards/mrr_reward": 0.6541852578520775, "rewards/rank_analyze_format_reward": 0.8176662474870682, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9972937107086182, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9972937107086182, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 647.890625, "epoch": 2.664, "grad_norm": 0.038595810532569885, "kl": 0.010858535766601562, "learning_rate": 1.9967676708576362e-05, "loss": -0.0045, "reward": 6.599027991294861, "reward_std": 0.9832871407270432, "rewards/mrr_reward": 0.4508804567158222, "rewards/rank_analyze_format_reward": 0.8443343043327332, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 644.796875, "epoch": 2.672, "grad_norm": 0.03759092092514038, "kl": 0.0118408203125, "learning_rate": 1.9967474482103863e-05, "loss": -0.0121, "reward": 6.94339394569397, "reward_std": 0.9748950749635696, "rewards/mrr_reward": 0.5725632309913635, "rewards/rank_analyze_format_reward": 0.733219176530838, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 660.65625, "epoch": 2.68, "grad_norm": 0.03415609896183014, "kl": 0.011899948120117188, "learning_rate": 1.996727162603117e-05, "loss": -0.0132, "reward": 6.538380742073059, "reward_std": 0.7016656026244164, "rewards/mrr_reward": 0.44487228989601135, "rewards/rank_analyze_format_reward": 0.8194384127855301, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 634.390625, "epoch": 2.6879999999999997, "grad_norm": 0.03727827966213226, "kl": 0.011407852172851562, "learning_rate": 1.9967068140371103e-05, "loss": 0.0018, "reward": 7.043541312217712, "reward_std": 0.7633183086290956, "rewards/mrr_reward": 0.5886718779802322, "rewards/rank_analyze_format_reward": 0.7822953313589096, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 600.71875, "epoch": 2.6959999999999997, "grad_norm": 0.03895430639386177, "kl": 0.013332366943359375, "learning_rate": 1.9966864025136518e-05, "loss": -0.0042, "reward": 6.765047073364258, "reward_std": 0.8223965764045715, "rewards/mrr_reward": 0.5205729305744171, "rewards/rank_analyze_format_reward": 0.7120521813631058, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 660.328125, "epoch": 2.7039999999999997, "grad_norm": 0.038571566343307495, "kl": 0.012439727783203125, "learning_rate": 1.99666592803403e-05, "loss": -0.0154, "reward": 6.975342035293579, "reward_std": 0.840043693780899, "rewards/mrr_reward": 0.5555741637945175, "rewards/rank_analyze_format_reward": 0.811639130115509, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 576.015625, "epoch": 2.7119999999999997, "grad_norm": 0.03610292449593544, "kl": 0.012050628662109375, "learning_rate": 1.9966453905995386e-05, "loss": -0.0219, "reward": 6.419227600097656, "reward_std": 1.1811564713716507, "rewards/mrr_reward": 0.46623264998197556, "rewards/rank_analyze_format_reward": 0.7026196420192719, "rewards/rank_answer_foramt_reward": 0.876953125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 611.953125, "epoch": 2.7199999999999998, "grad_norm": 0.040995605289936066, "kl": 0.010702133178710938, "learning_rate": 1.996624790211475e-05, "loss": 0.0069, "reward": 7.764137506484985, "reward_std": 0.872068215161562, "rewards/mrr_reward": 0.7421006858348846, "rewards/rank_analyze_format_reward": 0.8094066381454468, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 601.390625, "epoch": 2.7279999999999998, "grad_norm": 0.036672018468379974, "kl": 0.010931015014648438, "learning_rate": 1.9966041268711404e-05, "loss": -0.0282, "reward": 7.355572700500488, "reward_std": 0.8193893283605576, "rewards/mrr_reward": 0.6698970645666122, "rewards/rank_analyze_format_reward": 0.7189528197050095, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 595.25, "epoch": 2.7359999999999998, "grad_norm": 0.036827512085437775, "kl": 0.011262893676757812, "learning_rate": 1.9965834005798395e-05, "loss": 0.0009, "reward": 7.232412695884705, "reward_std": 0.9624816030263901, "rewards/mrr_reward": 0.6321304589509964, "rewards/rank_analyze_format_reward": 0.776156485080719, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 583.015625, "epoch": 2.7439999999999998, "grad_norm": 0.041386183351278305, "kl": 0.01598358154296875, "learning_rate": 1.9965626113388823e-05, "loss": -0.0151, "reward": 7.414017677307129, "reward_std": 1.145112544298172, "rewards/mrr_reward": 0.7001488208770752, "rewards/rank_analyze_format_reward": 0.7220657765865326, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 610.671875, "epoch": 2.752, "grad_norm": 0.03522395342588425, "kl": 0.011472702026367188, "learning_rate": 1.9965417591495813e-05, "loss": -0.0021, "reward": 6.261266589164734, "reward_std": 0.648932583630085, "rewards/mrr_reward": 0.4110739082098007, "rewards/rank_analyze_format_reward": 0.6695905476808548, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 595.421875, "epoch": 2.76, "grad_norm": 0.036285560578107834, "kl": 0.011045455932617188, "learning_rate": 1.9965208440132538e-05, "loss": -0.0084, "reward": 7.684949636459351, "reward_std": 0.6939431764185429, "rewards/mrr_reward": 0.7300347238779068, "rewards/rank_analyze_format_reward": 0.782451868057251, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9989919364452362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9989919364452362, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 602.15625, "epoch": 2.768, "grad_norm": 0.03910991922020912, "kl": 0.011774063110351562, "learning_rate": 1.9964998659312212e-05, "loss": -0.0189, "reward": 6.8010218143463135, "reward_std": 0.8553978726267815, "rewards/mrr_reward": 0.5462859645485878, "rewards/rank_analyze_format_reward": 0.7228547036647797, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9943632036447525, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9943632036447525, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 611.359375, "epoch": 2.776, "grad_norm": 0.038007643073797226, "kl": 0.010667800903320312, "learning_rate": 1.996478824904808e-05, "loss": 0.003, "reward": 7.355239748954773, "reward_std": 0.9060100615024567, "rewards/mrr_reward": 0.6795572899281979, "rewards/rank_analyze_format_reward": 0.7034169733524323, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 597.8125, "epoch": 2.784, "grad_norm": 0.03934268653392792, "kl": 0.01244354248046875, "learning_rate": 1.9964577209353438e-05, "loss": -0.0656, "reward": 7.2533485889434814, "reward_std": 1.1916275918483734, "rewards/mrr_reward": 0.6880208402872086, "rewards/rank_analyze_format_reward": 0.6423462107777596, "rewards/rank_answer_foramt_reward": 0.92578125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9821939468383789, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9821939468383789, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 616.140625, "epoch": 2.792, "grad_norm": 0.036522042006254196, "kl": 0.013441085815429688, "learning_rate": 1.9964365540241614e-05, "loss": 0.0013, "reward": 7.095219135284424, "reward_std": 1.0741036236286163, "rewards/mrr_reward": 0.6266059279441833, "rewards/rank_analyze_format_reward": 0.6649671494960785, "rewards/rank_answer_foramt_reward": 0.939453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 598.15625, "epoch": 2.8, "grad_norm": 0.03805486485362053, "kl": 0.010654449462890625, "learning_rate": 1.9964153241725984e-05, "loss": -0.0168, "reward": 7.228509426116943, "reward_std": 0.9055161625146866, "rewards/mrr_reward": 0.6221354231238365, "rewards/rank_analyze_format_reward": 0.8102801889181137, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 614.125, "epoch": 2.808, "grad_norm": 0.036437440663576126, "kl": 0.009984970092773438, "learning_rate": 1.996394031381995e-05, "loss": -0.0147, "reward": 6.869751572608948, "reward_std": 0.8186332434415817, "rewards/mrr_reward": 0.5391058996319771, "rewards/rank_analyze_format_reward": 0.77509605884552, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 588.75, "epoch": 2.816, "grad_norm": 0.04047045111656189, "kl": 0.013484954833984375, "learning_rate": 1.996372675653696e-05, "loss": 0.0169, "reward": 7.264615893363953, "reward_std": 1.1256726384162903, "rewards/mrr_reward": 0.6621279790997505, "rewards/rank_analyze_format_reward": 0.7302107512950897, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9961873590946198, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 614.28125, "epoch": 2.824, "grad_norm": 0.03461950272321701, "kl": 0.011775970458984375, "learning_rate": 1.9963512569890512e-05, "loss": -0.0006, "reward": 6.854212045669556, "reward_std": 0.9395613223314285, "rewards/mrr_reward": 0.5478236600756645, "rewards/rank_analyze_format_reward": 0.7212026119232178, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 601.59375, "epoch": 2.832, "grad_norm": 0.03836781159043312, "kl": 0.01210784912109375, "learning_rate": 1.9963297753894134e-05, "loss": -0.0137, "reward": 6.814990997314453, "reward_std": 1.3405095338821411, "rewards/mrr_reward": 0.5263392850756645, "rewards/rank_analyze_format_reward": 0.7955712080001831, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 613.484375, "epoch": 2.84, "grad_norm": 0.03449360653758049, "kl": 0.013011932373046875, "learning_rate": 1.9963082308561386e-05, "loss": -0.021, "reward": 7.53871476650238, "reward_std": 0.9666296392679214, "rewards/mrr_reward": 0.7184895724058151, "rewards/rank_analyze_format_reward": 0.7090613692998886, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 605.71875, "epoch": 2.848, "grad_norm": 0.04157762601971626, "kl": 0.012449264526367188, "learning_rate": 1.9962866233905887e-05, "loss": -0.0148, "reward": 7.414668679237366, "reward_std": 0.9551695212721825, "rewards/mrr_reward": 0.693489596247673, "rewards/rank_analyze_format_reward": 0.6817260161042213, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 614.6875, "epoch": 2.856, "grad_norm": 0.034568723291158676, "kl": 0.011356353759765625, "learning_rate": 1.9962649529941283e-05, "loss": -0.0159, "reward": 7.724859952926636, "reward_std": 0.819370448589325, "rewards/mrr_reward": 0.7456287145614624, "rewards/rank_analyze_format_reward": 0.7604811042547226, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9977678656578064, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9977678656578064, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 618.8125, "epoch": 2.864, "grad_norm": 0.039391741156578064, "kl": 0.012399673461914062, "learning_rate": 1.996243219668126e-05, "loss": -0.0153, "reward": 5.852332949638367, "reward_std": 1.0752842128276825, "rewards/mrr_reward": 0.323691725730896, "rewards/rank_analyze_format_reward": 0.7135076522827148, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9679276347160339, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 620.046875, "epoch": 2.872, "grad_norm": 0.040229834616184235, "kl": 0.011371612548828125, "learning_rate": 1.996221423413954e-05, "loss": 0.0015, "reward": 6.387848496437073, "reward_std": 1.1234539598226547, "rewards/mrr_reward": 0.4234747067093849, "rewards/rank_analyze_format_reward": 0.7486371248960495, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 628.109375, "epoch": 2.88, "grad_norm": 0.0374617837369442, "kl": 0.011615753173828125, "learning_rate": 1.9961995642329905e-05, "loss": 0.0084, "reward": 7.307153582572937, "reward_std": 1.3044872879981995, "rewards/mrr_reward": 0.6740141361951828, "rewards/rank_analyze_format_reward": 0.7395800352096558, "rewards/rank_answer_foramt_reward": 0.875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 591.71875, "epoch": 2.888, "grad_norm": 0.04277306795120239, "kl": 0.014926910400390625, "learning_rate": 1.996177642126615e-05, "loss": -0.0085, "reward": 7.5333287715911865, "reward_std": 0.9014619141817093, "rewards/mrr_reward": 0.6997581869363785, "rewards/rank_analyze_format_reward": 0.7635927647352219, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 603.28125, "epoch": 2.896, "grad_norm": 0.041481465101242065, "kl": 0.014026641845703125, "learning_rate": 1.996155657096213e-05, "loss": -0.0272, "reward": 6.84517502784729, "reward_std": 1.0781239420175552, "rewards/mrr_reward": 0.5556175634264946, "rewards/rank_analyze_format_reward": 0.7236873209476471, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 580.53125, "epoch": 2.904, "grad_norm": 0.04171738028526306, "kl": 0.012874603271484375, "learning_rate": 1.9961336091431728e-05, "loss": -0.0004, "reward": 7.211669564247131, "reward_std": 0.8956931233406067, "rewards/mrr_reward": 0.6445312649011612, "rewards/rank_analyze_format_reward": 0.6827789545059204, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9968671649694443, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9968671649694443, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 636.328125, "epoch": 2.912, "grad_norm": 0.03671007230877876, "kl": 0.011234283447265625, "learning_rate": 1.9961114982688868e-05, "loss": -0.0257, "reward": 7.139348030090332, "reward_std": 0.8967479169368744, "rewards/mrr_reward": 0.6116319298744202, "rewards/rank_analyze_format_reward": 0.7958708107471466, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 648.640625, "epoch": 2.92, "grad_norm": 0.033731456845998764, "kl": 0.010288238525390625, "learning_rate": 1.9960893244747525e-05, "loss": -0.0108, "reward": 7.166544318199158, "reward_std": 0.6106544919312, "rewards/mrr_reward": 0.6010168790817261, "rewards/rank_analyze_format_reward": 0.7859143763780594, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 627.625, "epoch": 2.928, "grad_norm": 0.036414846777915955, "kl": 0.012035369873046875, "learning_rate": 1.9960670877621697e-05, "loss": -0.0184, "reward": 6.740770578384399, "reward_std": 0.8052867725491524, "rewards/mrr_reward": 0.5220424234867096, "rewards/rank_analyze_format_reward": 0.7072883993387222, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 619.90625, "epoch": 2.936, "grad_norm": 0.03801536187529564, "kl": 0.013200759887695312, "learning_rate": 1.9960447881325433e-05, "loss": -0.0308, "reward": 6.5149757862091064, "reward_std": 0.7093790546059608, "rewards/mrr_reward": 0.44720981270074844, "rewards/rank_analyze_format_reward": 0.7612926959991455, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 591.1875, "epoch": 2.944, "grad_norm": 0.040969040244817734, "kl": 0.01461029052734375, "learning_rate": 1.996022425587282e-05, "loss": -0.0185, "reward": 7.41820216178894, "reward_std": 0.9369710832834244, "rewards/mrr_reward": 0.6916666775941849, "rewards/rank_analyze_format_reward": 0.7156801223754883, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 636.859375, "epoch": 2.952, "grad_norm": 0.03583036735653877, "kl": 0.011312484741210938, "learning_rate": 1.9960000001277985e-05, "loss": -0.0276, "reward": 7.153052568435669, "reward_std": 0.6550789251923561, "rewards/mrr_reward": 0.6043154746294022, "rewards/rank_analyze_format_reward": 0.7746230661869049, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 607.65625, "epoch": 2.96, "grad_norm": 0.04044310748577118, "kl": 0.012559890747070312, "learning_rate": 1.9959775117555085e-05, "loss": 0.0112, "reward": 7.005048513412476, "reward_std": 1.1625263132154942, "rewards/mrr_reward": 0.5972346290946007, "rewards/rank_analyze_format_reward": 0.7758757621049881, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992187470197678, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9679687470197678, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 597.0, "epoch": 2.968, "grad_norm": 0.03745017945766449, "kl": 0.013088226318359375, "learning_rate": 1.995954960471833e-05, "loss": 0.0034, "reward": 7.509567379951477, "reward_std": 0.961163155734539, "rewards/mrr_reward": 0.7000806033611298, "rewards/rank_analyze_format_reward": 0.7853019386529922, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 592.859375, "epoch": 2.976, "grad_norm": 0.038514841347932816, "kl": 0.012132644653320312, "learning_rate": 1.995932346278197e-05, "loss": -0.0071, "reward": 7.772576689720154, "reward_std": 0.6515968926250935, "rewards/mrr_reward": 0.7575520798563957, "rewards/rank_analyze_format_reward": 0.7793630510568619, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 586.09375, "epoch": 2.984, "grad_norm": 0.03822262957692146, "kl": 0.013006210327148438, "learning_rate": 1.9959096691760284e-05, "loss": -0.0155, "reward": 7.534856200218201, "reward_std": 0.7668619826436043, "rewards/mrr_reward": 0.7446614354848862, "rewards/rank_analyze_format_reward": 0.685593493282795, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9773005694150925, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9773005694150925, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 621.109375, "epoch": 2.992, "grad_norm": 0.03631046786904335, "kl": 0.011692047119140625, "learning_rate": 1.995886929166759e-05, "loss": 0.0136, "reward": 7.2465866804122925, "reward_std": 0.8428932726383209, "rewards/mrr_reward": 0.6395833343267441, "rewards/rank_analyze_format_reward": 0.7468471378087997, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 668.40625, "epoch": 3.0, "grad_norm": 0.03699138015508652, "kl": 0.011791229248046875, "learning_rate": 1.9958641262518263e-05, "loss": 0.0192, "reward": 7.813745975494385, "reward_std": 0.7177924737334251, "rewards/mrr_reward": 0.7415550798177719, "rewards/rank_analyze_format_reward": 0.8690101951360703, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 615.921875, "epoch": 3.008, "grad_norm": 0.035117242485284805, "kl": 0.013750076293945312, "learning_rate": 3.4816627469912147e-06, "loss": 0.0291, "reward": 7.042345643043518, "reward_std": 0.7293612845242023, "rewards/mrr_reward": 0.5897755473852158, "rewards/rank_analyze_format_reward": 0.7183997631072998, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 637.125, "epoch": 3.016, "grad_norm": 0.03519332408905029, "kl": 0.01271820068359375, "learning_rate": 3.4341424424704373e-06, "loss": -0.0114, "reward": 6.630066633224487, "reward_std": 0.9696584269404411, "rewards/mrr_reward": 0.4780319929122925, "rewards/rank_analyze_format_reward": 0.7667666971683502, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 621.59375, "epoch": 3.024, "grad_norm": 0.036034248769283295, "kl": 0.014505386352539062, "learning_rate": 3.3868813467634833e-06, "loss": -0.0026, "reward": 7.198747515678406, "reward_std": 1.1167692840099335, "rewards/mrr_reward": 0.6099516302347183, "rewards/rank_analyze_format_reward": 0.8406638205051422, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 637.28125, "epoch": 3.032, "grad_norm": 0.03498866409063339, "kl": 0.01226043701171875, "learning_rate": 3.3398813256574847e-06, "loss": -0.0099, "reward": 7.360252737998962, "reward_std": 0.8017124682664871, "rewards/mrr_reward": 0.6533172130584717, "rewards/rank_analyze_format_reward": 0.8052692711353302, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 606.265625, "epoch": 3.04, "grad_norm": 0.0373137928545475, "kl": 0.013418197631835938, "learning_rate": 3.2931442346328e-06, "loss": 0.0002, "reward": 7.177944183349609, "reward_std": 1.186311975121498, "rewards/mrr_reward": 0.6419270783662796, "rewards/rank_analyze_format_reward": 0.6922670155763626, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 655.390625, "epoch": 3.048, "grad_norm": 0.03675440698862076, "kl": 0.0125885009765625, "learning_rate": 3.2466719187897555e-06, "loss": 0.0072, "reward": 6.830274343490601, "reward_std": 0.661302238702774, "rewards/mrr_reward": 0.4951822906732559, "rewards/rank_analyze_format_reward": 0.8651701956987381, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 610.34375, "epoch": 3.056, "grad_norm": 0.03555948659777641, "kl": 0.01416015625, "learning_rate": 3.200466212775808e-06, "loss": -0.0196, "reward": 7.550482988357544, "reward_std": 1.0687852203845978, "rewards/mrr_reward": 0.71484375, "rewards/rank_analyze_format_reward": 0.7417741417884827, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 619.625, "epoch": 3.064, "grad_norm": 0.03907148540019989, "kl": 0.013399124145507812, "learning_rate": 3.1545289407131128e-06, "loss": -0.0043, "reward": 7.558589220046997, "reward_std": 1.2266802489757538, "rewards/mrr_reward": 0.7244791686534882, "rewards/rank_analyze_format_reward": 0.7446569502353668, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 598.09375, "epoch": 3.072, "grad_norm": 0.038272228091955185, "kl": 0.011919021606445312, "learning_rate": 3.108861916126518e-06, "loss": 0.002, "reward": 8.19713008403778, "reward_std": 0.7785622999072075, "rewards/mrr_reward": 0.85546875, "rewards/rank_analyze_format_reward": 0.8240830302238464, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 627.640625, "epoch": 3.08, "grad_norm": 0.041524823755025864, "kl": 0.01570892333984375, "learning_rate": 3.063466941871952e-06, "loss": 0.0153, "reward": 7.146573901176453, "reward_std": 1.074029102921486, "rewards/mrr_reward": 0.6067274287343025, "rewards/rank_analyze_format_reward": 0.834898442029953, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 630.671875, "epoch": 3.088, "grad_norm": 0.03623941168189049, "kl": 0.013017654418945312, "learning_rate": 3.0183458100652752e-06, "loss": -0.0022, "reward": 7.186826229095459, "reward_std": 0.6731258956715465, "rewards/mrr_reward": 0.5905319899320602, "rewards/rank_analyze_format_reward": 0.863645926117897, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 624.265625, "epoch": 3.096, "grad_norm": 0.03690262883901596, "kl": 0.01531219482421875, "learning_rate": 2.9735003020115095e-06, "loss": 0.0131, "reward": 7.618446707725525, "reward_std": 0.5352663211524487, "rewards/mrr_reward": 0.7252604365348816, "rewards/rank_analyze_format_reward": 0.7408426254987717, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 633.4375, "epoch": 3.104, "grad_norm": 0.03921409696340561, "kl": 0.014377593994140625, "learning_rate": 2.9289321881345257e-06, "loss": -0.0006, "reward": 7.131399869918823, "reward_std": 1.3833198249340057, "rewards/mrr_reward": 0.6131696403026581, "rewards/rank_analyze_format_reward": 0.8094245195388794, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981250017881393, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9825000017881393, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 660.5625, "epoch": 3.112, "grad_norm": 0.03858316317200661, "kl": 0.01251220703125, "learning_rate": 2.884643227907147e-06, "loss": 0.0078, "reward": 6.986513733863831, "reward_std": 1.1185480952262878, "rewards/mrr_reward": 0.5550533309578896, "rewards/rank_analyze_format_reward": 0.8381428718566895, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 586.0625, "epoch": 3.12, "grad_norm": 0.03950336202979088, "kl": 0.01474761962890625, "learning_rate": 2.840635169781688e-06, "loss": -0.0229, "reward": 6.151705384254456, "reward_std": 1.3553853258490562, "rewards/mrr_reward": 0.416666679084301, "rewards/rank_analyze_format_reward": 0.666062742471695, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 610.796875, "epoch": 3.128, "grad_norm": 0.03854774311184883, "kl": 0.013458251953125, "learning_rate": 2.796909751120931e-06, "loss": -0.007, "reward": 7.251393556594849, "reward_std": 1.445090800523758, "rewards/mrr_reward": 0.6562500074505806, "rewards/rank_analyze_format_reward": 0.7045186460018158, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 633.71875, "epoch": 3.136, "grad_norm": 0.03738197311758995, "kl": 0.013525009155273438, "learning_rate": 2.7534686981295335e-06, "loss": -0.0034, "reward": 6.909914255142212, "reward_std": 1.123517245054245, "rewards/mrr_reward": 0.5471974164247513, "rewards/rank_analyze_format_reward": 0.779603436589241, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 648.96875, "epoch": 3.144, "grad_norm": 0.0402173288166523, "kl": 0.010915756225585938, "learning_rate": 2.7103137257858867e-06, "loss": 0.0094, "reward": 6.921466946601868, "reward_std": 0.774784117937088, "rewards/mrr_reward": 0.5484312921762466, "rewards/rank_analyze_format_reward": 0.8028685003519058, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9975927919149399, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9975927919149399, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 619.296875, "epoch": 3.152, "grad_norm": 0.04159383475780487, "kl": 0.014692306518554688, "learning_rate": 2.667446537774402e-06, "loss": -0.0153, "reward": 6.572040319442749, "reward_std": 1.726172387599945, "rewards/mrr_reward": 0.4993923604488373, "rewards/rank_analyze_format_reward": 0.7517964094877243, "rewards/rank_answer_foramt_reward": 0.8203125, "rewards/rank_contrast_format_reward": 0.012423780746757984, "rewards/rank_initial_format_reward": 0.994969055056572, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.994969055056572, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 642.765625, "epoch": 3.16, "grad_norm": 0.042603906244039536, "kl": 0.010896682739257812, "learning_rate": 2.624868826418262e-06, "loss": 0.0296, "reward": 7.20228123664856, "reward_std": 0.9538848847150803, "rewards/mrr_reward": 0.6085069477558136, "rewards/rank_analyze_format_reward": 0.8265387862920761, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 678.578125, "epoch": 3.168, "grad_norm": 0.03769225999712944, "kl": 0.011178970336914062, "learning_rate": 2.5825822726126095e-06, "loss": 0.0099, "reward": 7.425193428993225, "reward_std": 1.2249933630228043, "rewards/mrr_reward": 0.6672184988856316, "rewards/rank_analyze_format_reward": 0.8207724988460541, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 612.90625, "epoch": 3.176, "grad_norm": 0.03580842167139053, "kl": 0.01148223876953125, "learning_rate": 2.5405885457581793e-06, "loss": 0.0051, "reward": 7.102351069450378, "reward_std": 1.0760410577058792, "rewards/mrr_reward": 0.5938120186328888, "rewards/rank_analyze_format_reward": 0.7681185156106949, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 622.234375, "epoch": 3.184, "grad_norm": 0.03826119750738144, "kl": 0.014284133911132812, "learning_rate": 2.4988893036954045e-06, "loss": 0.0084, "reward": 7.598180770874023, "reward_std": 1.0497512221336365, "rewards/mrr_reward": 0.7157738208770752, "rewards/rank_analyze_format_reward": 0.8209080398082733, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 629.640625, "epoch": 3.192, "grad_norm": 0.039424311369657516, "kl": 0.012294769287109375, "learning_rate": 2.4574861926389615e-06, "loss": 0.0079, "reward": 7.362653613090515, "reward_std": 0.824449434876442, "rewards/mrr_reward": 0.6566406339406967, "rewards/rank_analyze_format_reward": 0.8023822903633118, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 642.015625, "epoch": 3.2, "grad_norm": 0.03927793353796005, "kl": 0.01302337646484375, "learning_rate": 2.4163808471127815e-06, "loss": -0.0046, "reward": 7.52125608921051, "reward_std": 1.2097734808921814, "rewards/mrr_reward": 0.7062934041023254, "rewards/rank_analyze_format_reward": 0.7816215455532074, "rewards/rank_answer_foramt_reward": 0.955078125, "rewards/rank_contrast_format_reward": 0.014070273377001286, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 611.1875, "epoch": 3.208, "grad_norm": 0.03897445276379585, "kl": 0.013471603393554688, "learning_rate": 2.37557488988552e-06, "loss": -0.0031, "reward": 6.832857847213745, "reward_std": 1.2639935612678528, "rewards/mrr_reward": 0.5474764406681061, "rewards/rank_analyze_format_reward": 0.7873684614896774, "rewards/rank_answer_foramt_reward": 0.904296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 627.421875, "epoch": 3.216, "grad_norm": 0.03820272535085678, "kl": 0.012819290161132812, "learning_rate": 2.335069931906503e-06, "loss": -0.0068, "reward": 7.258768320083618, "reward_std": 1.3467806428670883, "rewards/mrr_reward": 0.6381944566965103, "rewards/rank_analyze_format_reward": 0.7626311928033829, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 623.78125, "epoch": 3.224, "grad_norm": 0.037081021815538406, "kl": 0.013256072998046875, "learning_rate": 2.2948675722421086e-06, "loss": -0.0032, "reward": 7.068072199821472, "reward_std": 1.0497987121343613, "rewards/mrr_reward": 0.5989149361848831, "rewards/rank_analyze_format_reward": 0.7660476416349411, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 605.0625, "epoch": 3.232, "grad_norm": 0.0360063761472702, "kl": 0.015211105346679688, "learning_rate": 2.254969398012663e-06, "loss": -0.0158, "reward": 7.160408020019531, "reward_std": 1.019886076450348, "rewards/mrr_reward": 0.646112360060215, "rewards/rank_analyze_format_reward": 0.7062013298273087, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983552694320679, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9983552694320679, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 625.453125, "epoch": 3.24, "grad_norm": 0.03534236177802086, "kl": 0.014194488525390625, "learning_rate": 2.215376984329767e-06, "loss": -0.0216, "reward": 7.393091082572937, "reward_std": 0.8330601751804352, "rewards/mrr_reward": 0.6536458283662796, "rewards/rank_analyze_format_reward": 0.7999920099973679, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 604.921875, "epoch": 3.248, "grad_norm": 0.038391463458538055, "kl": 0.013559341430664062, "learning_rate": 2.1760918942341193e-06, "loss": -0.0178, "reward": 7.6344475746154785, "reward_std": 0.9299002774059772, "rewards/mrr_reward": 0.7469618022441864, "rewards/rank_analyze_format_reward": 0.7185573130846024, "rewards/rank_answer_foramt_reward": 0.96875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 630.015625, "epoch": 3.2560000000000002, "grad_norm": 0.03509964421391487, "kl": 0.01255035400390625, "learning_rate": 2.1371156786338108e-06, "loss": -0.0117, "reward": 6.898256897926331, "reward_std": 0.9209974706172943, "rewards/mrr_reward": 0.5367559418082237, "rewards/rank_analyze_format_reward": 0.8304647654294968, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9965170323848724, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9808920323848724, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 608.5625, "epoch": 3.2640000000000002, "grad_norm": 0.03695489838719368, "kl": 0.0167388916015625, "learning_rate": 2.098449876243096e-06, "loss": -0.0313, "reward": 6.917726039886475, "reward_std": 0.8459838628768921, "rewards/mrr_reward": 0.5599144399166107, "rewards/rank_analyze_format_reward": 0.7054120153188705, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 585.078125, "epoch": 3.2720000000000002, "grad_norm": 0.039599835872650146, "kl": 0.014032363891601562, "learning_rate": 2.0600960135216463e-06, "loss": -0.0047, "reward": 7.127155780792236, "reward_std": 1.1092039048671722, "rewards/mrr_reward": 0.6348276287317276, "rewards/rank_analyze_format_reward": 0.7182396054267883, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9992559552192688, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 606.296875, "epoch": 3.2800000000000002, "grad_norm": 0.039675965905189514, "kl": 0.013446807861328125, "learning_rate": 2.022055604614289e-06, "loss": -0.0103, "reward": 6.730955481529236, "reward_std": 0.9299670159816742, "rewards/mrr_reward": 0.5159474387764931, "rewards/rank_analyze_format_reward": 0.7542870342731476, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9945252537727356, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9945252537727356, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 636.3125, "epoch": 3.288, "grad_norm": 0.03656487911939621, "kl": 0.012666702270507812, "learning_rate": 1.984330151291233e-06, "loss": -0.016, "reward": 6.7821091413497925, "reward_std": 0.9743772521615028, "rewards/mrr_reward": 0.5557911694049835, "rewards/rank_analyze_format_reward": 0.7064568400382996, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9819079041481018, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.9662829041481018, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 631.171875, "epoch": 3.296, "grad_norm": 0.037856802344322205, "kl": 0.01450347900390625, "learning_rate": 1.9469211428887813e-06, "loss": -0.0176, "reward": 6.753392338752747, "reward_std": 0.9574991762638092, "rewards/mrr_reward": 0.5006696432828903, "rewards/rank_analyze_format_reward": 0.8190732151269913, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 618.59375, "epoch": 3.304, "grad_norm": 0.040868956595659256, "kl": 0.011432647705078125, "learning_rate": 1.9098300562505266e-06, "loss": 0.0035, "reward": 6.817109823226929, "reward_std": 0.7666200622916222, "rewards/mrr_reward": 0.5467447973787785, "rewards/rank_analyze_format_reward": 0.7559238225221634, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9966736733913422, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9966736733913422, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 616.671875, "epoch": 3.312, "grad_norm": 0.04021133482456207, "kl": 0.013158798217773438, "learning_rate": 1.8730583556690607e-06, "loss": 0.0066, "reward": 6.991716146469116, "reward_std": 0.8390699215233326, "rewards/mrr_reward": 0.5647073462605476, "rewards/rank_analyze_format_reward": 0.8067970871925354, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 644.8125, "epoch": 3.32, "grad_norm": 0.037540923804044724, "kl": 0.01172637939453125, "learning_rate": 1.8366074928281608e-06, "loss": 0.0074, "reward": 7.544142961502075, "reward_std": 1.008560985326767, "rewards/mrr_reward": 0.6979166716337204, "rewards/rank_analyze_format_reward": 0.8081740438938141, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.994612067937851, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.994612067937851, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 584.15625, "epoch": 3.328, "grad_norm": 0.04208629950881004, "kl": 0.012399673461914062, "learning_rate": 1.8004789067454763e-06, "loss": -0.0386, "reward": 7.588791251182556, "reward_std": 1.3209501877427101, "rewards/mrr_reward": 0.7485863268375397, "rewards/rank_analyze_format_reward": 0.6915150880813599, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9924812018871307, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9924812018871307, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 635.203125, "epoch": 3.336, "grad_norm": 0.043379127979278564, "kl": 0.012195587158203125, "learning_rate": 1.7646740237157256e-06, "loss": 0.0323, "reward": 7.012084484100342, "reward_std": 1.0143009573221207, "rewards/mrr_reward": 0.5759734660387039, "rewards/rank_analyze_format_reward": 0.8153042197227478, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9835526347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 598.859375, "epoch": 3.344, "grad_norm": 0.04017612338066101, "kl": 0.014421463012695312, "learning_rate": 1.7291942572543806e-06, "loss": -0.006, "reward": 6.709989428520203, "reward_std": 1.0495906621217728, "rewards/mrr_reward": 0.529706098139286, "rewards/rank_analyze_format_reward": 0.7245771586894989, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9967704266309738, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9967704266309738, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 580.203125, "epoch": 3.352, "grad_norm": 0.040608614683151245, "kl": 0.012868881225585938, "learning_rate": 1.6940410080418723e-06, "loss": -0.0019, "reward": 7.201984643936157, "reward_std": 0.7756945788860321, "rewards/mrr_reward": 0.6367187723517418, "rewards/rank_analyze_format_reward": 0.7234692126512527, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 634.3125, "epoch": 3.36, "grad_norm": 0.03586459904909134, "kl": 0.011873245239257812, "learning_rate": 1.6592156638682887e-06, "loss": -0.0093, "reward": 7.138599634170532, "reward_std": 0.7431515604257584, "rewards/mrr_reward": 0.6127170100808144, "rewards/rank_analyze_format_reward": 0.7191510647535324, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9979619532823563, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9979619532823563, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 644.703125, "epoch": 3.368, "grad_norm": 0.03901754319667816, "kl": 0.01136016845703125, "learning_rate": 1.6247195995785836e-06, "loss": 0.003, "reward": 6.6844483613967896, "reward_std": 0.7994736880064011, "rewards/mrr_reward": 0.4797743149101734, "rewards/rank_analyze_format_reward": 0.8219916969537735, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 605.625, "epoch": 3.376, "grad_norm": 0.03650727495551109, "kl": 0.01389312744140625, "learning_rate": 1.5905541770183096e-06, "loss": -0.0195, "reward": 6.6595494747161865, "reward_std": 0.500478945672512, "rewards/mrr_reward": 0.4967882111668587, "rewards/rank_analyze_format_reward": 0.7016936540603638, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 622.78125, "epoch": 3.384, "grad_norm": 0.04294556751847267, "kl": 0.012430191040039062, "learning_rate": 1.5567207449798517e-06, "loss": 0.0142, "reward": 7.437200546264648, "reward_std": 0.8200259059667587, "rewards/mrr_reward": 0.687189981341362, "rewards/rank_analyze_format_reward": 0.7644545584917068, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.013829787261784077, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 630.3125, "epoch": 3.392, "grad_norm": 0.03355753794312477, "kl": 0.011930465698242188, "learning_rate": 1.52322063914917e-06, "loss": -0.0143, "reward": 7.236422419548035, "reward_std": 1.3275894522666931, "rewards/mrr_reward": 0.6320932507514954, "rewards/rank_analyze_format_reward": 0.7606691271066666, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 616.0, "epoch": 3.4, "grad_norm": 0.03680611029267311, "kl": 0.0124969482421875, "learning_rate": 1.490055182053083e-06, "loss": -0.0241, "reward": 7.067311525344849, "reward_std": 0.7071668058633804, "rewards/mrr_reward": 0.5894097089767456, "rewards/rank_analyze_format_reward": 0.7389693707227707, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 618.25, "epoch": 3.408, "grad_norm": 0.03699398413300514, "kl": 0.012582778930664062, "learning_rate": 1.4572256830070497e-06, "loss": 0.0013, "reward": 7.260975360870361, "reward_std": 0.6704662144184113, "rewards/mrr_reward": 0.6398809552192688, "rewards/rank_analyze_format_reward": 0.8204772174358368, "rewards/rank_answer_foramt_reward": 0.890625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 598.4375, "epoch": 3.416, "grad_norm": 0.04159224405884743, "kl": 0.014894485473632812, "learning_rate": 1.4247334380634792e-06, "loss": -0.0191, "reward": 7.206877589225769, "reward_std": 0.8614709973335266, "rewards/mrr_reward": 0.6289062574505806, "rewards/rank_analyze_format_reward": 0.765241265296936, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 644.125, "epoch": 3.424, "grad_norm": 0.036023661494255066, "kl": 0.011859893798828125, "learning_rate": 1.3925797299605649e-06, "loss": -0.0067, "reward": 6.5737926959991455, "reward_std": 0.9088378921151161, "rewards/mrr_reward": 0.4585689455270767, "rewards/rank_analyze_format_reward": 0.7960425764322281, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 624.1875, "epoch": 3.432, "grad_norm": 0.047732554376125336, "kl": 0.011541366577148438, "learning_rate": 1.3607658280716474e-06, "loss": -0.028, "reward": 7.2520798444747925, "reward_std": 1.1335118561983109, "rewards/mrr_reward": 0.6308097690343857, "rewards/rank_analyze_format_reward": 0.7853662818670273, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 596.40625, "epoch": 3.44, "grad_norm": 0.03683609887957573, "kl": 0.013605117797851562, "learning_rate": 1.3292929883550998e-06, "loss": -0.0073, "reward": 8.030953884124756, "reward_std": 0.5688543245196342, "rewards/mrr_reward": 0.8156249970197678, "rewards/rank_analyze_format_reward": 0.7977506220340729, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 605.25, "epoch": 3.448, "grad_norm": 0.03646247833967209, "kl": 0.013217926025390625, "learning_rate": 1.2981624533047432e-06, "loss": 0.0074, "reward": 6.805210113525391, "reward_std": 0.8861361294984818, "rewards/mrr_reward": 0.5289062447845936, "rewards/rank_analyze_format_reward": 0.7668732404708862, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9994419664144516, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9994419664144516, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 609.875, "epoch": 3.456, "grad_norm": 0.03660254180431366, "kl": 0.011661529541015625, "learning_rate": 1.2673754519008008e-06, "loss": -0.0102, "reward": 6.942854642868042, "reward_std": 0.8882918208837509, "rewards/mrr_reward": 0.5530754029750824, "rewards/rank_analyze_format_reward": 0.7735218703746796, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 620.671875, "epoch": 3.464, "grad_norm": 0.037669289857149124, "kl": 0.013912200927734375, "learning_rate": 1.2369331995613664e-06, "loss": -0.0091, "reward": 7.411279797554016, "reward_std": 1.1482711285352707, "rewards/mrr_reward": 0.673480898141861, "rewards/rank_analyze_format_reward": 0.7954811006784439, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 642.984375, "epoch": 3.472, "grad_norm": 0.037749458104372025, "kl": 0.01103973388671875, "learning_rate": 1.206836898094439e-06, "loss": -0.0045, "reward": 7.326077461242676, "reward_std": 0.8330521434545517, "rewards/mrr_reward": 0.6379278004169464, "rewards/rank_analyze_format_reward": 0.7898766249418259, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 642.78125, "epoch": 3.48, "grad_norm": 0.035765353590250015, "kl": 0.012155532836914062, "learning_rate": 1.1770877356504684e-06, "loss": -0.02, "reward": 7.883293986320496, "reward_std": 0.8871591687202454, "rewards/mrr_reward": 0.7798177152872086, "rewards/rank_analyze_format_reward": 0.7656676918268204, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 611.78125, "epoch": 3.488, "grad_norm": 0.03705098107457161, "kl": 0.012918472290039062, "learning_rate": 1.1476868866754488e-06, "loss": -0.0083, "reward": 6.660637736320496, "reward_std": 0.9346826821565628, "rewards/mrr_reward": 0.500713050365448, "rewards/rank_analyze_format_reward": 0.7608778774738312, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 628.71875, "epoch": 3.496, "grad_norm": 0.03969808667898178, "kl": 0.015727996826171875, "learning_rate": 1.1186355118645552e-06, "loss": -0.0132, "reward": 7.052313804626465, "reward_std": 1.0558638274669647, "rewards/mrr_reward": 0.5923363342881203, "rewards/rank_analyze_format_reward": 0.7863690704107285, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 606.859375, "epoch": 3.504, "grad_norm": 0.037416357547044754, "kl": 0.012334823608398438, "learning_rate": 1.0899347581163222e-06, "loss": -0.0176, "reward": 7.955611228942871, "reward_std": 0.7251264750957489, "rewards/mrr_reward": 0.796651765704155, "rewards/rank_analyze_format_reward": 0.8157641887664795, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 637.5, "epoch": 3.512, "grad_norm": 0.03700832277536392, "kl": 0.011600494384765625, "learning_rate": 1.0615857584873624e-06, "loss": 0.0115, "reward": 7.665433883666992, "reward_std": 0.595935083925724, "rewards/mrr_reward": 0.7211123704910278, "rewards/rank_analyze_format_reward": 0.8138794153928757, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 615.5, "epoch": 3.52, "grad_norm": 0.038077887147665024, "kl": 0.01373291015625, "learning_rate": 1.0335896321476413e-06, "loss": -0.0342, "reward": 7.015413165092468, "reward_std": 1.31123448908329, "rewards/mrr_reward": 0.5999503880739212, "rewards/rank_analyze_format_reward": 0.7072935104370117, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 658.0625, "epoch": 3.528, "grad_norm": 0.03242430090904236, "kl": 0.013689041137695312, "learning_rate": 1.0059474843362893e-06, "loss": -0.0198, "reward": 6.429423809051514, "reward_std": 0.8046993911266327, "rewards/mrr_reward": 0.43027032166719437, "rewards/rank_analyze_format_reward": 0.7727955877780914, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 671.890625, "epoch": 3.536, "grad_norm": 0.034074753522872925, "kl": 0.0110321044921875, "learning_rate": 9.786604063179728e-07, "loss": -0.0013, "reward": 7.255928158760071, "reward_std": 0.7968212515115738, "rewards/mrr_reward": 0.6222656294703484, "rewards/rank_analyze_format_reward": 0.8351100534200668, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 612.109375, "epoch": 3.544, "grad_norm": 0.040858954191207886, "kl": 0.0142974853515625, "learning_rate": 9.517294753398066e-07, "loss": 0.0039, "reward": 7.048562169075012, "reward_std": 0.9897879660129547, "rewards/mrr_reward": 0.5950024798512459, "rewards/rank_analyze_format_reward": 0.7603489309549332, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 630.3125, "epoch": 3.552, "grad_norm": 0.033798523247241974, "kl": 0.01203155517578125, "learning_rate": 9.251557545888312e-07, "loss": -0.0221, "reward": 7.666335582733154, "reward_std": 0.6299453526735306, "rewards/mrr_reward": 0.7138020843267441, "rewards/rank_analyze_format_reward": 0.8006909340620041, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.01411290280520916, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 587.46875, "epoch": 3.56, "grad_norm": 0.039384886622428894, "kl": 0.012357711791992188, "learning_rate": 8.989402931500434e-07, "loss": -0.0138, "reward": 8.181223034858704, "reward_std": 0.7046910002827644, "rewards/mrr_reward": 0.875, "rewards/rank_analyze_format_reward": 0.6948948577046394, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 606.15625, "epoch": 3.568, "grad_norm": 0.03874967247247696, "kl": 0.015522003173828125, "learning_rate": 8.730841259649725e-07, "loss": -0.0254, "reward": 7.0217931270599365, "reward_std": 0.8901334404945374, "rewards/mrr_reward": 0.6117807626724243, "rewards/rank_analyze_format_reward": 0.633263885974884, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 623.296875, "epoch": 3.576, "grad_norm": 0.03479482978582382, "kl": 0.011865615844726562, "learning_rate": 8.475882737908248e-07, "loss": 0.0008, "reward": 7.554774522781372, "reward_std": 0.9437515586614609, "rewards/mrr_reward": 0.7163194417953491, "rewards/rank_analyze_format_reward": 0.7170282900333405, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0127108134329319, "rewards/rank_initial_format_reward": 0.9984335899353027, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9984335899353027, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 632.0625, "epoch": 3.584, "grad_norm": 0.03798670321702957, "kl": 0.012453079223632812, "learning_rate": 8.224537431601886e-07, "loss": 0.0001, "reward": 6.237062215805054, "reward_std": 0.5429144222289324, "rewards/mrr_reward": 0.3848772421479225, "rewards/rank_analyze_format_reward": 0.7522407919168472, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 619.234375, "epoch": 3.592, "grad_norm": 0.036835283041000366, "kl": 0.012708663940429688, "learning_rate": 7.976815263412963e-07, "loss": -0.0548, "reward": 6.972110390663147, "reward_std": 1.0418353527784348, "rewards/mrr_reward": 0.6005208343267441, "rewards/rank_analyze_format_reward": 0.6950270235538483, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 606.84375, "epoch": 3.6, "grad_norm": 0.03548385202884674, "kl": 0.01288604736328125, "learning_rate": 7.732726012988512e-07, "loss": -0.0231, "reward": 7.123287677764893, "reward_std": 0.9100038930773735, "rewards/mrr_reward": 0.6060701757669449, "rewards/rank_analyze_format_reward": 0.749788224697113, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 1.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 617.828125, "epoch": 3.608, "grad_norm": 0.03627340868115425, "kl": 0.011266708374023438, "learning_rate": 7.492279316554207e-07, "loss": -0.0177, "reward": 7.946985602378845, "reward_std": 0.7550379931926727, "rewards/mrr_reward": 0.80078125, "rewards/rank_analyze_format_reward": 0.7886674106121063, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 612.265625, "epoch": 3.616, "grad_norm": 0.04377419501543045, "kl": 0.0131683349609375, "learning_rate": 7.255484666533874e-07, "loss": -0.0026, "reward": 6.993055105209351, "reward_std": 1.2321006208658218, "rewards/mrr_reward": 0.6157738119363785, "rewards/rank_analyze_format_reward": 0.705741174519062, "rewards/rank_answer_foramt_reward": 0.90234375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.96875, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 572.984375, "epoch": 3.624, "grad_norm": 0.04109544679522514, "kl": 0.01351165771484375, "learning_rate": 7.022351411174866e-07, "loss": 0.005, "reward": 7.255419611930847, "reward_std": 1.0537290424108505, "rewards/mrr_reward": 0.6588541716337204, "rewards/rank_analyze_format_reward": 0.7091151028871536, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 606.953125, "epoch": 3.632, "grad_norm": 0.03640174865722656, "kl": 0.012472152709960938, "learning_rate": 6.792888754178906e-07, "loss": -0.0046, "reward": 7.461974620819092, "reward_std": 0.8970663994550705, "rewards/mrr_reward": 0.688616082072258, "rewards/rank_analyze_format_reward": 0.7289945930242538, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 609.234375, "epoch": 3.64, "grad_norm": 0.03923455998301506, "kl": 0.0132598876953125, "learning_rate": 6.567105754338798e-07, "loss": -0.0055, "reward": 7.089033126831055, "reward_std": 1.0927991718053818, "rewards/mrr_reward": 0.6148189604282379, "rewards/rank_analyze_format_reward": 0.7315036952495575, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.013373362831771374, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 670.09375, "epoch": 3.648, "grad_norm": 0.03714577481150627, "kl": 0.012269973754882812, "learning_rate": 6.345011325180772e-07, "loss": -0.006, "reward": 6.888863801956177, "reward_std": 0.7847508117556572, "rewards/mrr_reward": 0.5322854816913605, "rewards/rank_analyze_format_reward": 0.8160540610551834, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 576.171875, "epoch": 3.656, "grad_norm": 0.03968933969736099, "kl": 0.014535903930664062, "learning_rate": 6.126614234612593e-07, "loss": -0.0031, "reward": 6.883362054824829, "reward_std": 1.1338584274053574, "rewards/mrr_reward": 0.5649925693869591, "rewards/rank_analyze_format_reward": 0.7210482209920883, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 609.625, "epoch": 3.664, "grad_norm": 0.0411507710814476, "kl": 0.014348983764648438, "learning_rate": 5.911923104577455e-07, "loss": -0.017, "reward": 6.8970195055007935, "reward_std": 0.7303311824798584, "rewards/mrr_reward": 0.544177807867527, "rewards/rank_analyze_format_reward": 0.7589471489191055, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9982585161924362, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 600.109375, "epoch": 3.672, "grad_norm": 0.03855804726481438, "kl": 0.013666152954101562, "learning_rate": 5.700946410713548e-07, "loss": 0.0051, "reward": 7.619644403457642, "reward_std": 0.8956380970776081, "rewards/mrr_reward": 0.7256944477558136, "rewards/rank_analyze_format_reward": 0.7556206434965134, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 622.765625, "epoch": 3.68, "grad_norm": 0.04349582642316818, "kl": 0.011423110961914062, "learning_rate": 5.49369248201953e-07, "loss": 0.0175, "reward": 6.194160223007202, "reward_std": 0.9866833090782166, "rewards/mrr_reward": 0.3742373511195183, "rewards/rank_analyze_format_reward": 0.8241638392210007, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.984375, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 605.609375, "epoch": 3.6879999999999997, "grad_norm": 0.035147525370121, "kl": 0.011508941650390625, "learning_rate": 5.290169500525577e-07, "loss": -0.0124, "reward": 7.419980049133301, "reward_std": 0.46903695818036795, "rewards/mrr_reward": 0.7058593779802322, "rewards/rank_analyze_format_reward": 0.5965423956513405, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 607.328125, "epoch": 3.6959999999999997, "grad_norm": 0.038031551986932755, "kl": 0.0143585205078125, "learning_rate": 5.090385500970551e-07, "loss": -0.0282, "reward": 7.524974226951599, "reward_std": 0.7495295517146587, "rewards/mrr_reward": 0.7476562410593033, "rewards/rank_analyze_format_reward": 0.616380512714386, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9609375, "rewards/rank_verify_format_reward": 0.984375, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 624.5625, "epoch": 3.7039999999999997, "grad_norm": 0.03687411919236183, "kl": 0.01457977294921875, "learning_rate": 4.894348370484648e-07, "loss": -0.0306, "reward": 6.9998191595077515, "reward_std": 1.1533474028110504, "rewards/mrr_reward": 0.5960689634084702, "rewards/rank_analyze_format_reward": 0.72789466381073, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9985119104385376, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9828869104385376, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 662.578125, "epoch": 3.7119999999999997, "grad_norm": 0.035118553787469864, "kl": 0.010381698608398438, "learning_rate": 4.702065848278126e-07, "loss": 0.001, "reward": 7.487505078315735, "reward_std": 1.193233162164688, "rewards/mrr_reward": 0.7095052152872086, "rewards/rank_analyze_format_reward": 0.7606973052024841, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9834558814764023, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9834558814764023, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 613.640625, "epoch": 3.7199999999999998, "grad_norm": 0.039265792816877365, "kl": 0.012542724609375, "learning_rate": 4.5135455253357053e-07, "loss": -0.02, "reward": 6.985211730003357, "reward_std": 0.8345009088516235, "rewards/mrr_reward": 0.5719804167747498, "rewards/rank_analyze_format_reward": 0.7383057624101639, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 598.15625, "epoch": 3.7279999999999998, "grad_norm": 0.03676972910761833, "kl": 0.013540267944335938, "learning_rate": 4.3287948441169457e-07, "loss": -0.0256, "reward": 7.533303260803223, "reward_std": 0.6752141863107681, "rewards/mrr_reward": 0.7332217246294022, "rewards/rank_analyze_format_reward": 0.6644462794065475, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9982585161924362, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.9826335161924362, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 623.875, "epoch": 3.7359999999999998, "grad_norm": 0.03843839094042778, "kl": 0.012653350830078125, "learning_rate": 4.1478210982624055e-07, "loss": -0.0137, "reward": 7.101514220237732, "reward_std": 0.712131037376821, "rewards/mrr_reward": 0.6059895902872086, "rewards/rank_analyze_format_reward": 0.7317783385515213, "rewards/rank_answer_foramt_reward": 0.986328125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9992559552192688, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9836309552192688, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 627.5, "epoch": 3.7439999999999998, "grad_norm": 0.038445886224508286, "kl": 0.0115203857421875, "learning_rate": 3.9706314323056936e-07, "loss": -0.001, "reward": 7.2100324630737305, "reward_std": 0.8441106081008911, "rewards/mrr_reward": 0.6200706958770752, "rewards/rank_analyze_format_reward": 0.8110490888357162, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9974361509084702, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9974361509084702, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 567.046875, "epoch": 3.752, "grad_norm": 0.0397077351808548, "kl": 0.012897491455078125, "learning_rate": 3.7972328413914074e-07, "loss": -0.0162, "reward": 7.725077509880066, "reward_std": 1.1218221932649612, "rewards/mrr_reward": 0.7771391421556473, "rewards/rank_analyze_format_reward": 0.6673022508621216, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 634.859375, "epoch": 3.76, "grad_norm": 0.035067442804574966, "kl": 0.01100921630859375, "learning_rate": 3.627632170999029e-07, "loss": -0.0112, "reward": 7.772274732589722, "reward_std": 0.4194560647010803, "rewards/mrr_reward": 0.7544270902872086, "rewards/rank_analyze_format_reward": 0.75864277780056, "rewards/rank_answer_foramt_reward": 1.0, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9979619532823563, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9979619532823563, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 629.296875, "epoch": 3.768, "grad_norm": 0.03988838940858841, "kl": 0.013881683349609375, "learning_rate": 3.4618361166726123e-07, "loss": 0.0089, "reward": 6.951045513153076, "reward_std": 0.9670315980911255, "rewards/mrr_reward": 0.5557911768555641, "rewards/rank_analyze_format_reward": 0.811864972114563, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 610.359375, "epoch": 3.776, "grad_norm": 0.04017311707139015, "kl": 0.012922286987304688, "learning_rate": 3.2998512237565005e-07, "loss": 0.0021, "reward": 6.655686378479004, "reward_std": 0.8838780298829079, "rewards/mrr_reward": 0.49754463881254196, "rewards/rank_analyze_format_reward": 0.7454710304737091, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 598.546875, "epoch": 3.784, "grad_norm": 0.04061814397573471, "kl": 0.015371322631835938, "learning_rate": 3.1416838871368925e-07, "loss": 0.0023, "reward": 6.667187333106995, "reward_std": 1.6450905501842499, "rewards/mrr_reward": 0.546354167163372, "rewards/rank_analyze_format_reward": 0.6706438362598419, "rewards/rank_answer_foramt_reward": 0.822265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9983368366956711, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9983368366956711, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 634.578125, "epoch": 3.792, "grad_norm": 0.03757631406188011, "kl": 0.011842727661132812, "learning_rate": 2.987340350989421e-07, "loss": -0.0277, "reward": 7.062578439712524, "reward_std": 0.7171650826931, "rewards/mrr_reward": 0.5821800529956818, "rewards/rank_analyze_format_reward": 0.8331592828035355, "rewards/rank_answer_foramt_reward": 0.91796875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 630.390625, "epoch": 3.8, "grad_norm": 0.0385047122836113, "kl": 0.013031005859375, "learning_rate": 2.836826708532603e-07, "loss": -0.0071, "reward": 6.774095058441162, "reward_std": 1.0064187571406364, "rewards/mrr_reward": 0.5209139287471771, "rewards/rank_analyze_format_reward": 0.80762679874897, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 621.0, "epoch": 3.808, "grad_norm": 0.03585473448038101, "kl": 0.013561248779296875, "learning_rate": 2.6901489017873375e-07, "loss": 0.0157, "reward": 7.464797139167786, "reward_std": 0.975187674164772, "rewards/mrr_reward": 0.7072048783302307, "rewards/rank_analyze_format_reward": 0.7355870008468628, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 597.921875, "epoch": 3.816, "grad_norm": 0.039436955004930496, "kl": 0.013492584228515625, "learning_rate": 2.547312721342277e-07, "loss": -0.0445, "reward": 6.666959643363953, "reward_std": 0.9370896592736244, "rewards/mrr_reward": 0.5154203921556473, "rewards/rank_analyze_format_reward": 0.6805618405342102, "rewards/rank_answer_foramt_reward": 0.9296875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.997514471411705, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.997514471411705, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 642.046875, "epoch": 3.824, "grad_norm": 0.03860106319189072, "kl": 0.013734817504882812, "learning_rate": 2.4083238061252565e-07, "loss": 0.0084, "reward": 6.7049055099487305, "reward_std": 0.43066950887441635, "rewards/mrr_reward": 0.4952381029725075, "rewards/rank_analyze_format_reward": 0.7649687975645065, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 616.34375, "epoch": 3.832, "grad_norm": 0.041024912148714066, "kl": 0.012483596801757812, "learning_rate": 2.273187643180652e-07, "loss": -0.023, "reward": 6.57793653011322, "reward_std": 1.0732970535755157, "rewards/mrr_reward": 0.4735739082098007, "rewards/rank_analyze_format_reward": 0.7500470578670502, "rewards/rank_answer_foramt_reward": 0.94140625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 606.578125, "epoch": 3.84, "grad_norm": 0.034815359860658646, "kl": 0.011325836181640625, "learning_rate": 2.1419095674527934e-07, "loss": 0.0153, "reward": 6.932149052619934, "reward_std": 1.0536329746246338, "rewards/mrr_reward": 0.5726066380739212, "rewards/rank_analyze_format_reward": 0.731004387140274, "rewards/rank_answer_foramt_reward": 0.943359375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9953981041908264, "rewards/rank_overall_format_reward_more": 0.9765625, "rewards/rank_verify_format_reward": 0.9953981041908264, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 630.40625, "epoch": 3.848, "grad_norm": 0.036056023091077805, "kl": 0.01219940185546875, "learning_rate": 2.014494761575314e-07, "loss": -0.0102, "reward": 7.459859132766724, "reward_std": 0.681392565369606, "rewards/mrr_reward": 0.6882998645305634, "rewards/rank_analyze_format_reward": 0.7418159544467926, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 632.6875, "epoch": 3.856, "grad_norm": 0.038142506033182144, "kl": 0.011157989501953125, "learning_rate": 1.8909482556666026e-07, "loss": -0.0052, "reward": 6.9656277894973755, "reward_std": 1.1684068441390991, "rewards/mrr_reward": 0.5538752377033234, "rewards/rank_analyze_format_reward": 0.8277124911546707, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9953869134187698, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9953869134187698, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 588.96875, "epoch": 3.864, "grad_norm": 0.03848228603601456, "kl": 0.01366424560546875, "learning_rate": 1.7712749271311392e-07, "loss": -0.0115, "reward": 7.90727972984314, "reward_std": 0.9489937871694565, "rewards/mrr_reward": 0.8121279776096344, "rewards/rank_analyze_format_reward": 0.7349397465586662, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 592.140625, "epoch": 3.872, "grad_norm": 0.03674139454960823, "kl": 0.01291656494140625, "learning_rate": 1.6554795004670389e-07, "loss": -0.0121, "reward": 7.151305317878723, "reward_std": 1.3574425652623177, "rewards/mrr_reward": 0.6450272798538208, "rewards/rank_analyze_format_reward": 0.7020555436611176, "rewards/rank_answer_foramt_reward": 0.916015625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.96875, "rewards/rank_verify_format_reward": 0.984375, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 593.875, "epoch": 3.88, "grad_norm": 0.03965899720788002, "kl": 0.012401580810546875, "learning_rate": 1.543566547079467e-07, "loss": -0.0167, "reward": 6.482243657112122, "reward_std": 0.920079916715622, "rewards/mrr_reward": 0.4476686418056488, "rewards/rank_analyze_format_reward": 0.7362185418605804, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.013944223523139954, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 614.71875, "epoch": 3.888, "grad_norm": 0.038730841130018234, "kl": 0.015628814697265625, "learning_rate": 1.4355404851001953e-07, "loss": -0.0013, "reward": 7.664029955863953, "reward_std": 0.8843671232461929, "rewards/mrr_reward": 0.7207217365503311, "rewards/rank_analyze_format_reward": 0.8122782856225967, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 584.703125, "epoch": 3.896, "grad_norm": 0.03808212652802467, "kl": 0.013837814331054688, "learning_rate": 1.3314055792131964e-07, "loss": 0.0003, "reward": 7.29535174369812, "reward_std": 0.6776691898703575, "rewards/mrr_reward": 0.6445312649011612, "rewards/rank_analyze_format_reward": 0.7601954787969589, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 632.71875, "epoch": 3.904, "grad_norm": 0.04019862040877342, "kl": 0.012102127075195312, "learning_rate": 1.231165940486234e-07, "loss": 0.0098, "reward": 7.2379196882247925, "reward_std": 1.0804044008255005, "rewards/mrr_reward": 0.6402343809604645, "rewards/rank_analyze_format_reward": 0.7469863891601562, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 611.21875, "epoch": 3.912, "grad_norm": 0.04155530408024788, "kl": 0.014692306518554688, "learning_rate": 1.134825526208605e-07, "loss": -0.0172, "reward": 6.462707042694092, "reward_std": 1.3991620540618896, "rewards/mrr_reward": 0.4669705033302307, "rewards/rank_analyze_format_reward": 0.7217782437801361, "rewards/rank_answer_foramt_reward": 0.888671875, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 1.0, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 616.140625, "epoch": 3.92, "grad_norm": 0.038728028535842896, "kl": 0.011913299560546875, "learning_rate": 1.0423881397349067e-07, "loss": -0.0274, "reward": 6.584546685218811, "reward_std": 0.987204298377037, "rewards/mrr_reward": 0.47931547462940216, "rewards/rank_analyze_format_reward": 0.7864252328872681, "rewards/rank_answer_foramt_reward": 0.927734375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.984375, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 643.890625, "epoch": 3.928, "grad_norm": 0.037255771458148956, "kl": 0.012048721313476562, "learning_rate": 9.538574303348813e-08, "loss": -0.0003, "reward": 7.248134255409241, "reward_std": 0.609087161719799, "rewards/mrr_reward": 0.6106956899166107, "rewards/rank_analyze_format_reward": 0.8346483111381531, "rewards/rank_answer_foramt_reward": 0.970703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 607.09375, "epoch": 3.936, "grad_norm": 0.03653496131300926, "kl": 0.014324188232421875, "learning_rate": 8.692368930493522e-08, "loss": -0.0078, "reward": 7.079232692718506, "reward_std": 0.7762657403945923, "rewards/mrr_reward": 0.5865451470017433, "rewards/rank_analyze_format_reward": 0.7700468897819519, "rewards/rank_answer_foramt_reward": 0.97265625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 630.9375, "epoch": 3.944, "grad_norm": 0.03887813538312912, "kl": 0.01277923583984375, "learning_rate": 7.885298685522235e-08, "loss": 0.0019, "reward": 6.853779196739197, "reward_std": 0.5883737578988075, "rewards/mrr_reward": 0.5194568485021591, "rewards/rank_analyze_format_reward": 0.8206439018249512, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9981617629528046, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 0.9981617629528046, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 628.890625, "epoch": 3.952, "grad_norm": 0.03760422766208649, "kl": 0.012228012084960938, "learning_rate": 7.117395430186414e-08, "loss": -0.0105, "reward": 7.296409845352173, "reward_std": 1.1353522688150406, "rewards/mrr_reward": 0.6470052301883698, "rewards/rank_analyze_format_reward": 0.7630764245986938, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 632.453125, "epoch": 3.96, "grad_norm": 0.04088395833969116, "kl": 0.011974334716796875, "learning_rate": 6.388689479991606e-08, "loss": -0.014, "reward": 6.792389988899231, "reward_std": 0.9225097447633743, "rewards/mrr_reward": 0.5218750275671482, "rewards/rank_analyze_format_reward": 0.7651283890008926, "rewards/rank_answer_foramt_reward": 0.95703125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9991776347160339, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 586.046875, "epoch": 3.968, "grad_norm": 0.038039859384298325, "kl": 0.012334823608398438, "learning_rate": 5.699209603001077e-08, "loss": -0.0243, "reward": 7.0821181535720825, "reward_std": 1.2305900156497955, "rewards/mrr_reward": 0.6263020932674408, "rewards/rank_analyze_format_reward": 0.676519088447094, "rewards/rank_answer_foramt_reward": 0.931640625, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.984375, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 619.65625, "epoch": 3.976, "grad_norm": 0.042080074548721313, "kl": 0.012048721313476562, "learning_rate": 5.048983018699827e-08, "loss": 0.0112, "reward": 7.104416489601135, "reward_std": 1.2093525528907776, "rewards/mrr_reward": 0.5863157212734222, "rewards/rank_analyze_format_reward": 0.7825910001993179, "rewards/rank_answer_foramt_reward": 0.984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 0.9921875, "rewards/rank_verify_format_reward": 1.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 602.734375, "epoch": 3.984, "grad_norm": 0.038315799087285995, "kl": 0.010751724243164062, "learning_rate": 4.438035396920004e-08, "loss": -0.0141, "reward": 6.625136733055115, "reward_std": 0.9687140211462975, "rewards/mrr_reward": 0.4734809100627899, "rewards/rank_analyze_format_reward": 0.7722287178039551, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 1.0, "rewards/rank_overall_format_reward_more": 1.0, "rewards/rank_verify_format_reward": 1.0, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 618.96875, "epoch": 3.992, "grad_norm": 0.03733893856406212, "kl": 0.01275634765625, "learning_rate": 3.866390856827495e-08, "loss": -0.0313, "reward": 7.275609493255615, "reward_std": 0.918476015329361, "rewards/mrr_reward": 0.6309213787317276, "rewards/rank_analyze_format_reward": 0.8395065367221832, "rewards/rank_answer_foramt_reward": 0.9453125, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9991776347160339, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9835526347160339, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 582.34375, "epoch": 4.0, "grad_norm": 0.03825841844081879, "kl": 0.015573501586914062, "learning_rate": 3.3340719659701315e-08, "loss": -0.0178, "reward": 7.2012619972229, "reward_std": 0.7794432565569878, "rewards/mrr_reward": 0.6161458268761635, "rewards/rank_analyze_format_reward": 0.795157715678215, "rewards/rank_answer_foramt_reward": 0.958984375, "rewards/rank_contrast_format_reward": 0.0, "rewards/rank_initial_format_reward": 0.9990808814764023, "rewards/rank_overall_format_reward_more": 0.984375, "rewards/rank_verify_format_reward": 0.9990808814764023, "step": 500 }, { "epoch": 4.0, "step": 500, "total_flos": 0.0, "train_loss": -0.0018289514125790446, "train_runtime": 36870.3642, "train_samples_per_second": 0.868, "train_steps_per_second": 0.014 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }