| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.8125, |
| "epoch": 0.008, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0618, |
| "reward": 4.689006567001343, |
| "reward_std": 1.78610560297966, |
| "rewards/mrr_reward": 0.2938988097012043, |
| "rewards/rank_analyze_format_reward": 0.11466514505445957, |
| "rewards/rank_answer_foramt_reward": 0.501953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.734375, |
| "epoch": 0.016, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": 0.0063, |
| "reward": 3.9557588696479797, |
| "reward_std": 1.5732559561729431, |
| "rewards/mrr_reward": 0.169766865670681, |
| "rewards/rank_analyze_format_reward": 0.07681952975690365, |
| "rewards/rank_answer_foramt_reward": 0.36328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9807952791452408, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9807952791452408, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 441.203125, |
| "epoch": 0.024, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0565, |
| "reward": 4.4016576409339905, |
| "reward_std": 1.6944840550422668, |
| "rewards/mrr_reward": 0.2554253488779068, |
| "rewards/rank_analyze_format_reward": 0.11000172607600689, |
| "rewards/rank_answer_foramt_reward": 0.4140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.03125, |
| "epoch": 0.032, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0361, |
| "reward": 5.027822136878967, |
| "reward_std": 1.9866893887519836, |
| "rewards/mrr_reward": 0.3529265820980072, |
| "rewards/rank_analyze_format_reward": 0.15089312940835953, |
| "rewards/rank_answer_foramt_reward": 0.5546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998236283659935, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.998236283659935, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.078125, |
| "epoch": 0.04, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0246, |
| "reward": 4.68894362449646, |
| "reward_std": 1.7762902677059174, |
| "rewards/mrr_reward": 0.3031250014901161, |
| "rewards/rank_analyze_format_reward": 0.2197269294410944, |
| "rewards/rank_answer_foramt_reward": 0.474609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9808974117040634, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.9808974117040634, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.78125, |
| "epoch": 0.048, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0422, |
| "reward": 4.237152338027954, |
| "reward_std": 1.5055316388607025, |
| "rewards/mrr_reward": 0.19459325820207596, |
| "rewards/rank_analyze_format_reward": 0.14743656385689974, |
| "rewards/rank_answer_foramt_reward": 0.46875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9994212985038757, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9837962985038757, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.6875, |
| "epoch": 0.056, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.023, |
| "reward": 4.391666889190674, |
| "reward_std": 1.7478849291801453, |
| "rewards/mrr_reward": 0.27126736007630825, |
| "rewards/rank_analyze_format_reward": 0.13924695551395416, |
| "rewards/rank_answer_foramt_reward": 0.4375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977376908063889, |
| "rewards/rank_overall_format_reward_more": 0.78125, |
| "rewards/rank_verify_format_reward": 0.9508626908063889, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.890625, |
| "epoch": 0.064, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0161, |
| "reward": 4.737706661224365, |
| "reward_std": 1.8800698816776276, |
| "rewards/mrr_reward": 0.328125, |
| "rewards/rank_analyze_format_reward": 0.08576204627752304, |
| "rewards/rank_answer_foramt_reward": 0.484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.997436136007309, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.546875, |
| "epoch": 0.072, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.026, |
| "reward": 4.544053554534912, |
| "reward_std": 2.140646994113922, |
| "rewards/mrr_reward": 0.3108258917927742, |
| "rewards/rank_analyze_format_reward": 0.09371883049607277, |
| "rewards/rank_answer_foramt_reward": 0.44140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8125, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.078125, |
| "epoch": 0.08, |
| "grad_norm": 0.02036167122423649, |
| "kl": 0.0, |
| "learning_rate": 1.9999999684172664e-05, |
| "loss": -0.0462, |
| "reward": 4.728065490722656, |
| "reward_std": 1.9379011690616608, |
| "rewards/mrr_reward": 0.3036644458770752, |
| "rewards/rank_analyze_format_reward": 0.2416149042546749, |
| "rewards/rank_answer_foramt_reward": 0.4921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.828125, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.84375, |
| "epoch": 0.088, |
| "grad_norm": 0.02036167122423649, |
| "kl": -5.602836608886719e-06, |
| "learning_rate": 1.9999999684172664e-05, |
| "loss": -0.0299, |
| "reward": 4.586392045021057, |
| "reward_std": 1.808391511440277, |
| "rewards/mrr_reward": 0.26946303993463516, |
| "rewards/rank_analyze_format_reward": 0.178153439424932, |
| "rewards/rank_answer_foramt_reward": 0.52734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.078125, |
| "epoch": 0.096, |
| "grad_norm": 0.020598648115992546, |
| "kl": -6.273388862609863e-06, |
| "learning_rate": 1.9999998736690666e-05, |
| "loss": -0.019, |
| "reward": 4.161486208438873, |
| "reward_std": 1.7841115891933441, |
| "rewards/mrr_reward": 0.21319444477558136, |
| "rewards/rank_analyze_format_reward": 0.09823539853096008, |
| "rewards/rank_answer_foramt_reward": 0.396484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968380630016327, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9812130630016327, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 459.578125, |
| "epoch": 0.104, |
| "grad_norm": 0.021508827805519104, |
| "kl": -4.723668098449707e-06, |
| "learning_rate": 1.999999715755407e-05, |
| "loss": -0.0384, |
| "reward": 4.695295810699463, |
| "reward_std": 1.5369611978530884, |
| "rewards/mrr_reward": 0.3081597238779068, |
| "rewards/rank_analyze_format_reward": 0.11086451821029186, |
| "rewards/rank_answer_foramt_reward": 0.48046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.015625, |
| "epoch": 0.112, |
| "grad_norm": 0.022063156589865685, |
| "kl": -5.081295967102051e-06, |
| "learning_rate": 1.9999994946762974e-05, |
| "loss": -0.0454, |
| "reward": 4.200581610202789, |
| "reward_std": 1.8469471633434296, |
| "rewards/mrr_reward": 0.20451389625668526, |
| "rewards/rank_analyze_format_reward": 0.16768221091479063, |
| "rewards/rank_answer_foramt_reward": 0.41015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 452.5, |
| "epoch": 0.12, |
| "grad_norm": 0.022208023816347122, |
| "kl": -4.26173210144043e-06, |
| "learning_rate": 1.999999210431752e-05, |
| "loss": -0.0243, |
| "reward": 4.085702300071716, |
| "reward_std": 1.512882336974144, |
| "rewards/mrr_reward": 0.17906746454536915, |
| "rewards/rank_analyze_format_reward": 0.14385094121098518, |
| "rewards/rank_answer_foramt_reward": 0.4375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9956032931804657, |
| "rewards/rank_overall_format_reward_more": 0.796875, |
| "rewards/rank_verify_format_reward": 0.9956032931804657, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.296875, |
| "epoch": 0.128, |
| "grad_norm": 0.020399712026119232, |
| "kl": -4.0084123611450195e-06, |
| "learning_rate": 1.9999988630217885e-05, |
| "loss": -0.0316, |
| "reward": 5.109304070472717, |
| "reward_std": 1.985443890094757, |
| "rewards/mrr_reward": 0.3867187425494194, |
| "rewards/rank_analyze_format_reward": 0.17149577103555202, |
| "rewards/rank_answer_foramt_reward": 0.580078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8125, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.90625, |
| "epoch": 0.136, |
| "grad_norm": 0.02226601168513298, |
| "kl": -2.7865171432495117e-06, |
| "learning_rate": 1.999998452446429e-05, |
| "loss": -0.032, |
| "reward": 4.289996266365051, |
| "reward_std": 1.757462590932846, |
| "rewards/mrr_reward": 0.2313119969330728, |
| "rewards/rank_analyze_format_reward": 0.15154925920069218, |
| "rewards/rank_answer_foramt_reward": 0.44140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.328125, |
| "epoch": 0.144, |
| "grad_norm": 0.02119840867817402, |
| "kl": -2.086162567138672e-07, |
| "learning_rate": 1.9999979787056998e-05, |
| "loss": -0.0259, |
| "reward": 4.4557565450668335, |
| "reward_std": 1.1966679394245148, |
| "rewards/mrr_reward": 0.22187501564621925, |
| "rewards/rank_analyze_format_reward": 0.1767062321305275, |
| "rewards/rank_answer_foramt_reward": 0.521484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975329041481018, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9975329041481018, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.53125, |
| "epoch": 0.152, |
| "grad_norm": 0.02239903435111046, |
| "kl": -2.2351741790771484e-07, |
| "learning_rate": 1.9999974417996303e-05, |
| "loss": -0.0161, |
| "reward": 4.088248610496521, |
| "reward_std": 1.54827019572258, |
| "rewards/mrr_reward": 0.18916791677474976, |
| "rewards/rank_analyze_format_reward": 0.09350559022277594, |
| "rewards/rank_answer_foramt_reward": 0.42578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959887713193893, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9803637713193893, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.40625, |
| "epoch": 0.16, |
| "grad_norm": 0.02158363349735737, |
| "kl": 2.995133399963379e-06, |
| "learning_rate": 1.9999968417282542e-05, |
| "loss": -0.0394, |
| "reward": 5.011839747428894, |
| "reward_std": 1.7887286245822906, |
| "rewards/mrr_reward": 0.35902776941657066, |
| "rewards/rank_analyze_format_reward": 0.1226036436855793, |
| "rewards/rank_answer_foramt_reward": 0.59375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.265625, |
| "epoch": 0.168, |
| "grad_norm": 0.019902769476175308, |
| "kl": 2.086162567138672e-06, |
| "learning_rate": 1.99999617849161e-05, |
| "loss": -0.007, |
| "reward": 4.999041318893433, |
| "reward_std": 2.092874825000763, |
| "rewards/mrr_reward": 0.33280009776353836, |
| "rewards/rank_analyze_format_reward": 0.3474135100841522, |
| "rewards/rank_answer_foramt_reward": 0.564453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9678308814764023, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 462.078125, |
| "epoch": 0.176, |
| "grad_norm": 0.0211955476552248, |
| "kl": 6.16908073425293e-06, |
| "learning_rate": 1.9999954520897394e-05, |
| "loss": 0.0067, |
| "reward": 4.904757022857666, |
| "reward_std": 1.5794726610183716, |
| "rewards/mrr_reward": 0.35468750447034836, |
| "rewards/rank_analyze_format_reward": 0.09507373627275229, |
| "rewards/rank_answer_foramt_reward": 0.611328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8125, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.828125, |
| "epoch": 0.184, |
| "grad_norm": 0.020976468920707703, |
| "kl": 6.943941116333008e-06, |
| "learning_rate": 1.999994662522688e-05, |
| "loss": -0.0219, |
| "reward": 5.450310587882996, |
| "reward_std": 1.9311817586421967, |
| "rewards/mrr_reward": 0.44487228244543076, |
| "rewards/rank_analyze_format_reward": 0.20164816547185183, |
| "rewards/rank_answer_foramt_reward": 0.66796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9670085161924362, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.34375, |
| "epoch": 0.192, |
| "grad_norm": 0.0208530742675066, |
| "kl": 1.093745231628418e-05, |
| "learning_rate": 1.9999938097905064e-05, |
| "loss": -0.0345, |
| "reward": 4.764381527900696, |
| "reward_std": 1.8450036644935608, |
| "rewards/mrr_reward": 0.31850818172097206, |
| "rewards/rank_analyze_format_reward": 0.1051585366949439, |
| "rewards/rank_answer_foramt_reward": 0.546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9972826093435287, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9816576093435287, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.484375, |
| "epoch": 0.2, |
| "grad_norm": 0.02045305259525776, |
| "kl": 1.1593103408813477e-05, |
| "learning_rate": 1.9999928938932473e-05, |
| "loss": -0.0176, |
| "reward": 4.7958372831344604, |
| "reward_std": 1.7617928981781006, |
| "rewards/mrr_reward": 0.2876054085791111, |
| "rewards/rank_analyze_format_reward": 0.28027439024299383, |
| "rewards/rank_answer_foramt_reward": 0.48828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9970238208770752, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9970238208770752, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.140625, |
| "epoch": 0.208, |
| "grad_norm": 0.02045305259525776, |
| "kl": 1.919269561767578e-05, |
| "learning_rate": 1.9999928938932473e-05, |
| "loss": -0.002, |
| "reward": 4.298715710639954, |
| "reward_std": 1.676234632730484, |
| "rewards/mrr_reward": 0.2303757481276989, |
| "rewards/rank_analyze_format_reward": 0.11518567334860563, |
| "rewards/rank_answer_foramt_reward": 0.458984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9679276347160339, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.71875, |
| "epoch": 0.216, |
| "grad_norm": 0.02045305259525776, |
| "kl": 1.0028481483459473e-05, |
| "learning_rate": 1.9999928938932473e-05, |
| "loss": -0.0122, |
| "reward": 4.261886656284332, |
| "reward_std": 1.7420227527618408, |
| "rewards/mrr_reward": 0.19882812350988388, |
| "rewards/rank_analyze_format_reward": 0.19866678677499294, |
| "rewards/rank_answer_foramt_reward": 0.44921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9837500005960464, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9990011900663376, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.4375, |
| "epoch": 0.224, |
| "grad_norm": 0.020556651055812836, |
| "kl": 1.0758638381958008e-05, |
| "learning_rate": 1.99999191483097e-05, |
| "loss": -0.0292, |
| "reward": 4.516226172447205, |
| "reward_std": 1.9960070848464966, |
| "rewards/mrr_reward": 0.28723958507180214, |
| "rewards/rank_analyze_format_reward": 0.14337314292788506, |
| "rewards/rank_answer_foramt_reward": 0.470703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9820645451545715, |
| "rewards/rank_overall_format_reward_more": 0.7890625, |
| "rewards/rank_verify_format_reward": 0.9820645451545715, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.453125, |
| "epoch": 0.232, |
| "grad_norm": 0.019716233015060425, |
| "kl": 1.9982457160949707e-05, |
| "learning_rate": 1.999990872603735e-05, |
| "loss": -0.017, |
| "reward": 4.805420398712158, |
| "reward_std": 1.657298356294632, |
| "rewards/mrr_reward": 0.3268229216337204, |
| "rewards/rank_analyze_format_reward": 0.14590902999043465, |
| "rewards/rank_answer_foramt_reward": 0.490234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9934926480054855, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9934926480054855, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.375, |
| "epoch": 0.24, |
| "grad_norm": 0.02322639897465706, |
| "kl": 1.965463161468506e-05, |
| "learning_rate": 1.999989767211609e-05, |
| "loss": -0.0386, |
| "reward": 4.979418992996216, |
| "reward_std": 1.6339992135763168, |
| "rewards/mrr_reward": 0.3164062537252903, |
| "rewards/rank_analyze_format_reward": 0.31370767019689083, |
| "rewards/rank_answer_foramt_reward": 0.51953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9988712668418884, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9988712668418884, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.875, |
| "epoch": 0.248, |
| "grad_norm": 0.020231744274497032, |
| "kl": 2.7373433113098145e-05, |
| "learning_rate": 1.9999885986546613e-05, |
| "loss": -0.0448, |
| "reward": 4.7086580991744995, |
| "reward_std": 1.7371686697006226, |
| "rewards/mrr_reward": 0.2849392406642437, |
| "rewards/rank_analyze_format_reward": 0.16265114955604076, |
| "rewards/rank_answer_foramt_reward": 0.4921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.484375, |
| "epoch": 0.256, |
| "grad_norm": 0.01962853968143463, |
| "kl": 3.975629806518555e-05, |
| "learning_rate": 1.999987366932966e-05, |
| "loss": -0.0411, |
| "reward": 4.679190993309021, |
| "reward_std": 1.5342676639556885, |
| "rewards/mrr_reward": 0.27297867834568024, |
| "rewards/rank_analyze_format_reward": 0.16313984990119934, |
| "rewards/rank_answer_foramt_reward": 0.56640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 454.515625, |
| "epoch": 0.264, |
| "grad_norm": 0.02267865277826786, |
| "kl": 3.771483898162842e-05, |
| "learning_rate": 1.9999860720466007e-05, |
| "loss": -0.0034, |
| "reward": 4.1931135058403015, |
| "reward_std": 1.5233525335788727, |
| "rewards/mrr_reward": 0.19470486417412758, |
| "rewards/rank_analyze_format_reward": 0.10926186013966799, |
| "rewards/rank_answer_foramt_reward": 0.443359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9972426444292068, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.9972426444292068, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.953125, |
| "epoch": 0.272, |
| "grad_norm": 0.02176724746823311, |
| "kl": 5.410611629486084e-05, |
| "learning_rate": 1.9999847139956477e-05, |
| "loss": -0.0314, |
| "reward": 4.550845384597778, |
| "reward_std": 1.958255022764206, |
| "rewards/mrr_reward": 0.30027903243899345, |
| "rewards/rank_analyze_format_reward": 0.0568907568231225, |
| "rewards/rank_answer_foramt_reward": 0.529296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9677083343267441, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.9520833343267441, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.65625, |
| "epoch": 0.28, |
| "grad_norm": 0.0221868809312582, |
| "kl": 5.4582953453063965e-05, |
| "learning_rate": 1.9999832927801922e-05, |
| "loss": -0.0057, |
| "reward": 4.710769176483154, |
| "reward_std": 1.6857908964157104, |
| "rewards/mrr_reward": 0.3105034828186035, |
| "rewards/rank_analyze_format_reward": 0.1985221654176712, |
| "rewards/rank_answer_foramt_reward": 0.544921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9798430800437927, |
| "rewards/rank_overall_format_reward_more": 0.78125, |
| "rewards/rank_verify_format_reward": 0.9642180800437927, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.78125, |
| "epoch": 0.288, |
| "grad_norm": 0.02121078222990036, |
| "kl": 6.13182783126831e-05, |
| "learning_rate": 1.9999818084003243e-05, |
| "loss": -0.0368, |
| "reward": 5.009979605674744, |
| "reward_std": 1.9656108021736145, |
| "rewards/mrr_reward": 0.32831721380352974, |
| "rewards/rank_analyze_format_reward": 0.24603652395308018, |
| "rewards/rank_answer_foramt_reward": 0.58984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9811964929103851, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.9811964929103851, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.40625, |
| "epoch": 0.296, |
| "grad_norm": 0.022762347012758255, |
| "kl": 7.359683513641357e-05, |
| "learning_rate": 1.999980260856137e-05, |
| "loss": 0.0164, |
| "reward": 4.261849403381348, |
| "reward_std": 1.6020236611366272, |
| "rewards/mrr_reward": 0.20416666939854622, |
| "rewards/rank_analyze_format_reward": 0.16004161350429058, |
| "rewards/rank_answer_foramt_reward": 0.4375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9980392158031464, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.9980392158031464, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.59375, |
| "epoch": 0.304, |
| "grad_norm": 0.02300061471760273, |
| "kl": 6.565451622009277e-05, |
| "learning_rate": 1.9999786501477298e-05, |
| "loss": -0.0407, |
| "reward": 4.600297033786774, |
| "reward_std": 1.597813993692398, |
| "rewards/mrr_reward": 0.2838975712656975, |
| "rewards/rank_analyze_format_reward": 0.10455834865570068, |
| "rewards/rank_answer_foramt_reward": 0.50390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 468.828125, |
| "epoch": 0.312, |
| "grad_norm": 0.022656837478280067, |
| "kl": 9.85860824584961e-05, |
| "learning_rate": 1.9999769762752024e-05, |
| "loss": -0.0421, |
| "reward": 4.96368944644928, |
| "reward_std": 1.8781414777040482, |
| "rewards/mrr_reward": 0.33848586305975914, |
| "rewards/rank_analyze_format_reward": 0.16822483576834202, |
| "rewards/rank_answer_foramt_reward": 0.654296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.1875, |
| "epoch": 0.32, |
| "grad_norm": 0.023156002163887024, |
| "kl": 0.0001109689474105835, |
| "learning_rate": 1.999975239238662e-05, |
| "loss": -0.0188, |
| "reward": 5.189586162567139, |
| "reward_std": 2.028193384408951, |
| "rewards/mrr_reward": 0.36250000447034836, |
| "rewards/rank_analyze_format_reward": 0.3480729628354311, |
| "rewards/rank_answer_foramt_reward": 0.560546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.981889471411705, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.981889471411705, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.5625, |
| "epoch": 0.328, |
| "grad_norm": 0.021381191909313202, |
| "kl": 0.00012908875942230225, |
| "learning_rate": 1.999973439038218e-05, |
| "loss": -0.0281, |
| "reward": 4.959184765815735, |
| "reward_std": 2.1065359711647034, |
| "rewards/mrr_reward": 0.36927083879709244, |
| "rewards/rank_analyze_format_reward": 0.1428538914769888, |
| "rewards/rank_answer_foramt_reward": 0.48828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9918892979621887, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9762642979621887, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.0, |
| "epoch": 0.336, |
| "grad_norm": 0.02237197570502758, |
| "kl": 0.00011831521987915039, |
| "learning_rate": 1.9999715756739833e-05, |
| "loss": -0.0379, |
| "reward": 4.825831055641174, |
| "reward_std": 1.8668445944786072, |
| "rewards/mrr_reward": 0.3253224194049835, |
| "rewards/rank_analyze_format_reward": 0.163555265404284, |
| "rewards/rank_answer_foramt_reward": 0.568359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9900633096694946, |
| "rewards/rank_overall_format_reward_more": 0.828125, |
| "rewards/rank_verify_format_reward": 0.9744383096694946, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.34375, |
| "epoch": 0.344, |
| "grad_norm": 0.022024238482117653, |
| "kl": 0.00014644861221313477, |
| "learning_rate": 1.9999696491460764e-05, |
| "loss": -0.0215, |
| "reward": 4.890589237213135, |
| "reward_std": 1.6738486886024475, |
| "rewards/mrr_reward": 0.3286830335855484, |
| "rewards/rank_analyze_format_reward": 0.15786650124937296, |
| "rewards/rank_answer_foramt_reward": 0.56640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9961046874523163, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9961046874523163, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.234375, |
| "epoch": 0.352, |
| "grad_norm": 0.02275724522769451, |
| "kl": 0.00016573071479797363, |
| "learning_rate": 1.9999676594546187e-05, |
| "loss": -0.0215, |
| "reward": 5.033377289772034, |
| "reward_std": 1.8407581448554993, |
| "rewards/mrr_reward": 0.3557477742433548, |
| "rewards/rank_analyze_format_reward": 0.14718732610344887, |
| "rewards/rank_answer_foramt_reward": 0.59765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.625, |
| "epoch": 0.36, |
| "grad_norm": 0.023730719462037086, |
| "kl": 0.00015407800674438477, |
| "learning_rate": 1.999965606599736e-05, |
| "loss": -0.0031, |
| "reward": 5.316616773605347, |
| "reward_std": 1.5850826501846313, |
| "rewards/mrr_reward": 0.4290550574660301, |
| "rewards/rank_analyze_format_reward": 0.08148389589041471, |
| "rewards/rank_answer_foramt_reward": 0.697265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9928547292947769, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.9772297292947769, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.9375, |
| "epoch": 0.368, |
| "grad_norm": 0.021739846095442772, |
| "kl": 0.00018548965454101562, |
| "learning_rate": 1.999963490581558e-05, |
| "loss": -0.0254, |
| "reward": 5.217623829841614, |
| "reward_std": 1.4084790647029877, |
| "rewards/mrr_reward": 0.33927951753139496, |
| "rewards/rank_analyze_format_reward": 0.3564212815836072, |
| "rewards/rank_answer_foramt_reward": 0.6953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998135969042778, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.966885969042778, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 473.734375, |
| "epoch": 0.376, |
| "grad_norm": 0.023394783958792686, |
| "kl": 0.00021630525588989258, |
| "learning_rate": 1.9999613114002184e-05, |
| "loss": -0.0309, |
| "reward": 4.08813738822937, |
| "reward_std": 1.2790243327617645, |
| "rewards/mrr_reward": 0.14723462983965874, |
| "rewards/rank_analyze_format_reward": 0.15144313033670187, |
| "rewards/rank_answer_foramt_reward": 0.431640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9971200972795486, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9971200972795486, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.421875, |
| "epoch": 0.384, |
| "grad_norm": 0.027028290554881096, |
| "kl": 0.00026175379753112793, |
| "learning_rate": 1.9999590690558545e-05, |
| "loss": -0.054, |
| "reward": 5.350240349769592, |
| "reward_std": 1.9697438478469849, |
| "rewards/mrr_reward": 0.42695312947034836, |
| "rewards/rank_analyze_format_reward": 0.21482349652796984, |
| "rewards/rank_answer_foramt_reward": 0.625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9989583343267441, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9677083343267441, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.34375, |
| "epoch": 0.392, |
| "grad_norm": 0.021585488691926003, |
| "kl": 0.0002930760383605957, |
| "learning_rate": 1.9999567635486086e-05, |
| "loss": -0.0243, |
| "reward": 4.152051568031311, |
| "reward_std": 1.6824184954166412, |
| "rewards/mrr_reward": 0.18816964142024517, |
| "rewards/rank_analyze_format_reward": 0.12541021592915058, |
| "rewards/rank_answer_foramt_reward": 0.39453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983095824718475, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9983095824718475, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.484375, |
| "epoch": 0.4, |
| "grad_norm": 0.022128406912088394, |
| "kl": 0.00023129582405090332, |
| "learning_rate": 1.9999543948786258e-05, |
| "loss": -0.0018, |
| "reward": 4.990848183631897, |
| "reward_std": 1.9261715412139893, |
| "rewards/mrr_reward": 0.3342633992433548, |
| "rewards/rank_analyze_format_reward": 0.1260274900123477, |
| "rewards/rank_answer_foramt_reward": 0.609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.078125, |
| "epoch": 0.408, |
| "grad_norm": 0.023543158546090126, |
| "kl": 0.0002911984920501709, |
| "learning_rate": 1.9999519630460554e-05, |
| "loss": -0.0076, |
| "reward": 5.144826769828796, |
| "reward_std": 1.6632727682590485, |
| "rewards/mrr_reward": 0.3661458343267441, |
| "rewards/rank_analyze_format_reward": 0.16852473467588425, |
| "rewards/rank_answer_foramt_reward": 0.59765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.515625, |
| "epoch": 0.416, |
| "grad_norm": 0.024417538195848465, |
| "kl": 0.0004246234893798828, |
| "learning_rate": 1.999949468051052e-05, |
| "loss": -0.0313, |
| "reward": 5.0145174860954285, |
| "reward_std": 1.8828826546669006, |
| "rewards/mrr_reward": 0.38802083767950535, |
| "rewards/rank_analyze_format_reward": 0.10110596101731062, |
| "rewards/rank_answer_foramt_reward": 0.556640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 494.75, |
| "epoch": 0.424, |
| "grad_norm": 0.024848150089383125, |
| "kl": 0.0002892911434173584, |
| "learning_rate": 1.9999469098937726e-05, |
| "loss": -0.0361, |
| "reward": 4.832870543003082, |
| "reward_std": 1.565253883600235, |
| "rewards/mrr_reward": 0.2958891298621893, |
| "rewards/rank_analyze_format_reward": 0.1942360121756792, |
| "rewards/rank_answer_foramt_reward": 0.611328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.5625, |
| "epoch": 0.432, |
| "grad_norm": 0.02211805246770382, |
| "kl": 0.00029155611991882324, |
| "learning_rate": 1.9999442885743785e-05, |
| "loss": -0.016, |
| "reward": 4.681830644607544, |
| "reward_std": 1.6615483164787292, |
| "rewards/mrr_reward": 0.28389756940305233, |
| "rewards/rank_analyze_format_reward": 0.1718399478122592, |
| "rewards/rank_answer_foramt_reward": 0.568359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9811454266309738, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9655204266309738, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 471.765625, |
| "epoch": 0.44, |
| "grad_norm": 0.02444814145565033, |
| "kl": 0.0004519224166870117, |
| "learning_rate": 1.9999416040930354e-05, |
| "loss": -0.0462, |
| "reward": 5.167219042778015, |
| "reward_std": 1.9449047446250916, |
| "rewards/mrr_reward": 0.3921875059604645, |
| "rewards/rank_analyze_format_reward": 0.1719050519168377, |
| "rewards/rank_answer_foramt_reward": 0.513671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9955085963010788, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9955085963010788, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.21875, |
| "epoch": 0.448, |
| "grad_norm": 0.024404721334576607, |
| "kl": 0.00047457218170166016, |
| "learning_rate": 1.9999388564499135e-05, |
| "loss": -0.047, |
| "reward": 5.111963272094727, |
| "reward_std": 1.9699311256408691, |
| "rewards/mrr_reward": 0.340104166418314, |
| "rewards/rank_analyze_format_reward": 0.30795731022953987, |
| "rewards/rank_answer_foramt_reward": 0.650390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9669117629528046, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9669117629528046, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.421875, |
| "epoch": 0.456, |
| "grad_norm": 0.024884849786758423, |
| "kl": 0.0005426406860351562, |
| "learning_rate": 1.999936045645186e-05, |
| "loss": -0.0116, |
| "reward": 4.459952890872955, |
| "reward_std": 1.6162844747304916, |
| "rewards/mrr_reward": 0.24435143917798996, |
| "rewards/rank_analyze_format_reward": 0.10262943152338266, |
| "rewards/rank_answer_foramt_reward": 0.5234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985526353120804, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.9673026353120804, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.625, |
| "epoch": 0.464, |
| "grad_norm": 0.02534506469964981, |
| "kl": 0.0007425546646118164, |
| "learning_rate": 1.9999331716790303e-05, |
| "loss": -0.0169, |
| "reward": 4.837222576141357, |
| "reward_std": 1.9827671647071838, |
| "rewards/mrr_reward": 0.33585068956017494, |
| "rewards/rank_analyze_format_reward": 0.20996354706585407, |
| "rewards/rank_answer_foramt_reward": 0.470703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9807952791452408, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.9807952791452408, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 530.046875, |
| "epoch": 0.472, |
| "grad_norm": 0.022839965298771858, |
| "kl": 0.0004405379295349121, |
| "learning_rate": 1.9999302345516278e-05, |
| "loss": -0.0295, |
| "reward": 5.279780864715576, |
| "reward_std": 1.9629344046115875, |
| "rewards/mrr_reward": 0.36336806416511536, |
| "rewards/rank_analyze_format_reward": 0.2832249477505684, |
| "rewards/rank_answer_foramt_reward": 0.654296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.65625, |
| "epoch": 0.48, |
| "grad_norm": 0.0263381227850914, |
| "kl": 0.0005091428756713867, |
| "learning_rate": 1.9999272342631644e-05, |
| "loss": -0.0381, |
| "reward": 6.471034526824951, |
| "reward_std": 1.9417240023612976, |
| "rewards/mrr_reward": 0.6197172403335571, |
| "rewards/rank_analyze_format_reward": 0.26364994794130325, |
| "rewards/rank_answer_foramt_reward": 0.791015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.421875, |
| "epoch": 0.488, |
| "grad_norm": 0.02566557377576828, |
| "kl": 0.0005975961685180664, |
| "learning_rate": 1.9999241708138296e-05, |
| "loss": -0.0056, |
| "reward": 5.077809810638428, |
| "reward_std": 1.307851292192936, |
| "rewards/mrr_reward": 0.35500991344451904, |
| "rewards/rank_analyze_format_reward": 0.10372397117316723, |
| "rewards/rank_answer_foramt_reward": 0.6328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9957729876041412, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9957729876041412, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 472.875, |
| "epoch": 0.496, |
| "grad_norm": 0.027827711775898933, |
| "kl": 0.000952601432800293, |
| "learning_rate": 1.9999210442038164e-05, |
| "loss": -0.0339, |
| "reward": 4.869051575660706, |
| "reward_std": 1.8942435383796692, |
| "rewards/mrr_reward": 0.3203125074505806, |
| "rewards/rank_analyze_format_reward": 0.16581160761415958, |
| "rewards/rank_answer_foramt_reward": 0.548828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.609375, |
| "epoch": 0.504, |
| "grad_norm": 0.024270422756671906, |
| "kl": 0.000729680061340332, |
| "learning_rate": 1.9999178544333228e-05, |
| "loss": 0.0064, |
| "reward": 5.877958178520203, |
| "reward_std": 1.8244962692260742, |
| "rewards/mrr_reward": 0.5174479112029076, |
| "rewards/rank_analyze_format_reward": 0.19235198944807053, |
| "rewards/rank_answer_foramt_reward": 0.736328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.46875, |
| "epoch": 0.512, |
| "grad_norm": 0.022133484482765198, |
| "kl": 0.0008175373077392578, |
| "learning_rate": 1.9999146015025503e-05, |
| "loss": 0.0092, |
| "reward": 5.555278539657593, |
| "reward_std": 1.9869469702243805, |
| "rewards/mrr_reward": 0.45848215371370316, |
| "rewards/rank_analyze_format_reward": 0.22101733088493347, |
| "rewards/rank_answer_foramt_reward": 0.666015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9913771450519562, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9601271450519562, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.625, |
| "epoch": 0.52, |
| "grad_norm": 0.024551959708333015, |
| "kl": 0.0007832050323486328, |
| "learning_rate": 1.999911285411704e-05, |
| "loss": -0.0049, |
| "reward": 5.41889089345932, |
| "reward_std": 1.9643912464380264, |
| "rewards/mrr_reward": 0.43281250074505806, |
| "rewards/rank_analyze_format_reward": 0.2138027586042881, |
| "rewards/rank_answer_foramt_reward": 0.62890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9927783608436584, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9771533608436584, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 528.5, |
| "epoch": 0.528, |
| "grad_norm": 0.02290545031428337, |
| "kl": 0.0008490085601806641, |
| "learning_rate": 1.9999079061609933e-05, |
| "loss": -0.021, |
| "reward": 4.910151720046997, |
| "reward_std": 1.064635694026947, |
| "rewards/mrr_reward": 0.2832217253744602, |
| "rewards/rank_analyze_format_reward": 0.2712905704975128, |
| "rewards/rank_answer_foramt_reward": 0.5859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.546875, |
| "epoch": 0.536, |
| "grad_norm": 0.024759415537118912, |
| "kl": 0.0009107589721679688, |
| "learning_rate": 1.999904463750632e-05, |
| "loss": 0.0076, |
| "reward": 4.854610323905945, |
| "reward_std": 1.8393707275390625, |
| "rewards/mrr_reward": 0.30915798619389534, |
| "rewards/rank_analyze_format_reward": 0.23711884673684835, |
| "rewards/rank_answer_foramt_reward": 0.576171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8203125, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.859375, |
| "epoch": 0.544, |
| "grad_norm": 0.024759415537118912, |
| "kl": 0.0008776187896728516, |
| "learning_rate": 1.999904463750632e-05, |
| "loss": -0.0246, |
| "reward": 5.42217218875885, |
| "reward_std": 1.3200950622558594, |
| "rewards/mrr_reward": 0.42010788805782795, |
| "rewards/rank_analyze_format_reward": 0.19172357022762299, |
| "rewards/rank_answer_foramt_reward": 0.654296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998641312122345, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.983016312122345, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.140625, |
| "epoch": 0.552, |
| "grad_norm": 0.02841918356716633, |
| "kl": 0.0010838508605957031, |
| "learning_rate": 1.999900958180838e-05, |
| "loss": -0.0281, |
| "reward": 5.81439483165741, |
| "reward_std": 1.740799367427826, |
| "rewards/mrr_reward": 0.5312500074505806, |
| "rewards/rank_analyze_format_reward": 0.1757229631766677, |
| "rewards/rank_answer_foramt_reward": 0.677734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 466.078125, |
| "epoch": 0.56, |
| "grad_norm": 0.02891196869313717, |
| "kl": 0.001157999038696289, |
| "learning_rate": 1.9998973894518318e-05, |
| "loss": -0.0123, |
| "reward": 5.705892205238342, |
| "reward_std": 2.0529025495052338, |
| "rewards/mrr_reward": 0.4973958432674408, |
| "rewards/rank_analyze_format_reward": 0.15696396678686142, |
| "rewards/rank_answer_foramt_reward": 0.638671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9993990361690521, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9993990361690521, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 492.296875, |
| "epoch": 0.568, |
| "grad_norm": 0.024859309196472168, |
| "kl": 0.0010205507278442383, |
| "learning_rate": 1.999893757563839e-05, |
| "loss": 0.0114, |
| "reward": 5.464065313339233, |
| "reward_std": 1.7677285969257355, |
| "rewards/mrr_reward": 0.446893610060215, |
| "rewards/rank_analyze_format_reward": 0.10556165501475334, |
| "rewards/rank_answer_foramt_reward": 0.68359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.109375, |
| "epoch": 0.576, |
| "grad_norm": 0.026754125952720642, |
| "kl": 0.001056671142578125, |
| "learning_rate": 1.9998900625170897e-05, |
| "loss": -0.0067, |
| "reward": 6.407280087471008, |
| "reward_std": 1.8228637278079987, |
| "rewards/mrr_reward": 0.5859375298023224, |
| "rewards/rank_analyze_format_reward": 0.30571743845939636, |
| "rewards/rank_answer_foramt_reward": 0.828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.484375, |
| "epoch": 0.584, |
| "grad_norm": 0.028836144134402275, |
| "kl": 0.00142669677734375, |
| "learning_rate": 1.9998863043118163e-05, |
| "loss": -0.0076, |
| "reward": 4.505983591079712, |
| "reward_std": 1.231943815946579, |
| "rewards/mrr_reward": 0.2062872126698494, |
| "rewards/rank_analyze_format_reward": 0.20817857421934605, |
| "rewards/rank_answer_foramt_reward": 0.56640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.453125, |
| "epoch": 0.592, |
| "grad_norm": 0.025052759796380997, |
| "kl": 0.0013051033020019531, |
| "learning_rate": 1.999882482948257e-05, |
| "loss": -0.0097, |
| "reward": 5.300284147262573, |
| "reward_std": 1.6650860607624054, |
| "rewards/mrr_reward": 0.38593750447034836, |
| "rewards/rank_analyze_format_reward": 0.15106541197746992, |
| "rewards/rank_answer_foramt_reward": 0.66015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.625, |
| "epoch": 0.6, |
| "grad_norm": 0.026283830404281616, |
| "kl": 0.0021805763244628906, |
| "learning_rate": 1.999878598426653e-05, |
| "loss": -0.0317, |
| "reward": 5.158125400543213, |
| "reward_std": 1.4547997415065765, |
| "rewards/mrr_reward": 0.3474578373134136, |
| "rewards/rank_analyze_format_reward": 0.2175126150250435, |
| "rewards/rank_answer_foramt_reward": 0.58203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.265625, |
| "epoch": 0.608, |
| "grad_norm": 0.027127476409077644, |
| "kl": 0.001390218734741211, |
| "learning_rate": 1.9998746507472493e-05, |
| "loss": -0.0426, |
| "reward": 5.807446002960205, |
| "reward_std": 1.929233893752098, |
| "rewards/mrr_reward": 0.4895833358168602, |
| "rewards/rank_analyze_format_reward": 0.3203737363219261, |
| "rewards/rank_answer_foramt_reward": 0.689453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9821428656578064, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.9665178656578064, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.015625, |
| "epoch": 0.616, |
| "grad_norm": 0.026926733553409576, |
| "kl": 0.001764059066772461, |
| "learning_rate": 1.999870639910296e-05, |
| "loss": -0.0223, |
| "reward": 5.370245575904846, |
| "reward_std": 1.9943826496601105, |
| "rewards/mrr_reward": 0.3968749977648258, |
| "rewards/rank_analyze_format_reward": 0.2607038579881191, |
| "rewards/rank_answer_foramt_reward": 0.607421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9924661070108414, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9924661070108414, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.015625, |
| "epoch": 0.624, |
| "grad_norm": 0.027938006445765495, |
| "kl": 0.0017654895782470703, |
| "learning_rate": 1.9998665659160453e-05, |
| "loss": -0.0188, |
| "reward": 5.412413477897644, |
| "reward_std": 1.896736979484558, |
| "rewards/mrr_reward": 0.41354167461395264, |
| "rewards/rank_analyze_format_reward": 0.24247420020401478, |
| "rewards/rank_answer_foramt_reward": 0.63671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9942144006490707, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9785894006490707, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.15625, |
| "epoch": 0.632, |
| "grad_norm": 0.024667983874678612, |
| "kl": 0.0013856887817382812, |
| "learning_rate": 1.999862428764756e-05, |
| "loss": -0.0076, |
| "reward": 6.024145722389221, |
| "reward_std": 1.524814635515213, |
| "rewards/mrr_reward": 0.5302269533276558, |
| "rewards/rank_analyze_format_reward": 0.23562652617692947, |
| "rewards/rank_answer_foramt_reward": 0.794921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9680059552192688, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.484375, |
| "epoch": 0.64, |
| "grad_norm": 0.028410576283931732, |
| "kl": 0.0016107559204101562, |
| "learning_rate": 1.9998582284566878e-05, |
| "loss": 0.0072, |
| "reward": 5.220240831375122, |
| "reward_std": 1.5586610436439514, |
| "rewards/mrr_reward": 0.35975322872400284, |
| "rewards/rank_analyze_format_reward": 0.2044668523594737, |
| "rewards/rank_answer_foramt_reward": 0.640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9954117387533188, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9954117387533188, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.9375, |
| "epoch": 0.648, |
| "grad_norm": 0.024989139288663864, |
| "kl": 0.002213001251220703, |
| "learning_rate": 1.999853964992107e-05, |
| "loss": -0.0076, |
| "reward": 5.271288990974426, |
| "reward_std": 1.666042000055313, |
| "rewards/mrr_reward": 0.3229972794651985, |
| "rewards/rank_analyze_format_reward": 0.38286497443914413, |
| "rewards/rank_answer_foramt_reward": 0.65625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.703125, |
| "epoch": 0.656, |
| "grad_norm": 0.03217592090368271, |
| "kl": 0.0023698806762695312, |
| "learning_rate": 1.9998496383712828e-05, |
| "loss": -0.0122, |
| "reward": 5.724093914031982, |
| "reward_std": 1.503628522157669, |
| "rewards/mrr_reward": 0.4970238097012043, |
| "rewards/rank_analyze_format_reward": 0.05476433038711548, |
| "rewards/rank_answer_foramt_reward": 0.75, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968671798706055, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9968671798706055, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.421875, |
| "epoch": 0.664, |
| "grad_norm": 0.026632074266672134, |
| "kl": 0.001974821090698242, |
| "learning_rate": 1.999845248594489e-05, |
| "loss": -0.0378, |
| "reward": 5.284371018409729, |
| "reward_std": 1.7509951293468475, |
| "rewards/mrr_reward": 0.37621527537703514, |
| "rewards/rank_analyze_format_reward": 0.1659046746790409, |
| "rewards/rank_answer_foramt_reward": 0.693359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9952791333198547, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9796541333198547, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.40625, |
| "epoch": 0.672, |
| "grad_norm": 0.028008421882987022, |
| "kl": 0.002154827117919922, |
| "learning_rate": 1.9998407956620017e-05, |
| "loss": -0.0174, |
| "reward": 5.500829696655273, |
| "reward_std": 1.7818693816661835, |
| "rewards/mrr_reward": 0.46015624701976776, |
| "rewards/rank_analyze_format_reward": 0.18122385442256927, |
| "rewards/rank_answer_foramt_reward": 0.69140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9836309552192688, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.9523809552192688, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.109375, |
| "epoch": 0.68, |
| "grad_norm": 0.02929595857858658, |
| "kl": 0.0015263557434082031, |
| "learning_rate": 1.9998362795741027e-05, |
| "loss": -0.0149, |
| "reward": 4.848661541938782, |
| "reward_std": 1.5195987075567245, |
| "rewards/mrr_reward": 0.27783359214663506, |
| "rewards/rank_analyze_format_reward": 0.19695308804512024, |
| "rewards/rank_answer_foramt_reward": 0.658203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9801479876041412, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9645229876041412, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.03125, |
| "epoch": 0.688, |
| "grad_norm": 0.028656957671046257, |
| "kl": 0.0018339157104492188, |
| "learning_rate": 1.9998317003310775e-05, |
| "loss": 0.0018, |
| "reward": 6.04072630405426, |
| "reward_std": 1.6347778737545013, |
| "rewards/mrr_reward": 0.5263826847076416, |
| "rewards/rank_analyze_format_reward": 0.2711330959573388, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.953125, |
| "epoch": 0.696, |
| "grad_norm": 0.0294534619897604, |
| "kl": 0.0030837059020996094, |
| "learning_rate": 1.9998270579332154e-05, |
| "loss": -0.0213, |
| "reward": 5.602773904800415, |
| "reward_std": 1.9321411848068237, |
| "rewards/mrr_reward": 0.45494791865348816, |
| "rewards/rank_analyze_format_reward": 0.22951603773981333, |
| "rewards/rank_answer_foramt_reward": 0.65234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9818111509084702, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.953125, |
| "epoch": 0.704, |
| "grad_norm": 0.02795729972422123, |
| "kl": 0.0021820068359375, |
| "learning_rate": 1.9998223523808092e-05, |
| "loss": -0.005, |
| "reward": 5.259730100631714, |
| "reward_std": 1.7037486732006073, |
| "rewards/mrr_reward": 0.384002972394228, |
| "rewards/rank_analyze_format_reward": 0.18732355255633593, |
| "rewards/rank_answer_foramt_reward": 0.642578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9976895451545715, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.9976895451545715, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 572.234375, |
| "epoch": 0.712, |
| "grad_norm": 0.025803212076425552, |
| "kl": 0.0023365020751953125, |
| "learning_rate": 1.9998175836741564e-05, |
| "loss": -0.0233, |
| "reward": 5.643940687179565, |
| "reward_std": 2.12572318315506, |
| "rewards/mrr_reward": 0.41141493432223797, |
| "rewards/rank_analyze_format_reward": 0.43535757809877396, |
| "rewards/rank_answer_foramt_reward": 0.65234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.875, |
| "epoch": 0.72, |
| "grad_norm": 0.027814343571662903, |
| "kl": 0.0020236968994140625, |
| "learning_rate": 1.999812751813558e-05, |
| "loss": -0.051, |
| "reward": 5.96042013168335, |
| "reward_std": 1.2770089283585548, |
| "rewards/mrr_reward": 0.4757130518555641, |
| "rewards/rank_analyze_format_reward": 0.3149571679532528, |
| "rewards/rank_answer_foramt_reward": 0.79296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.8125, |
| "epoch": 0.728, |
| "grad_norm": 0.031177863478660583, |
| "kl": 0.002357959747314453, |
| "learning_rate": 1.9998078567993197e-05, |
| "loss": -0.0346, |
| "reward": 5.881357431411743, |
| "reward_std": 1.7492572218179703, |
| "rewards/mrr_reward": 0.5347842201590538, |
| "rewards/rank_analyze_format_reward": 0.09312985371798277, |
| "rewards/rank_answer_foramt_reward": 0.7265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9807952791452408, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.15625, |
| "epoch": 0.736, |
| "grad_norm": 0.028669161722064018, |
| "kl": 0.002231597900390625, |
| "learning_rate": 1.9998028986317504e-05, |
| "loss": -0.0145, |
| "reward": 5.656317114830017, |
| "reward_std": 1.6166883707046509, |
| "rewards/mrr_reward": 0.44843751192092896, |
| "rewards/rank_analyze_format_reward": 0.20554364286363125, |
| "rewards/rank_answer_foramt_reward": 0.76953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9828085899353027, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.484375, |
| "epoch": 0.744, |
| "grad_norm": 0.027386236935853958, |
| "kl": 0.0021200180053710938, |
| "learning_rate": 1.999797877311163e-05, |
| "loss": -0.0246, |
| "reward": 5.928924918174744, |
| "reward_std": 1.48914834856987, |
| "rewards/mrr_reward": 0.4644097238779068, |
| "rewards/rank_analyze_format_reward": 0.3623017445206642, |
| "rewards/rank_answer_foramt_reward": 0.802734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.1875, |
| "epoch": 0.752, |
| "grad_norm": 0.0282078068703413, |
| "kl": 0.003062725067138672, |
| "learning_rate": 1.9997927928378753e-05, |
| "loss": 0.0186, |
| "reward": 6.396650433540344, |
| "reward_std": 1.9676957428455353, |
| "rewards/mrr_reward": 0.6083333343267441, |
| "rewards/rank_analyze_format_reward": 0.25528283044695854, |
| "rewards/rank_answer_foramt_reward": 0.76171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965953528881073, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9965953528881073, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.390625, |
| "epoch": 0.76, |
| "grad_norm": 0.03063831850886345, |
| "kl": 0.002585887908935547, |
| "learning_rate": 1.999787645212208e-05, |
| "loss": -0.0102, |
| "reward": 6.281728744506836, |
| "reward_std": 1.7576136887073517, |
| "rewards/mrr_reward": 0.5565538108348846, |
| "rewards/rank_analyze_format_reward": 0.30067696794867516, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.171875, |
| "epoch": 0.768, |
| "grad_norm": 0.02829769253730774, |
| "kl": 0.0035347938537597656, |
| "learning_rate": 1.999782434434486e-05, |
| "loss": 0.0108, |
| "reward": 5.318088173866272, |
| "reward_std": 1.6170280575752258, |
| "rewards/mrr_reward": 0.3567398265004158, |
| "rewards/rank_analyze_format_reward": 0.24012142419815063, |
| "rewards/rank_answer_foramt_reward": 0.708984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.703125, |
| "epoch": 0.776, |
| "grad_norm": 0.026142382994294167, |
| "kl": 0.002566814422607422, |
| "learning_rate": 1.999777160505039e-05, |
| "loss": -0.0223, |
| "reward": 5.818326234817505, |
| "reward_std": 1.489253669977188, |
| "rewards/mrr_reward": 0.45615699887275696, |
| "rewards/rank_analyze_format_reward": 0.23597807995975018, |
| "rewards/rank_answer_foramt_reward": 0.822265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9872584789991379, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9872584789991379, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 513.171875, |
| "epoch": 0.784, |
| "grad_norm": 0.03225073963403702, |
| "kl": 0.0032796859741210938, |
| "learning_rate": 1.9997718234242e-05, |
| "loss": -0.0376, |
| "reward": 5.64834451675415, |
| "reward_std": 1.8088513016700745, |
| "rewards/mrr_reward": 0.4361979216337204, |
| "rewards/rank_analyze_format_reward": 0.2531745582818985, |
| "rewards/rank_answer_foramt_reward": 0.6953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9931579083204269, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9931579083204269, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 513.203125, |
| "epoch": 0.792, |
| "grad_norm": 0.028324192389845848, |
| "kl": 0.002949237823486328, |
| "learning_rate": 1.999766423192306e-05, |
| "loss": -0.0073, |
| "reward": 5.801540851593018, |
| "reward_std": 1.364225059747696, |
| "rewards/mrr_reward": 0.45811013877391815, |
| "rewards/rank_analyze_format_reward": 0.2562095895409584, |
| "rewards/rank_answer_foramt_reward": 0.736328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.625, |
| "epoch": 0.8, |
| "grad_norm": 0.03095146454870701, |
| "kl": 0.0033349990844726562, |
| "learning_rate": 1.9997609598096982e-05, |
| "loss": -0.0571, |
| "reward": 5.498512506484985, |
| "reward_std": 1.5701228380203247, |
| "rewards/mrr_reward": 0.38402778655290604, |
| "rewards/rank_analyze_format_reward": 0.31815899908542633, |
| "rewards/rank_answer_foramt_reward": 0.720703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9891133904457092, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9734883904457092, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.75, |
| "epoch": 0.808, |
| "grad_norm": 0.030804021283984184, |
| "kl": 0.003856658935546875, |
| "learning_rate": 1.9997554332767214e-05, |
| "loss": -0.0226, |
| "reward": 6.090959072113037, |
| "reward_std": 1.7257481813430786, |
| "rewards/mrr_reward": 0.5503038242459297, |
| "rewards/rank_analyze_format_reward": 0.20266878511756659, |
| "rewards/rank_answer_foramt_reward": 0.748046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9929515719413757, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9929515719413757, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.828125, |
| "epoch": 0.816, |
| "grad_norm": 0.03461439907550812, |
| "kl": 0.0033884048461914062, |
| "learning_rate": 1.9997498435937254e-05, |
| "loss": -0.0362, |
| "reward": 5.366485238075256, |
| "reward_std": 1.3259476721286774, |
| "rewards/mrr_reward": 0.3723524361848831, |
| "rewards/rank_analyze_format_reward": 0.1968497335910797, |
| "rewards/rank_answer_foramt_reward": 0.732421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9973393976688385, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.84375, |
| "epoch": 0.824, |
| "grad_norm": 0.028838761150836945, |
| "kl": 0.0028543472290039062, |
| "learning_rate": 1.9997441907610624e-05, |
| "loss": -0.0262, |
| "reward": 5.746440768241882, |
| "reward_std": 1.2651481330394745, |
| "rewards/mrr_reward": 0.41945064067840576, |
| "rewards/rank_analyze_format_reward": 0.30614617466926575, |
| "rewards/rank_answer_foramt_reward": 0.828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.390625, |
| "epoch": 0.832, |
| "grad_norm": 0.03118244931101799, |
| "kl": 0.0032520294189453125, |
| "learning_rate": 1.9997384747790903e-05, |
| "loss": -0.0115, |
| "reward": 5.606603145599365, |
| "reward_std": 1.3689128905534744, |
| "rewards/mrr_reward": 0.4183097779750824, |
| "rewards/rank_analyze_format_reward": 0.1789928413927555, |
| "rewards/rank_answer_foramt_reward": 0.841796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.140625, |
| "epoch": 0.84, |
| "grad_norm": 0.0289431344717741, |
| "kl": 0.004002094268798828, |
| "learning_rate": 1.9997326956481693e-05, |
| "loss": 0.0299, |
| "reward": 5.412080824375153, |
| "reward_std": 1.5776411294937134, |
| "rewards/mrr_reward": 0.40980901941657066, |
| "rewards/rank_analyze_format_reward": 0.2225375398993492, |
| "rewards/rank_answer_foramt_reward": 0.6484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9704661071300507, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9704661071300507, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.625, |
| "epoch": 0.848, |
| "grad_norm": 0.02917851321399212, |
| "kl": 0.003693103790283203, |
| "learning_rate": 1.999726853368665e-05, |
| "loss": -0.0132, |
| "reward": 6.237439870834351, |
| "reward_std": 1.6906473636627197, |
| "rewards/mrr_reward": 0.5492559522390366, |
| "rewards/rank_analyze_format_reward": 0.32810740265995264, |
| "rewards/rank_answer_foramt_reward": 0.744140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9958027005195618, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9958027005195618, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.84375, |
| "epoch": 0.856, |
| "grad_norm": 0.02737216092646122, |
| "kl": 0.0034399032592773438, |
| "learning_rate": 1.9997209479409464e-05, |
| "loss": -0.0087, |
| "reward": 5.876426458358765, |
| "reward_std": 1.4228278696537018, |
| "rewards/mrr_reward": 0.4435453861951828, |
| "rewards/rank_analyze_format_reward": 0.38482026010751724, |
| "rewards/rank_answer_foramt_reward": 0.79296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9895716160535812, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9895716160535812, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.390625, |
| "epoch": 0.864, |
| "grad_norm": 0.028911437839269638, |
| "kl": 0.0033540725708007812, |
| "learning_rate": 1.9997149793653862e-05, |
| "loss": -0.0094, |
| "reward": 6.699026107788086, |
| "reward_std": 1.3455817177891731, |
| "rewards/mrr_reward": 0.6710069477558136, |
| "rewards/rank_analyze_format_reward": 0.18851793929934502, |
| "rewards/rank_answer_foramt_reward": 0.8359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.1875, |
| "epoch": 0.872, |
| "grad_norm": 0.03616398200392723, |
| "kl": 0.004034519195556641, |
| "learning_rate": 1.9997089476423617e-05, |
| "loss": 0.0287, |
| "reward": 6.017909646034241, |
| "reward_std": 1.8136086165904999, |
| "rewards/mrr_reward": 0.5158420130610466, |
| "rewards/rank_analyze_format_reward": 0.26905644312500954, |
| "rewards/rank_answer_foramt_reward": 0.74609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9853207767009735, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9853207767009735, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 513.421875, |
| "epoch": 0.88, |
| "grad_norm": 0.029927760362625122, |
| "kl": 0.003938198089599609, |
| "learning_rate": 1.999702852772254e-05, |
| "loss": 0.0003, |
| "reward": 5.642710447311401, |
| "reward_std": 1.466000735759735, |
| "rewards/mrr_reward": 0.3956349194049835, |
| "rewards/rank_analyze_format_reward": 0.3304584436118603, |
| "rewards/rank_answer_foramt_reward": 0.814453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.981067106127739, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.981067106127739, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.78125, |
| "epoch": 0.888, |
| "grad_norm": 0.03488059714436531, |
| "kl": 0.0036568641662597656, |
| "learning_rate": 1.9996966947554476e-05, |
| "loss": -0.0217, |
| "reward": 6.343585729598999, |
| "reward_std": 1.7120259702205658, |
| "rewards/mrr_reward": 0.5808779820799828, |
| "rewards/rank_analyze_format_reward": 0.3235646188259125, |
| "rewards/rank_answer_foramt_reward": 0.75, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.996692106127739, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.981067106127739, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.9375, |
| "epoch": 0.896, |
| "grad_norm": 0.0328449085354805, |
| "kl": 0.0038404464721679688, |
| "learning_rate": 1.9996904735923325e-05, |
| "loss": -0.0289, |
| "reward": 6.122893452644348, |
| "reward_std": 1.4696560502052307, |
| "rewards/mrr_reward": 0.5161458402872086, |
| "rewards/rank_analyze_format_reward": 0.31539197266101837, |
| "rewards/rank_answer_foramt_reward": 0.806640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9915762841701508, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9915762841701508, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.578125, |
| "epoch": 0.904, |
| "grad_norm": 0.031444139778614044, |
| "kl": 0.0041828155517578125, |
| "learning_rate": 1.9996841892833e-05, |
| "loss": -0.0134, |
| "reward": 6.206910610198975, |
| "reward_std": 1.6045927107334137, |
| "rewards/mrr_reward": 0.5138206705451012, |
| "rewards/rank_analyze_format_reward": 0.36523886024951935, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9947571158409119, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9947571158409119, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.28125, |
| "epoch": 0.912, |
| "grad_norm": 0.034729719161987305, |
| "kl": 0.004141807556152344, |
| "learning_rate": 1.9996778418287486e-05, |
| "loss": 0.0052, |
| "reward": 5.282190442085266, |
| "reward_std": 1.5597249567508698, |
| "rewards/mrr_reward": 0.35017360746860504, |
| "rewards/rank_analyze_format_reward": 0.25228141248226166, |
| "rewards/rank_answer_foramt_reward": 0.685546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.9375, |
| "epoch": 0.92, |
| "grad_norm": 0.03240945562720299, |
| "kl": 0.003923892974853516, |
| "learning_rate": 1.9996714312290784e-05, |
| "loss": -0.0297, |
| "reward": 5.8050724267959595, |
| "reward_std": 1.4556776583194733, |
| "rewards/mrr_reward": 0.4047433137893677, |
| "rewards/rank_analyze_format_reward": 0.3966461531817913, |
| "rewards/rank_answer_foramt_reward": 0.830078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992187470197678, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9992187470197678, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 545.4375, |
| "epoch": 0.928, |
| "grad_norm": 0.033669959753751755, |
| "kl": 0.0044651031494140625, |
| "learning_rate": 1.9996649574846948e-05, |
| "loss": -0.0214, |
| "reward": 6.237725496292114, |
| "reward_std": 1.6311175972223282, |
| "rewards/mrr_reward": 0.5039434656500816, |
| "rewards/rank_analyze_format_reward": 0.40749866887927055, |
| "rewards/rank_answer_foramt_reward": 0.830078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.984375, |
| "epoch": 0.936, |
| "grad_norm": 0.03145231306552887, |
| "kl": 0.004774570465087891, |
| "learning_rate": 1.9996584205960063e-05, |
| "loss": -0.0014, |
| "reward": 5.562940955162048, |
| "reward_std": 1.5884797871112823, |
| "rewards/mrr_reward": 0.40140748769044876, |
| "rewards/rank_analyze_format_reward": 0.33169008791446686, |
| "rewards/rank_answer_foramt_reward": 0.689453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9954276382923126, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9798026382923126, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.875, |
| "epoch": 0.944, |
| "grad_norm": 0.035210635513067245, |
| "kl": 0.0044097900390625, |
| "learning_rate": 1.999651820563426e-05, |
| "loss": -0.0421, |
| "reward": 5.673676252365112, |
| "reward_std": 1.3604719787836075, |
| "rewards/mrr_reward": 0.3857142850756645, |
| "rewards/rank_analyze_format_reward": 0.39724233001470566, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9956946671009064, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9800696671009064, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.984375, |
| "epoch": 0.952, |
| "grad_norm": 0.03147454559803009, |
| "kl": 0.0076541900634765625, |
| "learning_rate": 1.999645157387371e-05, |
| "loss": -0.0133, |
| "reward": 6.33061408996582, |
| "reward_std": 1.298683062195778, |
| "rewards/mrr_reward": 0.553689256310463, |
| "rewards/rank_analyze_format_reward": 0.3629737161099911, |
| "rewards/rank_answer_foramt_reward": 0.787109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.703125, |
| "epoch": 0.96, |
| "grad_norm": 0.031304825097322464, |
| "kl": 0.004342079162597656, |
| "learning_rate": 1.9996384310682615e-05, |
| "loss": -0.0365, |
| "reward": 5.393260598182678, |
| "reward_std": 1.5107265412807465, |
| "rewards/mrr_reward": 0.31743552163243294, |
| "rewards/rank_analyze_format_reward": 0.3929348886013031, |
| "rewards/rank_answer_foramt_reward": 0.763671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.890625, |
| "epoch": 0.968, |
| "grad_norm": 0.037323277443647385, |
| "kl": 0.004292488098144531, |
| "learning_rate": 1.999631641606523e-05, |
| "loss": -0.0058, |
| "reward": 6.189559578895569, |
| "reward_std": 1.2323561608791351, |
| "rewards/mrr_reward": 0.5643229335546494, |
| "rewards/rank_analyze_format_reward": 0.11282643768936396, |
| "rewards/rank_answer_foramt_reward": 0.87109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9819862246513367, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9976112246513367, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 545.25, |
| "epoch": 0.976, |
| "grad_norm": 0.034240033477544785, |
| "kl": 0.005130767822265625, |
| "learning_rate": 1.9996247890025845e-05, |
| "loss": -0.0263, |
| "reward": 5.799539566040039, |
| "reward_std": 1.6965691149234772, |
| "rewards/mrr_reward": 0.4192212335765362, |
| "rewards/rank_analyze_format_reward": 0.41699668765068054, |
| "rewards/rank_answer_foramt_reward": 0.748046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.9375, |
| "epoch": 0.984, |
| "grad_norm": 0.03087456338107586, |
| "kl": 0.004132270812988281, |
| "learning_rate": 1.9996178732568784e-05, |
| "loss": -0.0128, |
| "reward": 5.433954238891602, |
| "reward_std": 1.3873755782842636, |
| "rewards/mrr_reward": 0.35366444662213326, |
| "rewards/rank_analyze_format_reward": 0.3041442818939686, |
| "rewards/rank_answer_foramt_reward": 0.802734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 548.90625, |
| "epoch": 0.992, |
| "grad_norm": 0.03247498720884323, |
| "kl": 0.004190921783447266, |
| "learning_rate": 1.9996108943698412e-05, |
| "loss": -0.02, |
| "reward": 6.039711356163025, |
| "reward_std": 1.745398223400116, |
| "rewards/mrr_reward": 0.49064359068870544, |
| "rewards/rank_analyze_format_reward": 0.3508656769990921, |
| "rewards/rank_answer_foramt_reward": 0.734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959480613470078, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9959480613470078, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.75, |
| "epoch": 1.0, |
| "grad_norm": 0.031171714887022972, |
| "kl": 0.00469207763671875, |
| "learning_rate": 1.9996038523419148e-05, |
| "loss": -0.0226, |
| "reward": 5.955172896385193, |
| "reward_std": 1.2524618208408356, |
| "rewards/mrr_reward": 0.460627481341362, |
| "rewards/rank_analyze_format_reward": 0.33911067247390747, |
| "rewards/rank_answer_foramt_reward": 0.806640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.53125, |
| "epoch": 1.008, |
| "grad_norm": 0.03288634493947029, |
| "kl": 0.00435638427734375, |
| "learning_rate": 1.9995967471735433e-05, |
| "loss": -0.0184, |
| "reward": 6.16729462146759, |
| "reward_std": 1.4908590912818909, |
| "rewards/mrr_reward": 0.5096540227532387, |
| "rewards/rank_analyze_format_reward": 0.39567676931619644, |
| "rewards/rank_answer_foramt_reward": 0.775390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.65625, |
| "epoch": 1.016, |
| "grad_norm": 0.03470674157142639, |
| "kl": 0.005002021789550781, |
| "learning_rate": 1.9995895788651753e-05, |
| "loss": -0.0254, |
| "reward": 6.5735520124435425, |
| "reward_std": 1.469813510775566, |
| "rewards/mrr_reward": 0.621657982468605, |
| "rewards/rank_analyze_format_reward": 0.31275077164173126, |
| "rewards/rank_answer_foramt_reward": 0.818359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.34375, |
| "epoch": 1.024, |
| "grad_norm": 0.03493339568376541, |
| "kl": 0.004825592041015625, |
| "learning_rate": 1.9995823474172644e-05, |
| "loss": -0.0097, |
| "reward": 5.744642496109009, |
| "reward_std": 1.9137973487377167, |
| "rewards/mrr_reward": 0.4302021265029907, |
| "rewards/rank_analyze_format_reward": 0.3747362494468689, |
| "rewards/rank_answer_foramt_reward": 0.6953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9925176054239273, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9925176054239273, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 522.65625, |
| "epoch": 1.032, |
| "grad_norm": 0.03346191346645355, |
| "kl": 0.0044116973876953125, |
| "learning_rate": 1.9995750528302668e-05, |
| "loss": -0.0069, |
| "reward": 6.34830904006958, |
| "reward_std": 1.5635737180709839, |
| "rewards/mrr_reward": 0.5352182611823082, |
| "rewards/rank_analyze_format_reward": 0.3498992621898651, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.90625, |
| "epoch": 1.04, |
| "grad_norm": 0.027642706409096718, |
| "kl": 0.0034575462341308594, |
| "learning_rate": 1.999567695104643e-05, |
| "loss": -0.0083, |
| "reward": 6.863049745559692, |
| "reward_std": 0.995637645944953, |
| "rewards/mrr_reward": 0.6565104126930237, |
| "rewards/rank_analyze_format_reward": 0.3502893391996622, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.265625, |
| "epoch": 1.048, |
| "grad_norm": 0.03320496901869774, |
| "kl": 0.00482177734375, |
| "learning_rate": 1.9995602742408584e-05, |
| "loss": -0.0297, |
| "reward": 5.561194658279419, |
| "reward_std": 1.0441379398107529, |
| "rewards/mrr_reward": 0.3508804552257061, |
| "rewards/rank_analyze_format_reward": 0.3411516472697258, |
| "rewards/rank_answer_foramt_reward": 0.857421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 538.40625, |
| "epoch": 1.056, |
| "grad_norm": 0.03059810772538185, |
| "kl": 0.004889488220214844, |
| "learning_rate": 1.9995527902393814e-05, |
| "loss": -0.031, |
| "reward": 6.096472501754761, |
| "reward_std": 1.3641002774238586, |
| "rewards/mrr_reward": 0.4807477816939354, |
| "rewards/rank_analyze_format_reward": 0.329423014074564, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 530.125, |
| "epoch": 1.064, |
| "grad_norm": 0.03329962119460106, |
| "kl": 0.004602909088134766, |
| "learning_rate": 1.9995452431006844e-05, |
| "loss": -0.0196, |
| "reward": 5.331088542938232, |
| "reward_std": 0.830422654747963, |
| "rewards/mrr_reward": 0.3011222556233406, |
| "rewards/rank_analyze_format_reward": 0.27261858060956, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 546.765625, |
| "epoch": 1.072, |
| "grad_norm": 0.03147532418370247, |
| "kl": 0.004273891448974609, |
| "learning_rate": 1.999537632825245e-05, |
| "loss": -0.0228, |
| "reward": 5.826651930809021, |
| "reward_std": 1.0173790007829666, |
| "rewards/mrr_reward": 0.4183593764901161, |
| "rewards/rank_analyze_format_reward": 0.37596380710601807, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.997023805975914, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.965773805975914, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.265625, |
| "epoch": 1.08, |
| "grad_norm": 0.03314289450645447, |
| "kl": 0.0045490264892578125, |
| "learning_rate": 1.9995299594135434e-05, |
| "loss": -0.0181, |
| "reward": 6.364633798599243, |
| "reward_std": 1.2888767421245575, |
| "rewards/mrr_reward": 0.5286644101142883, |
| "rewards/rank_analyze_format_reward": 0.40576110780239105, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.515625, |
| "epoch": 1.088, |
| "grad_norm": 0.03256648778915405, |
| "kl": 0.005957603454589844, |
| "learning_rate": 1.999522222866064e-05, |
| "loss": -0.0253, |
| "reward": 6.410487055778503, |
| "reward_std": 1.080582246184349, |
| "rewards/mrr_reward": 0.5400917902588844, |
| "rewards/rank_analyze_format_reward": 0.3477761074900627, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.78125, |
| "epoch": 1.096, |
| "grad_norm": 0.03460180386900902, |
| "kl": 0.012660980224609375, |
| "learning_rate": 1.999514423183296e-05, |
| "loss": -0.0144, |
| "reward": 6.023637771606445, |
| "reward_std": 1.5978916585445404, |
| "rewards/mrr_reward": 0.44875992834568024, |
| "rewards/rank_analyze_format_reward": 0.44508665800094604, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.0, |
| "epoch": 1.104, |
| "grad_norm": 0.030293360352516174, |
| "kl": 0.005436897277832031, |
| "learning_rate": 1.9995065603657317e-05, |
| "loss": -0.0128, |
| "reward": 6.0460041761398315, |
| "reward_std": 1.0893033295869827, |
| "rewards/mrr_reward": 0.48198164254426956, |
| "rewards/rank_analyze_format_reward": 0.2604257594794035, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 560.9375, |
| "epoch": 1.112, |
| "grad_norm": 0.034913912415504456, |
| "kl": 0.0055980682373046875, |
| "learning_rate": 1.999498634413868e-05, |
| "loss": -0.009, |
| "reward": 6.033616900444031, |
| "reward_std": 1.7777451276779175, |
| "rewards/mrr_reward": 0.4813368245959282, |
| "rewards/rank_analyze_format_reward": 0.3849602974951267, |
| "rewards/rank_answer_foramt_reward": 0.74609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.21875, |
| "epoch": 1.12, |
| "grad_norm": 0.03727564588189125, |
| "kl": 0.004870414733886719, |
| "learning_rate": 1.9994906453282055e-05, |
| "loss": -0.0243, |
| "reward": 6.689180135726929, |
| "reward_std": 1.181299865245819, |
| "rewards/mrr_reward": 0.6408420205116272, |
| "rewards/rank_analyze_format_reward": 0.2992168888449669, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 556.578125, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.035206038504838943, |
| "kl": 0.006304740905761719, |
| "learning_rate": 1.9994825931092486e-05, |
| "loss": -0.0367, |
| "reward": 6.255677342414856, |
| "reward_std": 1.9543142914772034, |
| "rewards/mrr_reward": 0.5096974149346352, |
| "rewards/rank_analyze_format_reward": 0.4908938556909561, |
| "rewards/rank_answer_foramt_reward": 0.75390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9938564151525497, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9938564151525497, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.640625, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.031349923461675644, |
| "kl": 0.005738258361816406, |
| "learning_rate": 1.9994744777575064e-05, |
| "loss": 0.0027, |
| "reward": 6.044549226760864, |
| "reward_std": 1.1382797956466675, |
| "rewards/mrr_reward": 0.48732637614011765, |
| "rewards/rank_analyze_format_reward": 0.32322094589471817, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9817143976688385, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9817143976688385, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 566.484375, |
| "epoch": 1.144, |
| "grad_norm": 0.03165869414806366, |
| "kl": 0.00574493408203125, |
| "learning_rate": 1.999466299273491e-05, |
| "loss": 0.0042, |
| "reward": 6.553520321846008, |
| "reward_std": 1.627190262079239, |
| "rewards/mrr_reward": 0.5967448204755783, |
| "rewards/rank_analyze_format_reward": 0.44682280719280243, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9907185882329941, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9438435882329941, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.203125, |
| "epoch": 1.152, |
| "grad_norm": 0.03519825637340546, |
| "kl": 0.0061855316162109375, |
| "learning_rate": 1.9994580576577193e-05, |
| "loss": -0.0129, |
| "reward": 5.729455947875977, |
| "reward_std": 1.3913188576698303, |
| "rewards/mrr_reward": 0.41431671380996704, |
| "rewards/rank_analyze_format_reward": 0.32957829907536507, |
| "rewards/rank_answer_foramt_reward": 0.76171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.3125, |
| "epoch": 1.16, |
| "grad_norm": 0.03151266649365425, |
| "kl": 0.006320953369140625, |
| "learning_rate": 1.9994497529107118e-05, |
| "loss": -0.0148, |
| "reward": 6.072369456291199, |
| "reward_std": 1.2223908305168152, |
| "rewards/mrr_reward": 0.4943700544536114, |
| "rewards/rank_analyze_format_reward": 0.23118487000465393, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.890625, |
| "epoch": 1.168, |
| "grad_norm": 0.03530154004693031, |
| "kl": 0.005878448486328125, |
| "learning_rate": 1.999441385032993e-05, |
| "loss": -0.0308, |
| "reward": 6.625038385391235, |
| "reward_std": 1.2249456346035004, |
| "rewards/mrr_reward": 0.6044022962450981, |
| "rewards/rank_analyze_format_reward": 0.34935425966978073, |
| "rewards/rank_answer_foramt_reward": 0.880859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.6875, |
| "epoch": 1.176, |
| "grad_norm": 0.035086773335933685, |
| "kl": 0.007404327392578125, |
| "learning_rate": 1.9994329540250918e-05, |
| "loss": -0.0321, |
| "reward": 6.542271018028259, |
| "reward_std": 1.3827645033597946, |
| "rewards/mrr_reward": 0.5946986712515354, |
| "rewards/rank_analyze_format_reward": 0.292152963578701, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.8125, |
| "epoch": 1.184, |
| "grad_norm": 0.032674577087163925, |
| "kl": 0.00643157958984375, |
| "learning_rate": 1.99942445988754e-05, |
| "loss": -0.033, |
| "reward": 6.091751337051392, |
| "reward_std": 1.2941071689128876, |
| "rewards/mrr_reward": 0.479445680975914, |
| "rewards/rank_analyze_format_reward": 0.35560934245586395, |
| "rewards/rank_answer_foramt_reward": 0.818359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.0625, |
| "epoch": 1.192, |
| "grad_norm": 0.03282872214913368, |
| "kl": 0.005850791931152344, |
| "learning_rate": 1.999415902620875e-05, |
| "loss": -0.025, |
| "reward": 6.667526364326477, |
| "reward_std": 1.0924562439322472, |
| "rewards/mrr_reward": 0.604253463447094, |
| "rewards/rank_analyze_format_reward": 0.3833247348666191, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.640625, |
| "epoch": 1.2, |
| "grad_norm": 0.032991521060466766, |
| "kl": 0.0060176849365234375, |
| "learning_rate": 1.999407282225637e-05, |
| "loss": 0.0052, |
| "reward": 5.798678278923035, |
| "reward_std": 1.1788080930709839, |
| "rewards/mrr_reward": 0.402951393276453, |
| "rewards/rank_analyze_format_reward": 0.38478153944015503, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9938189834356308, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9781939834356308, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.6875, |
| "epoch": 1.208, |
| "grad_norm": 0.02944212593138218, |
| "kl": 0.005663871765136719, |
| "learning_rate": 1.9993985987023703e-05, |
| "loss": -0.0115, |
| "reward": 6.4621899127960205, |
| "reward_std": 1.2538374364376068, |
| "rewards/mrr_reward": 0.5166728720068932, |
| "rewards/rank_analyze_format_reward": 0.4970608651638031, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.15625, |
| "epoch": 1.216, |
| "grad_norm": 0.03144199773669243, |
| "kl": 0.005504608154296875, |
| "learning_rate": 1.9993898520516233e-05, |
| "loss": 0.0178, |
| "reward": 7.210927963256836, |
| "reward_std": 1.4847297072410583, |
| "rewards/mrr_reward": 0.7067708373069763, |
| "rewards/rank_analyze_format_reward": 0.5710361748933792, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.125, |
| "epoch": 1.224, |
| "grad_norm": 0.03476106375455856, |
| "kl": 0.006566047668457031, |
| "learning_rate": 1.9993810422739496e-05, |
| "loss": -0.0255, |
| "reward": 5.501855969429016, |
| "reward_std": 1.1074179112911224, |
| "rewards/mrr_reward": 0.2869729772210121, |
| "rewards/rank_analyze_format_reward": 0.5554128363728523, |
| "rewards/rank_answer_foramt_reward": 0.8359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9969318807125092, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9969318807125092, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.140625, |
| "epoch": 1.232, |
| "grad_norm": 0.0336473323404789, |
| "kl": 0.006374359130859375, |
| "learning_rate": 1.999372169369904e-05, |
| "loss": -0.0304, |
| "reward": 7.210146188735962, |
| "reward_std": 1.3585944771766663, |
| "rewards/mrr_reward": 0.7406249940395355, |
| "rewards/rank_analyze_format_reward": 0.4076874777674675, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.015625, |
| "epoch": 1.24, |
| "grad_norm": 0.03243670612573624, |
| "kl": 0.0064258575439453125, |
| "learning_rate": 1.999363233340048e-05, |
| "loss": 0.0124, |
| "reward": 6.775290489196777, |
| "reward_std": 1.6960014998912811, |
| "rewards/mrr_reward": 0.6491319388151169, |
| "rewards/rank_analyze_format_reward": 0.3369657965376973, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.671875, |
| "epoch": 1.248, |
| "grad_norm": 0.03587043285369873, |
| "kl": 0.008923530578613281, |
| "learning_rate": 1.9993542341849462e-05, |
| "loss": -0.0172, |
| "reward": 6.484335541725159, |
| "reward_std": 1.4846598207950592, |
| "rewards/mrr_reward": 0.545331098139286, |
| "rewards/rank_analyze_format_reward": 0.5122500844299793, |
| "rewards/rank_answer_foramt_reward": 0.822265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959664940834045, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9959664940834045, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.015625, |
| "epoch": 1.256, |
| "grad_norm": 0.03247671574354172, |
| "kl": 0.006220817565917969, |
| "learning_rate": 1.9993451719051663e-05, |
| "loss": -0.0057, |
| "reward": 6.91133987903595, |
| "reward_std": 1.076777160167694, |
| "rewards/mrr_reward": 0.6208333373069763, |
| "rewards/rank_analyze_format_reward": 0.4648074358701706, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.0, |
| "epoch": 1.264, |
| "grad_norm": 0.033995021134614944, |
| "kl": 0.006260871887207031, |
| "learning_rate": 1.999336046501281e-05, |
| "loss": -0.0107, |
| "reward": 6.491420269012451, |
| "reward_std": 1.1248966604471207, |
| "rewards/mrr_reward": 0.5537760369479656, |
| "rewards/rank_analyze_format_reward": 0.38733571022748947, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.09375, |
| "epoch": 1.272, |
| "grad_norm": 0.03796149790287018, |
| "kl": 0.008052825927734375, |
| "learning_rate": 1.999326857973867e-05, |
| "loss": -0.0482, |
| "reward": 7.359132528305054, |
| "reward_std": 1.4535967111587524, |
| "rewards/mrr_reward": 0.7499999850988388, |
| "rewards/rank_analyze_format_reward": 0.4489763230085373, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.671875, |
| "epoch": 1.28, |
| "grad_norm": 0.036241207271814346, |
| "kl": 0.0068416595458984375, |
| "learning_rate": 1.9993176063235046e-05, |
| "loss": -0.0176, |
| "reward": 7.0445317029953, |
| "reward_std": 1.626471757888794, |
| "rewards/mrr_reward": 0.662822425365448, |
| "rewards/rank_analyze_format_reward": 0.5297309085726738, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 579.453125, |
| "epoch": 1.288, |
| "grad_norm": 0.037075724452733994, |
| "kl": 0.006366729736328125, |
| "learning_rate": 1.9993082915507776e-05, |
| "loss": -0.0144, |
| "reward": 6.376061797142029, |
| "reward_std": 1.2733474969863892, |
| "rewards/mrr_reward": 0.5289496555924416, |
| "rewards/rank_analyze_format_reward": 0.42237265408039093, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.296875, |
| "epoch": 1.296, |
| "grad_norm": 0.03217633441090584, |
| "kl": 0.006763458251953125, |
| "learning_rate": 1.999298913656275e-05, |
| "loss": -0.0085, |
| "reward": 6.5947242975234985, |
| "reward_std": 1.2779672592878342, |
| "rewards/mrr_reward": 0.5658544301986694, |
| "rewards/rank_analyze_format_reward": 0.4991602599620819, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.328125, |
| "epoch": 1.304, |
| "grad_norm": 0.03425245359539986, |
| "kl": 0.007500648498535156, |
| "learning_rate": 1.9992894726405894e-05, |
| "loss": -0.0124, |
| "reward": 6.557482957839966, |
| "reward_std": 1.4475017786026, |
| "rewards/mrr_reward": 0.5980902686715126, |
| "rewards/rank_analyze_format_reward": 0.2979341112077236, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 564.59375, |
| "epoch": 1.312, |
| "grad_norm": 0.03478895127773285, |
| "kl": 0.007842063903808594, |
| "learning_rate": 1.9992799685043165e-05, |
| "loss": -0.0553, |
| "reward": 6.300098657608032, |
| "reward_std": 1.0886222496628761, |
| "rewards/mrr_reward": 0.48148561269044876, |
| "rewards/rank_analyze_format_reward": 0.5164258703589439, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 568.953125, |
| "epoch": 1.32, |
| "grad_norm": 0.0446629598736763, |
| "kl": 0.006804466247558594, |
| "learning_rate": 1.999270401248057e-05, |
| "loss": -0.0217, |
| "reward": 6.716991662979126, |
| "reward_std": 1.4165300726890564, |
| "rewards/mrr_reward": 0.5955481305718422, |
| "rewards/rank_analyze_format_reward": 0.43624673783779144, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.984375, |
| "epoch": 1.328, |
| "grad_norm": 0.03580395132303238, |
| "kl": 0.008753776550292969, |
| "learning_rate": 1.999260770872415e-05, |
| "loss": 0.0004, |
| "reward": 5.964340448379517, |
| "reward_std": 1.1115762144327164, |
| "rewards/mrr_reward": 0.3914806619286537, |
| "rewards/rank_analyze_format_reward": 0.5389279127120972, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.453125, |
| "epoch": 1.336, |
| "grad_norm": 0.03663269430398941, |
| "kl": 0.007559776306152344, |
| "learning_rate": 1.999251077377999e-05, |
| "loss": -0.0458, |
| "reward": 6.374191999435425, |
| "reward_std": 1.2914250791072845, |
| "rewards/mrr_reward": 0.4921874962747097, |
| "rewards/rank_analyze_format_reward": 0.5749406069517136, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 573.96875, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.033142536878585815, |
| "kl": 0.0076465606689453125, |
| "learning_rate": 1.999241320765421e-05, |
| "loss": -0.0188, |
| "reward": 6.285339713096619, |
| "reward_std": 1.369349867105484, |
| "rewards/mrr_reward": 0.4851934462785721, |
| "rewards/rank_analyze_format_reward": 0.479331374168396, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.984375, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.03498915210366249, |
| "kl": 0.008561134338378906, |
| "learning_rate": 1.9992315010352978e-05, |
| "loss": -0.0274, |
| "reward": 6.904844880104065, |
| "reward_std": 1.2574369013309479, |
| "rewards/mrr_reward": 0.6432291716337204, |
| "rewards/rank_analyze_format_reward": 0.44520963728427887, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.765625, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.03831981122493744, |
| "kl": 0.010663986206054688, |
| "learning_rate": 1.9992216181882492e-05, |
| "loss": -0.0089, |
| "reward": 6.317743182182312, |
| "reward_std": 1.1524057537317276, |
| "rewards/mrr_reward": 0.4661892428994179, |
| "rewards/rank_analyze_format_reward": 0.5786355137825012, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977221935987473, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9977221935987473, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 549.5625, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.03543487936258316, |
| "kl": 0.008349418640136719, |
| "learning_rate": 1.9992116722248997e-05, |
| "loss": 0.009, |
| "reward": 6.215874433517456, |
| "reward_std": 1.5494773089885712, |
| "rewards/mrr_reward": 0.5115203410387039, |
| "rewards/rank_analyze_format_reward": 0.3291758671402931, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 563.40625, |
| "epoch": 1.376, |
| "grad_norm": 0.03732339292764664, |
| "kl": 0.00754547119140625, |
| "learning_rate": 1.9992016631458774e-05, |
| "loss": -0.0044, |
| "reward": 6.333064913749695, |
| "reward_std": 1.5073265135288239, |
| "rewards/mrr_reward": 0.5336371585726738, |
| "rewards/rank_analyze_format_reward": 0.373980063945055, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9962525367736816, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9962525367736816, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.109375, |
| "epoch": 1.384, |
| "grad_norm": 0.03858262300491333, |
| "kl": 0.0084381103515625, |
| "learning_rate": 1.9991915909518146e-05, |
| "loss": -0.0484, |
| "reward": 6.43630588054657, |
| "reward_std": 1.1248457580804825, |
| "rewards/mrr_reward": 0.5476128421723843, |
| "rewards/rank_analyze_format_reward": 0.5014840885996819, |
| "rewards/rank_answer_foramt_reward": 0.779296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.625, |
| "epoch": 1.392, |
| "grad_norm": 0.031945351511240005, |
| "kl": 0.008295059204101562, |
| "learning_rate": 1.9991814556433475e-05, |
| "loss": -0.0415, |
| "reward": 6.396109580993652, |
| "reward_std": 1.257490947842598, |
| "rewards/mrr_reward": 0.5233692973852158, |
| "rewards/rank_analyze_format_reward": 0.5277140513062477, |
| "rewards/rank_answer_foramt_reward": 0.830078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9919514656066895, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9919514656066895, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.390625, |
| "epoch": 1.4, |
| "grad_norm": 0.03459803760051727, |
| "kl": 0.009899139404296875, |
| "learning_rate": 1.9991712572211163e-05, |
| "loss": -0.0283, |
| "reward": 6.693902850151062, |
| "reward_std": 1.5040415227413177, |
| "rewards/mrr_reward": 0.6218750029802322, |
| "rewards/rank_analyze_format_reward": 0.38910815864801407, |
| "rewards/rank_answer_foramt_reward": 0.822265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.997514471411705, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.997514471411705, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.953125, |
| "epoch": 1.408, |
| "grad_norm": 0.03564726933836937, |
| "kl": 0.008817672729492188, |
| "learning_rate": 1.999160995685765e-05, |
| "loss": 0.0041, |
| "reward": 6.393037676811218, |
| "reward_std": 1.5305506885051727, |
| "rewards/mrr_reward": 0.4989583343267441, |
| "rewards/rank_analyze_format_reward": 0.5883020609617233, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.8125, |
| "epoch": 1.416, |
| "grad_norm": 0.0317256897687912, |
| "kl": 0.007786750793457031, |
| "learning_rate": 1.9991506710379424e-05, |
| "loss": -0.0038, |
| "reward": 6.927413702011108, |
| "reward_std": 1.1728498041629791, |
| "rewards/mrr_reward": 0.5874070003628731, |
| "rewards/rank_analyze_format_reward": 0.7316593676805496, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.015625, |
| "epoch": 1.424, |
| "grad_norm": 0.03363263979554176, |
| "kl": 0.006999015808105469, |
| "learning_rate": 1.9991402832783e-05, |
| "loss": -0.0222, |
| "reward": 6.334396123886108, |
| "reward_std": 1.1969702541828156, |
| "rewards/mrr_reward": 0.49082961305975914, |
| "rewards/rank_analyze_format_reward": 0.532570406794548, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9671052694320679, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.0625, |
| "epoch": 1.432, |
| "grad_norm": 0.0324893482029438, |
| "kl": 0.0071563720703125, |
| "learning_rate": 1.9991298324074942e-05, |
| "loss": -0.0215, |
| "reward": 6.33887255191803, |
| "reward_std": 1.0472588911652565, |
| "rewards/mrr_reward": 0.4644531235098839, |
| "rewards/rank_analyze_format_reward": 0.6554184406995773, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9817143976688385, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.996271014213562, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.53125, |
| "epoch": 1.44, |
| "grad_norm": 0.038431741297245026, |
| "kl": 0.0085906982421875, |
| "learning_rate": 1.999119318426185e-05, |
| "loss": -0.0425, |
| "reward": 5.978406071662903, |
| "reward_std": 1.4375847578048706, |
| "rewards/mrr_reward": 0.37400173395872116, |
| "rewards/rank_analyze_format_reward": 0.654976025223732, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9937897026538849, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9937897026538849, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.21875, |
| "epoch": 1.448, |
| "grad_norm": 0.03540867194533348, |
| "kl": 0.008702278137207031, |
| "learning_rate": 1.9991087413350367e-05, |
| "loss": 0.0273, |
| "reward": 7.00466001033783, |
| "reward_std": 1.6190518736839294, |
| "rewards/mrr_reward": 0.6167968884110451, |
| "rewards/rank_analyze_format_reward": 0.7251099199056625, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.012996495701372623, |
| "rewards/rank_initial_format_reward": 0.9817143976688385, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9817143976688385, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.96875, |
| "epoch": 1.456, |
| "grad_norm": 0.03581292927265167, |
| "kl": 0.009832382202148438, |
| "learning_rate": 1.9990981011347172e-05, |
| "loss": -0.0048, |
| "reward": 5.947044134140015, |
| "reward_std": 0.9751862585544586, |
| "rewards/mrr_reward": 0.3696366660296917, |
| "rewards/rank_analyze_format_reward": 0.6148670166730881, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.296875, |
| "epoch": 1.464, |
| "grad_norm": 0.0367310456931591, |
| "kl": 0.008490562438964844, |
| "learning_rate": 1.999087397825899e-05, |
| "loss": -0.0219, |
| "reward": 6.547907114028931, |
| "reward_std": 0.9392938762903214, |
| "rewards/mrr_reward": 0.5391679182648659, |
| "rewards/rank_analyze_format_reward": 0.5218650847673416, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.265625, |
| "epoch": 1.472, |
| "grad_norm": 0.036771222949028015, |
| "kl": 0.00975799560546875, |
| "learning_rate": 1.9990766314092575e-05, |
| "loss": 0.0093, |
| "reward": 7.504821062088013, |
| "reward_std": 1.017032966017723, |
| "rewards/mrr_reward": 0.7345609813928604, |
| "rewards/rank_analyze_format_reward": 0.6810796558856964, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.015625, |
| "epoch": 1.48, |
| "grad_norm": 0.035612449049949646, |
| "kl": 0.009767532348632812, |
| "learning_rate": 1.9990658018854737e-05, |
| "loss": -0.0192, |
| "reward": 6.572237730026245, |
| "reward_std": 1.1024248152971268, |
| "rewards/mrr_reward": 0.5541418492794037, |
| "rewards/rank_analyze_format_reward": 0.48548950254917145, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9966137856245041, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9809887856245041, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.625, |
| "epoch": 1.488, |
| "grad_norm": 0.03268010914325714, |
| "kl": 0.007953643798828125, |
| "learning_rate": 1.9990549092552307e-05, |
| "loss": -0.0163, |
| "reward": 7.752923250198364, |
| "reward_std": 1.1804132461547852, |
| "rewards/mrr_reward": 0.7671007066965103, |
| "rewards/rank_analyze_format_reward": 0.7255359292030334, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.6875, |
| "epoch": 1.496, |
| "grad_norm": 0.033477578312158585, |
| "kl": 0.010374069213867188, |
| "learning_rate": 1.999043953519217e-05, |
| "loss": -0.0446, |
| "reward": 6.951627135276794, |
| "reward_std": 1.142410233616829, |
| "rewards/mrr_reward": 0.5984498858451843, |
| "rewards/rank_analyze_format_reward": 0.6071917712688446, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977788031101227, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9977788031101227, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.640625, |
| "epoch": 1.504, |
| "grad_norm": 0.03295287489891052, |
| "kl": 0.008535385131835938, |
| "learning_rate": 1.999032934678125e-05, |
| "loss": -0.0228, |
| "reward": 6.217561841011047, |
| "reward_std": 0.885568305850029, |
| "rewards/mrr_reward": 0.4311321973800659, |
| "rewards/rank_analyze_format_reward": 0.5961254388093948, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.359375, |
| "epoch": 1.512, |
| "grad_norm": 0.03543277829885483, |
| "kl": 0.008122444152832031, |
| "learning_rate": 1.99902185273265e-05, |
| "loss": -0.0164, |
| "reward": 6.661153793334961, |
| "reward_std": 0.7280477955937386, |
| "rewards/mrr_reward": 0.5164248645305634, |
| "rewards/rank_analyze_format_reward": 0.623047724366188, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9969455003738403, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9969455003738403, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.109375, |
| "epoch": 1.52, |
| "grad_norm": 0.037641286849975586, |
| "kl": 0.008847236633300781, |
| "learning_rate": 1.999010707683492e-05, |
| "loss": -0.0658, |
| "reward": 6.347493886947632, |
| "reward_std": 0.9116277098655701, |
| "rewards/mrr_reward": 0.43297991901636124, |
| "rewards/rank_analyze_format_reward": 0.666047140955925, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.25, |
| "epoch": 1.528, |
| "grad_norm": 0.03490091487765312, |
| "kl": 0.009138107299804688, |
| "learning_rate": 1.998999499531356e-05, |
| "loss": -0.0516, |
| "reward": 7.269640564918518, |
| "reward_std": 0.6211766228079796, |
| "rewards/mrr_reward": 0.6727616675198078, |
| "rewards/rank_analyze_format_reward": 0.6237920597195625, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.011442550458014011, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 581.203125, |
| "epoch": 1.536, |
| "grad_norm": 0.03623748943209648, |
| "kl": 0.010578155517578125, |
| "learning_rate": 1.9989882282769485e-05, |
| "loss": -0.0328, |
| "reward": 6.117859721183777, |
| "reward_std": 1.3281791657209396, |
| "rewards/mrr_reward": 0.4266369119286537, |
| "rewards/rank_analyze_format_reward": 0.5988272428512573, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9980392158031464, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9980392158031464, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.859375, |
| "epoch": 1.544, |
| "grad_norm": 0.03717571124434471, |
| "kl": 0.012277603149414062, |
| "learning_rate": 1.9989768939209826e-05, |
| "loss": -0.0291, |
| "reward": 6.472392678260803, |
| "reward_std": 1.0950042307376862, |
| "rewards/mrr_reward": 0.4958333298563957, |
| "rewards/rank_analyze_format_reward": 0.7097622603178024, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.59375, |
| "epoch": 1.552, |
| "grad_norm": 0.031889960169792175, |
| "kl": 0.0109710693359375, |
| "learning_rate": 1.9989654964641737e-05, |
| "loss": -0.0297, |
| "reward": 6.880647420883179, |
| "reward_std": 0.8547341153025627, |
| "rewards/mrr_reward": 0.580071933567524, |
| "rewards/rank_analyze_format_reward": 0.6654053032398224, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.234375, |
| "epoch": 1.56, |
| "grad_norm": 0.036006029695272446, |
| "kl": 0.012359619140625, |
| "learning_rate": 1.998954035907242e-05, |
| "loss": -0.0148, |
| "reward": 6.577338814735413, |
| "reward_std": 1.2951306998729706, |
| "rewards/mrr_reward": 0.5316840335726738, |
| "rewards/rank_analyze_format_reward": 0.5455324053764343, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.953125, |
| "epoch": 1.568, |
| "grad_norm": 0.030870715156197548, |
| "kl": 0.0106658935546875, |
| "learning_rate": 1.9989425122509113e-05, |
| "loss": -0.0305, |
| "reward": 6.851738214492798, |
| "reward_std": 0.7111386805772781, |
| "rewards/mrr_reward": 0.5270833075046539, |
| "rewards/rank_analyze_format_reward": 0.784420520067215, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.390625, |
| "epoch": 1.576, |
| "grad_norm": 0.03541827201843262, |
| "kl": 0.011089324951171875, |
| "learning_rate": 1.9989309254959096e-05, |
| "loss": -0.0172, |
| "reward": 7.087416887283325, |
| "reward_std": 1.3555363416671753, |
| "rewards/mrr_reward": 0.6345486119389534, |
| "rewards/rank_analyze_format_reward": 0.7166584730148315, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992897808551788, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9836647808551788, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.65625, |
| "epoch": 1.584, |
| "grad_norm": 0.0363469123840332, |
| "kl": 0.011362075805664062, |
| "learning_rate": 1.998919275642968e-05, |
| "loss": 0.0444, |
| "reward": 6.63647723197937, |
| "reward_std": 1.5355401635169983, |
| "rewards/mrr_reward": 0.537413202226162, |
| "rewards/rank_analyze_format_reward": 0.7207763195037842, |
| "rewards/rank_answer_foramt_reward": 0.81640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.609375, |
| "epoch": 1.592, |
| "grad_norm": 0.0350794680416584, |
| "kl": 0.010587692260742188, |
| "learning_rate": 1.9989075626928237e-05, |
| "loss": -0.0324, |
| "reward": 7.593704700469971, |
| "reward_std": 1.2587448060512543, |
| "rewards/mrr_reward": 0.7476562410593033, |
| "rewards/rank_analyze_format_reward": 0.6766816079616547, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.125, |
| "epoch": 1.6, |
| "grad_norm": 0.03762039542198181, |
| "kl": 0.011552810668945312, |
| "learning_rate": 1.9988957866462155e-05, |
| "loss": 0.0012, |
| "reward": 6.556584358215332, |
| "reward_std": 0.7802992425858974, |
| "rewards/mrr_reward": 0.4782552234828472, |
| "rewards/rank_analyze_format_reward": 0.7131441533565521, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.515625, |
| "epoch": 1.608, |
| "grad_norm": 0.03529913350939751, |
| "kl": 0.011404037475585938, |
| "learning_rate": 1.998883947503888e-05, |
| "loss": -0.0285, |
| "reward": 6.747278928756714, |
| "reward_std": 0.8986479938030243, |
| "rewards/mrr_reward": 0.5536458566784859, |
| "rewards/rank_analyze_format_reward": 0.6645828187465668, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975329041481018, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9975329041481018, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.9375, |
| "epoch": 1.616, |
| "grad_norm": 0.03680592030286789, |
| "kl": 0.011167526245117188, |
| "learning_rate": 1.9988720452665885e-05, |
| "loss": -0.0142, |
| "reward": 7.523893117904663, |
| "reward_std": 1.5109763741493225, |
| "rewards/mrr_reward": 0.7254092246294022, |
| "rewards/rank_analyze_format_reward": 0.6940822452306747, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.013573232106864452, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9973393976688385, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.671875, |
| "epoch": 1.624, |
| "grad_norm": 0.031202631071209908, |
| "kl": 0.011119842529296875, |
| "learning_rate": 1.9988600799350685e-05, |
| "loss": -0.011, |
| "reward": 7.5892653465271, |
| "reward_std": 0.8718039393424988, |
| "rewards/mrr_reward": 0.7219122052192688, |
| "rewards/rank_analyze_format_reward": 0.7621632516384125, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.46875, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.03513794392347336, |
| "kl": 0.011323928833007812, |
| "learning_rate": 1.998848051510085e-05, |
| "loss": -0.0116, |
| "reward": 7.873760461807251, |
| "reward_std": 0.9668747493997216, |
| "rewards/mrr_reward": 0.8035590276122093, |
| "rewards/rank_analyze_format_reward": 0.738672748208046, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.203125, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.03514819219708443, |
| "kl": 0.012950897216796875, |
| "learning_rate": 1.9988359599923964e-05, |
| "loss": -0.0071, |
| "reward": 6.787094712257385, |
| "reward_std": 1.260214388370514, |
| "rewards/mrr_reward": 0.5561384037137032, |
| "rewards/rank_analyze_format_reward": 0.7126235961914062, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.375, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.03228963539004326, |
| "kl": 0.012664794921875, |
| "learning_rate": 1.9988238053827677e-05, |
| "loss": -0.0375, |
| "reward": 7.256770491600037, |
| "reward_std": 0.48313772678375244, |
| "rewards/mrr_reward": 0.6615699455142021, |
| "rewards/rank_analyze_format_reward": 0.6667077392339706, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.546875, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 0.03247380256652832, |
| "kl": 0.011510848999023438, |
| "learning_rate": 1.9988115876819654e-05, |
| "loss": -0.0066, |
| "reward": 7.226160883903503, |
| "reward_std": 0.7291913609951735, |
| "rewards/mrr_reward": 0.6235677003860474, |
| "rewards/rank_analyze_format_reward": 0.7994760870933533, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.203125, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.03485206514596939, |
| "kl": 0.012271881103515625, |
| "learning_rate": 1.9987993068907624e-05, |
| "loss": -0.0256, |
| "reward": 6.819635629653931, |
| "reward_std": 1.4911159574985504, |
| "rewards/mrr_reward": 0.5687500089406967, |
| "rewards/rank_analyze_format_reward": 0.6510217636823654, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.014248084276914597, |
| "rewards/rank_initial_format_reward": 0.9963235259056091, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9963235259056091, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.609375, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 0.03412061929702759, |
| "kl": 0.011951446533203125, |
| "learning_rate": 1.9987869630099333e-05, |
| "loss": -0.0183, |
| "reward": 7.066570281982422, |
| "reward_std": 1.0215441137552261, |
| "rewards/mrr_reward": 0.6142113208770752, |
| "rewards/rank_analyze_format_reward": 0.7064512819051743, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.9375, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.035708099603652954, |
| "kl": 0.011415481567382812, |
| "learning_rate": 1.998774556040259e-05, |
| "loss": 0.0207, |
| "reward": 7.148289203643799, |
| "reward_std": 0.40048687532544136, |
| "rewards/mrr_reward": 0.6233135014772415, |
| "rewards/rank_analyze_format_reward": 0.6780079305171967, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.90625, |
| "epoch": 1.688, |
| "grad_norm": 0.03594611957669258, |
| "kl": 0.012083053588867188, |
| "learning_rate": 1.9987620859825225e-05, |
| "loss": 0.007, |
| "reward": 7.130272626876831, |
| "reward_std": 1.0038132444024086, |
| "rewards/mrr_reward": 0.5943328440189362, |
| "rewards/rank_analyze_format_reward": 0.8232535421848297, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.3125, |
| "epoch": 1.696, |
| "grad_norm": 0.03679274767637253, |
| "kl": 0.012105941772460938, |
| "learning_rate": 1.9987495528375115e-05, |
| "loss": 0.0071, |
| "reward": 7.324402451515198, |
| "reward_std": 1.0858530811965466, |
| "rewards/mrr_reward": 0.6619791686534882, |
| "rewards/rank_analyze_format_reward": 0.7507043033838272, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.578125, |
| "epoch": 1.704, |
| "grad_norm": 0.032590728253126144, |
| "kl": 0.011951446533203125, |
| "learning_rate": 1.998736956606018e-05, |
| "loss": -0.0204, |
| "reward": 7.353400826454163, |
| "reward_std": 1.2198131084442139, |
| "rewards/mrr_reward": 0.6886904761195183, |
| "rewards/rank_analyze_format_reward": 0.7275451272726059, |
| "rewards/rank_answer_foramt_reward": 0.92578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.125, |
| "epoch": 1.712, |
| "grad_norm": 0.03516772761940956, |
| "kl": 0.011865615844726562, |
| "learning_rate": 1.9987242972888368e-05, |
| "loss": -0.0256, |
| "reward": 6.390246629714966, |
| "reward_std": 1.2206433862447739, |
| "rewards/mrr_reward": 0.4318266250193119, |
| "rewards/rank_analyze_format_reward": 0.744662880897522, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.515625, |
| "epoch": 1.72, |
| "grad_norm": 0.034917186945676804, |
| "kl": 0.010568618774414062, |
| "learning_rate": 1.9987115748867685e-05, |
| "loss": -0.0075, |
| "reward": 7.013459086418152, |
| "reward_std": 1.1758202761411667, |
| "rewards/mrr_reward": 0.6146267428994179, |
| "rewards/rank_analyze_format_reward": 0.6679250225424767, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.75, |
| "epoch": 1.728, |
| "grad_norm": 0.03526122495532036, |
| "kl": 0.011056900024414062, |
| "learning_rate": 1.9986987894006164e-05, |
| "loss": -0.0348, |
| "reward": 7.004386067390442, |
| "reward_std": 1.013509213924408, |
| "rewards/mrr_reward": 0.6190104186534882, |
| "rewards/rank_analyze_format_reward": 0.6921346038579941, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.014774133451282978, |
| "rewards/rank_initial_format_reward": 0.9898194670677185, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9898194670677185, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.296875, |
| "epoch": 1.736, |
| "grad_norm": 0.03770997375249863, |
| "kl": 0.013156890869140625, |
| "learning_rate": 1.9986859408311878e-05, |
| "loss": -0.0243, |
| "reward": 7.723721385002136, |
| "reward_std": 1.2577708065509796, |
| "rewards/mrr_reward": 0.8122829794883728, |
| "rewards/rank_analyze_format_reward": 0.5527143776416779, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.390625, |
| "epoch": 1.744, |
| "grad_norm": 0.03594063222408295, |
| "kl": 0.01361083984375, |
| "learning_rate": 1.9986730291792945e-05, |
| "loss": -0.0125, |
| "reward": 6.763970732688904, |
| "reward_std": 1.1345993727445602, |
| "rewards/mrr_reward": 0.5692894533276558, |
| "rewards/rank_analyze_format_reward": 0.6561181470751762, |
| "rewards/rank_answer_foramt_reward": 0.857421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.8125, |
| "epoch": 1.752, |
| "grad_norm": 0.03884103149175644, |
| "kl": 0.012781143188476562, |
| "learning_rate": 1.9986600544457524e-05, |
| "loss": -0.0204, |
| "reward": 6.09786331653595, |
| "reward_std": 1.128006488084793, |
| "rewards/mrr_reward": 0.4502604268491268, |
| "rewards/rank_analyze_format_reward": 0.48041532188653946, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.0625, |
| "epoch": 1.76, |
| "grad_norm": 0.03623896837234497, |
| "kl": 0.01174163818359375, |
| "learning_rate": 1.9986470166313805e-05, |
| "loss": 0.0022, |
| "reward": 6.999427080154419, |
| "reward_std": 0.6746486648917198, |
| "rewards/mrr_reward": 0.608004704117775, |
| "rewards/rank_analyze_format_reward": 0.6940975040197372, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991554021835327, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835304021835327, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.984375, |
| "epoch": 1.768, |
| "grad_norm": 0.03681391850113869, |
| "kl": 0.01168060302734375, |
| "learning_rate": 1.9986339157370026e-05, |
| "loss": 0.0156, |
| "reward": 6.224501371383667, |
| "reward_std": 1.1424128413200378, |
| "rewards/mrr_reward": 0.4014260917901993, |
| "rewards/rank_analyze_format_reward": 0.7514945864677429, |
| "rewards/rank_answer_foramt_reward": 0.884765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.609375, |
| "epoch": 1.776, |
| "grad_norm": 0.035715728998184204, |
| "kl": 0.011419296264648438, |
| "learning_rate": 1.9986207517634466e-05, |
| "loss": -0.0075, |
| "reward": 6.838769316673279, |
| "reward_std": 1.138169839978218, |
| "rewards/mrr_reward": 0.5326946973800659, |
| "rewards/rank_analyze_format_reward": 0.8114955276250839, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9951225072145462, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9951225072145462, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.0625, |
| "epoch": 1.784, |
| "grad_norm": 0.03682604804635048, |
| "kl": 0.012578964233398438, |
| "learning_rate": 1.998607524711543e-05, |
| "loss": -0.024, |
| "reward": 6.9665446281433105, |
| "reward_std": 1.341919094324112, |
| "rewards/mrr_reward": 0.5829861015081406, |
| "rewards/rank_analyze_format_reward": 0.8279595226049423, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.046875, |
| "epoch": 1.792, |
| "grad_norm": 0.03405497223138809, |
| "kl": 0.012273788452148438, |
| "learning_rate": 1.9985942345821285e-05, |
| "loss": 0.0101, |
| "reward": 7.542881608009338, |
| "reward_std": 0.9405869543552399, |
| "rewards/mrr_reward": 0.70331721752882, |
| "rewards/rank_analyze_format_reward": 0.8331284523010254, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.34375, |
| "epoch": 1.8, |
| "grad_norm": 0.03588543459773064, |
| "kl": 0.010999679565429688, |
| "learning_rate": 1.998580881376042e-05, |
| "loss": 0.0182, |
| "reward": 7.186712980270386, |
| "reward_std": 1.0480735301971436, |
| "rewards/mrr_reward": 0.6471106112003326, |
| "rewards/rank_analyze_format_reward": 0.7620245963335037, |
| "rewards/rank_answer_foramt_reward": 0.869140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.96875, |
| "epoch": 1.808, |
| "grad_norm": 0.03298197686672211, |
| "kl": 0.011339187622070312, |
| "learning_rate": 1.9985674650941265e-05, |
| "loss": -0.0075, |
| "reward": 6.580728888511658, |
| "reward_std": 1.171303242444992, |
| "rewards/mrr_reward": 0.49358879029750824, |
| "rewards/rank_analyze_format_reward": 0.7704363465309143, |
| "rewards/rank_answer_foramt_reward": 0.8984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.984375, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 0.032649777829647064, |
| "kl": 0.011322021484375, |
| "learning_rate": 1.9985539857372303e-05, |
| "loss": -0.0173, |
| "reward": 6.867309093475342, |
| "reward_std": 0.8678697645664215, |
| "rewards/mrr_reward": 0.557161457836628, |
| "rewards/rank_analyze_format_reward": 0.736319363117218, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.921875, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 0.038027409464120865, |
| "kl": 0.011888504028320312, |
| "learning_rate": 1.998540443306204e-05, |
| "loss": 0.0094, |
| "reward": 6.406673431396484, |
| "reward_std": 1.36880823969841, |
| "rewards/mrr_reward": 0.47701510787010193, |
| "rewards/rank_analyze_format_reward": 0.7093206197023392, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 691.5625, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 0.03739263862371445, |
| "kl": 0.016162872314453125, |
| "learning_rate": 1.998526837801904e-05, |
| "loss": -0.0163, |
| "reward": 6.16663670539856, |
| "reward_std": 0.7895801216363907, |
| "rewards/mrr_reward": 0.36532738618552685, |
| "rewards/rank_analyze_format_reward": 0.8101400434970856, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.013089364394545555, |
| "rewards/rank_initial_format_reward": 0.9976895451545715, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9820645451545715, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.640625, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.03776266425848007, |
| "kl": 0.010849952697753906, |
| "learning_rate": 1.9985131692251887e-05, |
| "loss": 0.0068, |
| "reward": 6.760786771774292, |
| "reward_std": 1.123057559132576, |
| "rewards/mrr_reward": 0.5368923768401146, |
| "rewards/rank_analyze_format_reward": 0.7293006330728531, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985989332199097, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9985989332199097, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.015625, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 0.03792120888829231, |
| "kl": 0.01201629638671875, |
| "learning_rate": 1.9984994375769222e-05, |
| "loss": -0.0071, |
| "reward": 7.100589036941528, |
| "reward_std": 1.1812313869595528, |
| "rewards/mrr_reward": 0.6353298723697662, |
| "rewards/rank_analyze_format_reward": 0.6587639302015305, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.921875, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.034471407532691956, |
| "kl": 0.012517929077148438, |
| "learning_rate": 1.9984856428579717e-05, |
| "loss": -0.0154, |
| "reward": 7.1253886222839355, |
| "reward_std": 0.9221947491168976, |
| "rewards/mrr_reward": 0.6032862067222595, |
| "rewards/rank_analyze_format_reward": 0.8059941083192825, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.453125, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 0.03487012907862663, |
| "kl": 0.010175704956054688, |
| "learning_rate": 1.998471785069208e-05, |
| "loss": -0.0252, |
| "reward": 7.108256816864014, |
| "reward_std": 1.0599358081817627, |
| "rewards/mrr_reward": 0.6225880309939384, |
| "rewards/rank_analyze_format_reward": 0.7038420140743256, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.71875, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.03678525239229202, |
| "kl": 0.011201858520507812, |
| "learning_rate": 1.9984578642115072e-05, |
| "loss": -0.0072, |
| "reward": 7.174077749252319, |
| "reward_std": 1.0892403870821, |
| "rewards/mrr_reward": 0.6339843720197678, |
| "rewards/rank_analyze_format_reward": 0.7592339366674423, |
| "rewards/rank_answer_foramt_reward": 0.92578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.484375, |
| "epoch": 1.88, |
| "grad_norm": 0.03547394275665283, |
| "kl": 0.012744903564453125, |
| "learning_rate": 1.998443880285748e-05, |
| "loss": -0.0371, |
| "reward": 7.188539266586304, |
| "reward_std": 1.558995470404625, |
| "rewards/mrr_reward": 0.6575520932674408, |
| "rewards/rank_analyze_format_reward": 0.7216200232505798, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9828085899353027, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9828085899353027, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.84375, |
| "epoch": 1.888, |
| "grad_norm": 0.03983930125832558, |
| "kl": 0.011608123779296875, |
| "learning_rate": 1.9984298332928142e-05, |
| "loss": -0.0087, |
| "reward": 7.840075254440308, |
| "reward_std": 1.4074196517467499, |
| "rewards/mrr_reward": 0.8069010525941849, |
| "rewards/rank_analyze_format_reward": 0.7511429786682129, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.953125, |
| "epoch": 1.896, |
| "grad_norm": 0.03890369087457657, |
| "kl": 0.012737274169921875, |
| "learning_rate": 1.9984157232335926e-05, |
| "loss": -0.0036, |
| "reward": 6.91395902633667, |
| "reward_std": 1.4606387615203857, |
| "rewards/mrr_reward": 0.5853298753499985, |
| "rewards/rank_analyze_format_reward": 0.6925735026597977, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9966736733913422, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9966736733913422, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.671875, |
| "epoch": 1.904, |
| "grad_norm": 0.03019222430884838, |
| "kl": 0.010616302490234375, |
| "learning_rate": 1.998401550108975e-05, |
| "loss": -0.0175, |
| "reward": 7.32897675037384, |
| "reward_std": 0.9665245488286018, |
| "rewards/mrr_reward": 0.676432304084301, |
| "rewards/rank_analyze_format_reward": 0.7130914330482483, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.8125, |
| "epoch": 1.912, |
| "grad_norm": 0.03469119966030121, |
| "kl": 0.011953353881835938, |
| "learning_rate": 1.9983873139198565e-05, |
| "loss": 0.0037, |
| "reward": 6.612988352775574, |
| "reward_std": 1.0446814224123955, |
| "rewards/mrr_reward": 0.47405755519866943, |
| "rewards/rank_analyze_format_reward": 0.7889089584350586, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.203125, |
| "epoch": 1.92, |
| "grad_norm": 0.03232351318001747, |
| "kl": 0.0106048583984375, |
| "learning_rate": 1.9983730146671363e-05, |
| "loss": -0.0148, |
| "reward": 6.731534361839294, |
| "reward_std": 1.2606956362724304, |
| "rewards/mrr_reward": 0.5494357720017433, |
| "rewards/rank_analyze_format_reward": 0.6778992190957069, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.515625, |
| "epoch": 1.928, |
| "grad_norm": 0.03420478478074074, |
| "kl": 0.01219940185546875, |
| "learning_rate": 1.9983586523517175e-05, |
| "loss": -0.0438, |
| "reward": 7.590452075004578, |
| "reward_std": 1.6388859748840332, |
| "rewards/mrr_reward": 0.7669270932674408, |
| "rewards/rank_analyze_format_reward": 0.6672752201557159, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.84375, |
| "epoch": 1.936, |
| "grad_norm": 0.03276892751455307, |
| "kl": 0.01093292236328125, |
| "learning_rate": 1.9983442269745073e-05, |
| "loss": -0.0257, |
| "reward": 6.300868988037109, |
| "reward_std": 0.995959609746933, |
| "rewards/mrr_reward": 0.4575396776199341, |
| "rewards/rank_analyze_format_reward": 0.6771672368049622, |
| "rewards/rank_answer_foramt_reward": 0.818359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9954044073820114, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9954044073820114, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.359375, |
| "epoch": 1.944, |
| "grad_norm": 0.03749570995569229, |
| "kl": 0.010999679565429688, |
| "learning_rate": 1.9983297385364166e-05, |
| "loss": -0.0007, |
| "reward": 7.169430136680603, |
| "reward_std": 1.120530128479004, |
| "rewards/mrr_reward": 0.6705729141831398, |
| "rewards/rank_analyze_format_reward": 0.6551071107387543, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.453125, |
| "epoch": 1.952, |
| "grad_norm": 0.04473373666405678, |
| "kl": 0.01145172119140625, |
| "learning_rate": 1.9983151870383614e-05, |
| "loss": -0.0107, |
| "reward": 6.484450101852417, |
| "reward_std": 1.0988103747367859, |
| "rewards/mrr_reward": 0.46861979365348816, |
| "rewards/rank_analyze_format_reward": 0.764109417796135, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9932432472705841, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9776182472705841, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.328125, |
| "epoch": 1.96, |
| "grad_norm": 0.03367482125759125, |
| "kl": 0.013071060180664062, |
| "learning_rate": 1.99830057248126e-05, |
| "loss": -0.0296, |
| "reward": 6.685883641242981, |
| "reward_std": 0.9533030688762665, |
| "rewards/mrr_reward": 0.5285590291023254, |
| "rewards/rank_analyze_format_reward": 0.6315773874521255, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.34375, |
| "epoch": 1.968, |
| "grad_norm": 0.034726452082395554, |
| "kl": 0.010492324829101562, |
| "learning_rate": 1.9982858948660363e-05, |
| "loss": -0.0181, |
| "reward": 6.672136902809143, |
| "reward_std": 1.0319916605949402, |
| "rewards/mrr_reward": 0.48452381789684296, |
| "rewards/rank_analyze_format_reward": 0.7848227173089981, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.4375, |
| "epoch": 1.976, |
| "grad_norm": 0.03818318620324135, |
| "kl": 0.00988006591796875, |
| "learning_rate": 1.9982711541936167e-05, |
| "loss": -0.0117, |
| "reward": 7.333935976028442, |
| "reward_std": 1.081397719681263, |
| "rewards/mrr_reward": 0.6711309552192688, |
| "rewards/rank_analyze_format_reward": 0.7507446557283401, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.75, |
| "epoch": 1.984, |
| "grad_norm": 0.09436666965484619, |
| "kl": 0.037281036376953125, |
| "learning_rate": 1.9982563504649327e-05, |
| "loss": -0.0099, |
| "reward": 7.042810320854187, |
| "reward_std": 1.4771567583084106, |
| "rewards/mrr_reward": 0.6272321417927742, |
| "rewards/rank_analyze_format_reward": 0.7505638301372528, |
| "rewards/rank_answer_foramt_reward": 0.80859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.03125, |
| "epoch": 1.992, |
| "grad_norm": 0.040008366107940674, |
| "kl": 0.011211395263671875, |
| "learning_rate": 1.998241483680919e-05, |
| "loss": 0.0073, |
| "reward": 6.97391951084137, |
| "reward_std": 1.2818303257226944, |
| "rewards/mrr_reward": 0.5991319715976715, |
| "rewards/rank_analyze_format_reward": 0.7252494841814041, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9827118366956711, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.59375, |
| "epoch": 2.0, |
| "grad_norm": 0.03942989930510521, |
| "kl": 0.011959075927734375, |
| "learning_rate": 1.9982265538425157e-05, |
| "loss": 0.0371, |
| "reward": 6.234715461730957, |
| "reward_std": 1.436354637145996, |
| "rewards/mrr_reward": 0.47746776789426804, |
| "rewards/rank_analyze_format_reward": 0.5933748111128807, |
| "rewards/rank_answer_foramt_reward": 0.818359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9800696671009064, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9799154698848724, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.46875, |
| "epoch": 2.008, |
| "grad_norm": 0.03840762376785278, |
| "kl": 0.01082611083984375, |
| "learning_rate": 1.9982115609506648e-05, |
| "loss": -0.0149, |
| "reward": 7.465001344680786, |
| "reward_std": 1.3534227311611176, |
| "rewards/mrr_reward": 0.701078861951828, |
| "rewards/rank_analyze_format_reward": 0.73264279961586, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.4375, |
| "epoch": 2.016, |
| "grad_norm": 0.03643304482102394, |
| "kl": 0.0111846923828125, |
| "learning_rate": 1.9981965050063134e-05, |
| "loss": 0.0095, |
| "reward": 6.563894629478455, |
| "reward_std": 1.0918782949447632, |
| "rewards/mrr_reward": 0.49427083879709244, |
| "rewards/rank_analyze_format_reward": 0.6970945447683334, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975927919149399, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9975927919149399, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.140625, |
| "epoch": 2.024, |
| "grad_norm": 0.03906136751174927, |
| "kl": 0.011167526245117188, |
| "learning_rate": 1.998181386010413e-05, |
| "loss": 0.0076, |
| "reward": 7.883460879325867, |
| "reward_std": 0.9759941548109055, |
| "rewards/mrr_reward": 0.7747395783662796, |
| "rewards/rank_analyze_format_reward": 0.8154443502426147, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.5, |
| "epoch": 2.032, |
| "grad_norm": 0.035734061151742935, |
| "kl": 0.014560699462890625, |
| "learning_rate": 1.9981662039639182e-05, |
| "loss": -0.0189, |
| "reward": 7.1975014209747314, |
| "reward_std": 1.0746060460805893, |
| "rewards/mrr_reward": 0.6796006858348846, |
| "rewards/rank_analyze_format_reward": 0.5943331569433212, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.125, |
| "epoch": 2.04, |
| "grad_norm": 0.035253558307886124, |
| "kl": 0.011186599731445312, |
| "learning_rate": 1.9981509588677883e-05, |
| "loss": -0.0403, |
| "reward": 6.368244171142578, |
| "reward_std": 0.9275897480547428, |
| "rewards/mrr_reward": 0.43844248354434967, |
| "rewards/rank_analyze_format_reward": 0.7043182849884033, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.515625, |
| "epoch": 2.048, |
| "grad_norm": 0.03972550854086876, |
| "kl": 0.012750625610351562, |
| "learning_rate": 1.9981356507229862e-05, |
| "loss": -0.0269, |
| "reward": 6.800292491912842, |
| "reward_std": 1.1689245849847794, |
| "rewards/mrr_reward": 0.5714161694049835, |
| "rewards/rank_analyze_format_reward": 0.627255916595459, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.203125, |
| "epoch": 2.056, |
| "grad_norm": 0.03649269416928291, |
| "kl": 0.009584426879882812, |
| "learning_rate": 1.9981202795304787e-05, |
| "loss": -0.0051, |
| "reward": 7.230230689048767, |
| "reward_std": 1.2953073680400848, |
| "rewards/mrr_reward": 0.6908172070980072, |
| "rewards/rank_analyze_format_reward": 0.6150907501578331, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.5, |
| "epoch": 2.064, |
| "grad_norm": 0.03631464019417763, |
| "kl": 0.010379791259765625, |
| "learning_rate": 1.9981048452912364e-05, |
| "loss": 0.0223, |
| "reward": 6.423146486282349, |
| "reward_std": 1.1042785942554474, |
| "rewards/mrr_reward": 0.46945685893297195, |
| "rewards/rank_analyze_format_reward": 0.7502825409173965, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.90625, |
| "epoch": 2.072, |
| "grad_norm": 0.03482425957918167, |
| "kl": 0.011255264282226562, |
| "learning_rate": 1.998089348006235e-05, |
| "loss": -0.0123, |
| "reward": 6.214681625366211, |
| "reward_std": 1.3232944011688232, |
| "rewards/mrr_reward": 0.4206349328160286, |
| "rewards/rank_analyze_format_reward": 0.6760562360286713, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.171875, |
| "epoch": 2.08, |
| "grad_norm": 0.03515848517417908, |
| "kl": 0.008946418762207031, |
| "learning_rate": 1.998073787676453e-05, |
| "loss": -0.0182, |
| "reward": 6.849403977394104, |
| "reward_std": 1.1705361306667328, |
| "rewards/mrr_reward": 0.5722842365503311, |
| "rewards/rank_analyze_format_reward": 0.6880913898348808, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9976112246513367, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9976112246513367, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.765625, |
| "epoch": 2.088, |
| "grad_norm": 0.033137645572423935, |
| "kl": 0.010486602783203125, |
| "learning_rate": 1.9980581643028732e-05, |
| "loss": -0.0257, |
| "reward": 6.725158452987671, |
| "reward_std": 0.918092668056488, |
| "rewards/mrr_reward": 0.5186383947730064, |
| "rewards/rank_analyze_format_reward": 0.7032241895794868, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.21875, |
| "epoch": 2.096, |
| "grad_norm": 0.03581111505627632, |
| "kl": 0.011474609375, |
| "learning_rate": 1.9980424778864825e-05, |
| "loss": -0.028, |
| "reward": 6.540898442268372, |
| "reward_std": 1.0225854963064194, |
| "rewards/mrr_reward": 0.4951760917901993, |
| "rewards/rank_analyze_format_reward": 0.618479423224926, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.578125, |
| "epoch": 2.104, |
| "grad_norm": 0.033393606543540955, |
| "kl": 0.009927749633789062, |
| "learning_rate": 1.9980267284282718e-05, |
| "loss": -0.0212, |
| "reward": 7.45247495174408, |
| "reward_std": 0.4263784661889076, |
| "rewards/mrr_reward": 0.7192708477377892, |
| "rewards/rank_analyze_format_reward": 0.5848489105701447, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.234375, |
| "epoch": 2.112, |
| "grad_norm": 0.034759897738695145, |
| "kl": 0.008701324462890625, |
| "learning_rate": 1.998010915929236e-05, |
| "loss": -0.0146, |
| "reward": 7.091454982757568, |
| "reward_std": 0.9185773134231567, |
| "rewards/mrr_reward": 0.6087363660335541, |
| "rewards/rank_analyze_format_reward": 0.6858063042163849, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.015625, |
| "epoch": 2.12, |
| "grad_norm": 0.03577468544244766, |
| "kl": 0.01145172119140625, |
| "learning_rate": 1.9979950403903732e-05, |
| "loss": -0.0014, |
| "reward": 6.77937126159668, |
| "reward_std": 1.279131755232811, |
| "rewards/mrr_reward": 0.563430055975914, |
| "rewards/rank_analyze_format_reward": 0.6385087594389915, |
| "rewards/rank_answer_foramt_reward": 0.8984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.375, |
| "epoch": 2.128, |
| "grad_norm": 0.03831040486693382, |
| "kl": 0.010589599609375, |
| "learning_rate": 1.9979791018126874e-05, |
| "loss": -0.0106, |
| "reward": 6.678526520729065, |
| "reward_std": 1.4866646826267242, |
| "rewards/mrr_reward": 0.5484995096921921, |
| "rewards/rank_analyze_format_reward": 0.595856636762619, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.3125, |
| "epoch": 2.136, |
| "grad_norm": 0.0412379652261734, |
| "kl": 0.010957717895507812, |
| "learning_rate": 1.9979631001971848e-05, |
| "loss": -0.0116, |
| "reward": 7.416189789772034, |
| "reward_std": 1.0926668643951416, |
| "rewards/mrr_reward": 0.7192708253860474, |
| "rewards/rank_analyze_format_reward": 0.6211378127336502, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.828125, |
| "epoch": 2.144, |
| "grad_norm": 0.03350284695625305, |
| "kl": 0.008955001831054688, |
| "learning_rate": 1.9979470355448756e-05, |
| "loss": -0.0158, |
| "reward": 7.620032906532288, |
| "reward_std": 0.6238258853554726, |
| "rewards/mrr_reward": 0.7218749970197678, |
| "rewards/rank_analyze_format_reward": 0.7774548083543777, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.359375, |
| "epoch": 2.152, |
| "grad_norm": 0.03770367428660393, |
| "kl": 0.011816024780273438, |
| "learning_rate": 1.9979309078567756e-05, |
| "loss": -0.0043, |
| "reward": 6.694323897361755, |
| "reward_std": 1.3028307557106018, |
| "rewards/mrr_reward": 0.5659226104617119, |
| "rewards/rank_analyze_format_reward": 0.5534504503011703, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.828125, |
| "epoch": 2.16, |
| "grad_norm": 0.03764275088906288, |
| "kl": 0.009786605834960938, |
| "learning_rate": 1.9979147171339022e-05, |
| "loss": -0.019, |
| "reward": 6.99415135383606, |
| "reward_std": 1.3437075316905975, |
| "rewards/mrr_reward": 0.6053075417876244, |
| "rewards/rank_analyze_format_reward": 0.6764369979500771, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.4375, |
| "epoch": 2.168, |
| "grad_norm": 0.03837438300251961, |
| "kl": 0.011205673217773438, |
| "learning_rate": 1.9978984633772795e-05, |
| "loss": -0.0289, |
| "reward": 5.901566505432129, |
| "reward_std": 0.9236202016472816, |
| "rewards/mrr_reward": 0.35381324775516987, |
| "rewards/rank_analyze_format_reward": 0.59522345662117, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.375, |
| "epoch": 2.176, |
| "grad_norm": 0.04059956222772598, |
| "kl": 0.011159896850585938, |
| "learning_rate": 1.9978821465879332e-05, |
| "loss": -0.0362, |
| "reward": 6.7173460721969604, |
| "reward_std": 0.7962133586406708, |
| "rewards/mrr_reward": 0.5370783656835556, |
| "rewards/rank_analyze_format_reward": 0.6237201392650604, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.328125, |
| "epoch": 2.184, |
| "grad_norm": 0.03902539238333702, |
| "kl": 0.0102081298828125, |
| "learning_rate": 1.9978657667668945e-05, |
| "loss": -0.032, |
| "reward": 6.786892771720886, |
| "reward_std": 1.4763158559799194, |
| "rewards/mrr_reward": 0.6156250163912773, |
| "rewards/rank_analyze_format_reward": 0.49029337987303734, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.28125, |
| "epoch": 2.192, |
| "grad_norm": 0.035940494388341904, |
| "kl": 0.012010574340820312, |
| "learning_rate": 1.9978493239151976e-05, |
| "loss": -0.0052, |
| "reward": 7.241865515708923, |
| "reward_std": 1.5207486748695374, |
| "rewards/mrr_reward": 0.6480902805924416, |
| "rewards/rank_analyze_format_reward": 0.7944418787956238, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968671798706055, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9968671798706055, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.765625, |
| "epoch": 2.2, |
| "grad_norm": 0.03509166091680527, |
| "kl": 0.013257980346679688, |
| "learning_rate": 1.997832818033881e-05, |
| "loss": 0.0139, |
| "reward": 6.9878867864608765, |
| "reward_std": 1.2263060361146927, |
| "rewards/mrr_reward": 0.592051088809967, |
| "rewards/rank_analyze_format_reward": 0.7836297750473022, |
| "rewards/rank_answer_foramt_reward": 0.884765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.640625, |
| "epoch": 2.208, |
| "grad_norm": 0.03720112144947052, |
| "kl": 0.013032913208007812, |
| "learning_rate": 1.9978162491239882e-05, |
| "loss": -0.0178, |
| "reward": 7.190923571586609, |
| "reward_std": 1.171968013048172, |
| "rewards/mrr_reward": 0.640625, |
| "rewards/rank_analyze_format_reward": 0.6909236311912537, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.5625, |
| "epoch": 2.216, |
| "grad_norm": 0.04091575741767883, |
| "kl": 0.012386322021484375, |
| "learning_rate": 1.997799617186565e-05, |
| "loss": -0.003, |
| "reward": 6.570623397827148, |
| "reward_std": 1.0336104482412338, |
| "rewards/mrr_reward": 0.48072298616170883, |
| "rewards/rank_analyze_format_reward": 0.7270140051841736, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9994212985038757, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9994212985038757, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.53125, |
| "epoch": 2.224, |
| "grad_norm": 0.03800741583108902, |
| "kl": 0.01259613037109375, |
| "learning_rate": 1.9977829222226622e-05, |
| "loss": -0.0266, |
| "reward": 6.372930645942688, |
| "reward_std": 0.8913363832980394, |
| "rewards/mrr_reward": 0.46861979365348816, |
| "rewards/rank_analyze_format_reward": 0.731744721531868, |
| "rewards/rank_answer_foramt_reward": 0.802734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9976112246513367, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9819862246513367, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.8125, |
| "epoch": 2.232, |
| "grad_norm": 0.0358574353158474, |
| "kl": 0.012134552001953125, |
| "learning_rate": 1.9977661642333344e-05, |
| "loss": -0.0335, |
| "reward": 6.156337261199951, |
| "reward_std": 1.1192015409469604, |
| "rewards/mrr_reward": 0.4033792242407799, |
| "rewards/rank_analyze_format_reward": 0.7136551886796951, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9966137856245041, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9966137856245041, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.6875, |
| "epoch": 2.24, |
| "grad_norm": 0.03665808215737343, |
| "kl": 0.012432098388671875, |
| "learning_rate": 1.99774934321964e-05, |
| "loss": -0.0148, |
| "reward": 7.189491271972656, |
| "reward_std": 1.3065388202667236, |
| "rewards/mrr_reward": 0.682291679084301, |
| "rewards/rank_analyze_format_reward": 0.6308017671108246, |
| "rewards/rank_answer_foramt_reward": 0.8828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967927634716034, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9811677634716034, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.296875, |
| "epoch": 2.248, |
| "grad_norm": 0.039501260966062546, |
| "kl": 0.010639190673828125, |
| "learning_rate": 1.9977324591826415e-05, |
| "loss": -0.0105, |
| "reward": 6.4820040464401245, |
| "reward_std": 1.1038605086505413, |
| "rewards/mrr_reward": 0.45491691678762436, |
| "rewards/rank_analyze_format_reward": 0.7659522593021393, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.012876884080469608, |
| "rewards/rank_initial_format_reward": 0.9974177181720734, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9974177181720734, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.796875, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.040637850761413574, |
| "kl": 0.012548446655273438, |
| "learning_rate": 1.9977155121234056e-05, |
| "loss": 0.008, |
| "reward": 6.498598098754883, |
| "reward_std": 1.4399305284023285, |
| "rewards/mrr_reward": 0.4924045279622078, |
| "rewards/rank_analyze_format_reward": 0.7026933282613754, |
| "rewards/rank_answer_foramt_reward": 0.8359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.609375, |
| "epoch": 2.2640000000000002, |
| "grad_norm": 0.039998337626457214, |
| "kl": 0.01102447509765625, |
| "learning_rate": 1.9976985020430022e-05, |
| "loss": 0.0019, |
| "reward": 6.484407901763916, |
| "reward_std": 0.9918918311595917, |
| "rewards/mrr_reward": 0.4627170190215111, |
| "rewards/rank_analyze_format_reward": 0.7155710011720657, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.28125, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 0.034088097512722015, |
| "kl": 0.0094451904296875, |
| "learning_rate": 1.9976814289425066e-05, |
| "loss": 0.0066, |
| "reward": 6.765654683113098, |
| "reward_std": 1.0432685762643814, |
| "rewards/mrr_reward": 0.5332837402820587, |
| "rewards/rank_analyze_format_reward": 0.6904967427253723, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.1875, |
| "epoch": 2.2800000000000002, |
| "grad_norm": 0.03567035123705864, |
| "kl": 0.015941619873046875, |
| "learning_rate": 1.9976642928229965e-05, |
| "loss": -0.0143, |
| "reward": 7.0589940547943115, |
| "reward_std": 0.7747539728879929, |
| "rewards/mrr_reward": 0.5898003429174423, |
| "rewards/rank_analyze_format_reward": 0.7460509389638901, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.59375, |
| "epoch": 2.288, |
| "grad_norm": 0.033182136714458466, |
| "kl": 0.009305953979492188, |
| "learning_rate": 1.997647093685555e-05, |
| "loss": 0.0029, |
| "reward": 7.651683449745178, |
| "reward_std": 0.4577641859650612, |
| "rewards/mrr_reward": 0.7307725697755814, |
| "rewards/rank_analyze_format_reward": 0.7285931408405304, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.5, |
| "epoch": 2.296, |
| "grad_norm": 0.03666054829955101, |
| "kl": 0.010992050170898438, |
| "learning_rate": 1.9976298315312675e-05, |
| "loss": -0.0206, |
| "reward": 7.6038994789123535, |
| "reward_std": 1.4697020053863525, |
| "rewards/mrr_reward": 0.7263020724058151, |
| "rewards/rank_analyze_format_reward": 0.7533785998821259, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.65625, |
| "epoch": 2.304, |
| "grad_norm": 0.03196245804429054, |
| "kl": 0.009922027587890625, |
| "learning_rate": 1.9976125063612254e-05, |
| "loss": -0.0084, |
| "reward": 7.176369905471802, |
| "reward_std": 1.0738315135240555, |
| "rewards/mrr_reward": 0.6143229156732559, |
| "rewards/rank_analyze_format_reward": 0.7952503263950348, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.3125, |
| "epoch": 2.312, |
| "grad_norm": 0.036641672253608704, |
| "kl": 0.01151275634765625, |
| "learning_rate": 1.9975951181765226e-05, |
| "loss": -0.0135, |
| "reward": 6.732638239860535, |
| "reward_std": 1.1722622215747833, |
| "rewards/mrr_reward": 0.5541604608297348, |
| "rewards/rank_analyze_format_reward": 0.6565065011382103, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.125, |
| "epoch": 2.32, |
| "grad_norm": 0.03352293372154236, |
| "kl": 0.00873565673828125, |
| "learning_rate": 1.9975776669782572e-05, |
| "loss": -0.0098, |
| "reward": 7.056705951690674, |
| "reward_std": 0.74837876111269, |
| "rewards/mrr_reward": 0.5602182596921921, |
| "rewards/rank_analyze_format_reward": 0.8049077540636063, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.012763278558850288, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 594.203125, |
| "epoch": 2.328, |
| "grad_norm": 0.03776485472917557, |
| "kl": 0.011941909790039062, |
| "learning_rate": 1.997560152767532e-05, |
| "loss": -0.011, |
| "reward": 7.487109661102295, |
| "reward_std": 0.8209907524287701, |
| "rewards/mrr_reward": 0.7063492089509964, |
| "rewards/rank_analyze_format_reward": 0.7513267993927002, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.453125, |
| "epoch": 2.336, |
| "grad_norm": 0.037363357841968536, |
| "kl": 0.013795852661132812, |
| "learning_rate": 1.997542575545453e-05, |
| "loss": 0.0103, |
| "reward": 7.0443562269210815, |
| "reward_std": 1.2127674743533134, |
| "rewards/mrr_reward": 0.5873635932803154, |
| "rewards/rank_analyze_format_reward": 0.7670525759458542, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.578125, |
| "epoch": 2.344, |
| "grad_norm": 0.03627763316035271, |
| "kl": 0.01107025146484375, |
| "learning_rate": 1.9975249353131304e-05, |
| "loss": -0.0153, |
| "reward": 7.811681151390076, |
| "reward_std": 1.2126767039299011, |
| "rewards/mrr_reward": 0.8069444298744202, |
| "rewards/rank_analyze_format_reward": 0.6737470030784607, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.75, |
| "epoch": 2.352, |
| "grad_norm": 0.03678389638662338, |
| "kl": 0.009225845336914062, |
| "learning_rate": 1.9975072320716785e-05, |
| "loss": -0.0396, |
| "reward": 6.60707688331604, |
| "reward_std": 1.2315413057804108, |
| "rewards/mrr_reward": 0.5236669182777405, |
| "rewards/rank_analyze_format_reward": 0.584445059299469, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.390625, |
| "epoch": 2.36, |
| "grad_norm": 0.03965243697166443, |
| "kl": 0.013790130615234375, |
| "learning_rate": 1.997489465822216e-05, |
| "loss": -0.0106, |
| "reward": 7.775085091590881, |
| "reward_std": 1.3139366656541824, |
| "rewards/mrr_reward": 0.8050967454910278, |
| "rewards/rank_analyze_format_reward": 0.6439119428396225, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9944556355476379, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9944556355476379, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.703125, |
| "epoch": 2.368, |
| "grad_norm": 0.039732079952955246, |
| "kl": 0.011026382446289062, |
| "learning_rate": 1.9974716365658646e-05, |
| "loss": -0.0467, |
| "reward": 7.427183151245117, |
| "reward_std": 1.2437842339277267, |
| "rewards/mrr_reward": 0.7122395783662796, |
| "rewards/rank_analyze_format_reward": 0.7224476039409637, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.25, |
| "epoch": 2.376, |
| "grad_norm": 0.03692779690027237, |
| "kl": 0.010181427001953125, |
| "learning_rate": 1.9974537443037504e-05, |
| "loss": -0.0119, |
| "reward": 7.6130610704422, |
| "reward_std": 1.0890810042619705, |
| "rewards/mrr_reward": 0.7197916656732559, |
| "rewards/rank_analyze_format_reward": 0.8107158541679382, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977221935987473, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9977221935987473, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.5625, |
| "epoch": 2.384, |
| "grad_norm": 0.039538830518722534, |
| "kl": 0.013807296752929688, |
| "learning_rate": 1.9974357890370038e-05, |
| "loss": -0.008, |
| "reward": 6.635961890220642, |
| "reward_std": 0.7657184079289436, |
| "rewards/mrr_reward": 0.48133058845996857, |
| "rewards/rank_analyze_format_reward": 0.7749776542186737, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.328125, |
| "epoch": 2.392, |
| "grad_norm": 0.03752804920077324, |
| "kl": 0.013395309448242188, |
| "learning_rate": 1.9974177707667594e-05, |
| "loss": 0.0098, |
| "reward": 7.015731453895569, |
| "reward_std": 1.1001620888710022, |
| "rewards/mrr_reward": 0.6202257052063942, |
| "rewards/rank_analyze_format_reward": 0.6712347567081451, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9962500035762787, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9806250035762787, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.0, |
| "epoch": 2.4, |
| "grad_norm": 0.039174940437078476, |
| "kl": 0.011379241943359375, |
| "learning_rate": 1.9973996894941545e-05, |
| "loss": -0.0011, |
| "reward": 7.0397127866744995, |
| "reward_std": 1.0055639445781708, |
| "rewards/mrr_reward": 0.5911644473671913, |
| "rewards/rank_analyze_format_reward": 0.7411527559161186, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.3125, |
| "epoch": 2.408, |
| "grad_norm": 0.0378861129283905, |
| "kl": 0.011119842529296875, |
| "learning_rate": 1.9973815452203314e-05, |
| "loss": 0.0056, |
| "reward": 7.447056770324707, |
| "reward_std": 1.2125954329967499, |
| "rewards/mrr_reward": 0.7122395783662796, |
| "rewards/rank_analyze_format_reward": 0.669488713145256, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9994612038135529, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9994612038135529, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.15625, |
| "epoch": 2.416, |
| "grad_norm": 0.03746671974658966, |
| "kl": 0.011993408203125, |
| "learning_rate": 1.997363337946437e-05, |
| "loss": -0.0198, |
| "reward": 6.575040936470032, |
| "reward_std": 0.9133451133966446, |
| "rewards/mrr_reward": 0.5259300693869591, |
| "rewards/rank_analyze_format_reward": 0.5709301829338074, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.546875, |
| "epoch": 2.424, |
| "grad_norm": 0.03554888442158699, |
| "kl": 0.010702133178710938, |
| "learning_rate": 1.9973450676736205e-05, |
| "loss": -0.0074, |
| "reward": 7.236762523651123, |
| "reward_std": 0.604234242811799, |
| "rewards/mrr_reward": 0.6168154701590538, |
| "rewards/rank_analyze_format_reward": 0.8082548528909683, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.53125, |
| "epoch": 2.432, |
| "grad_norm": 0.03649809956550598, |
| "kl": 0.011503219604492188, |
| "learning_rate": 1.997326734403036e-05, |
| "loss": -0.0239, |
| "reward": 6.725122928619385, |
| "reward_std": 1.2124179899692535, |
| "rewards/mrr_reward": 0.5331907123327255, |
| "rewards/rank_analyze_format_reward": 0.7333841472864151, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.609375, |
| "epoch": 2.44, |
| "grad_norm": 0.03357018902897835, |
| "kl": 0.011198043823242188, |
| "learning_rate": 1.997308338135842e-05, |
| "loss": -0.0394, |
| "reward": 7.099708437919617, |
| "reward_std": 1.0707662254571915, |
| "rewards/mrr_reward": 0.617491327226162, |
| "rewards/rank_analyze_format_reward": 0.6802160441875458, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.46875, |
| "epoch": 2.448, |
| "grad_norm": 0.04073842614889145, |
| "kl": 0.01114654541015625, |
| "learning_rate": 1.9972898788732e-05, |
| "loss": -0.0205, |
| "reward": 6.205634713172913, |
| "reward_std": 1.0768165290355682, |
| "rewards/mrr_reward": 0.40212054550647736, |
| "rewards/rank_analyze_format_reward": 0.6713712811470032, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.875, |
| "epoch": 2.456, |
| "grad_norm": 0.038296766579151154, |
| "kl": 0.0117034912109375, |
| "learning_rate": 1.9972713566162763e-05, |
| "loss": -0.0115, |
| "reward": 6.65511429309845, |
| "reward_std": 0.8909335732460022, |
| "rewards/mrr_reward": 0.5184585936367512, |
| "rewards/rank_analyze_format_reward": 0.6747215688228607, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.96875, |
| "epoch": 2.464, |
| "grad_norm": 0.03361259400844574, |
| "kl": 0.010142326354980469, |
| "learning_rate": 1.997252771366241e-05, |
| "loss": -0.0059, |
| "reward": 7.825888633728027, |
| "reward_std": 0.7059714342467487, |
| "rewards/mrr_reward": 0.8350446447730064, |
| "rewards/rank_analyze_format_reward": 0.4857100807130337, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.9375, |
| "epoch": 2.472, |
| "grad_norm": 0.03526763245463371, |
| "kl": 0.0130767822265625, |
| "learning_rate": 1.9972341231242675e-05, |
| "loss": -0.0398, |
| "reward": 6.988335967063904, |
| "reward_std": 0.7815524078905582, |
| "rewards/mrr_reward": 0.5860863253474236, |
| "rewards/rank_analyze_format_reward": 0.722115769982338, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.140625, |
| "epoch": 2.48, |
| "grad_norm": 0.03733767569065094, |
| "kl": 0.012132644653320312, |
| "learning_rate": 1.9972154118915344e-05, |
| "loss": -0.0251, |
| "reward": 7.347846150398254, |
| "reward_std": 1.1197139769792557, |
| "rewards/mrr_reward": 0.6794270873069763, |
| "rewards/rank_analyze_format_reward": 0.7024035751819611, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.796875, |
| "epoch": 2.488, |
| "grad_norm": 0.037743665277957916, |
| "kl": 0.010995864868164062, |
| "learning_rate": 1.997196637669223e-05, |
| "loss": -0.0057, |
| "reward": 7.16385281085968, |
| "reward_std": 0.9465463161468506, |
| "rewards/mrr_reward": 0.614341527223587, |
| "rewards/rank_analyze_format_reward": 0.7628190815448761, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.25, |
| "epoch": 2.496, |
| "grad_norm": 0.03614401817321777, |
| "kl": 0.010850906372070312, |
| "learning_rate": 1.99717780045852e-05, |
| "loss": -0.0312, |
| "reward": 7.732061147689819, |
| "reward_std": 0.6288701333105564, |
| "rewards/mrr_reward": 0.7590463757514954, |
| "rewards/rank_analyze_format_reward": 0.7517846375703812, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.609375, |
| "epoch": 2.504, |
| "grad_norm": 0.035900671035051346, |
| "kl": 0.010019302368164062, |
| "learning_rate": 1.997158900260614e-05, |
| "loss": 0.001, |
| "reward": 7.1635472774505615, |
| "reward_std": 1.0679296404123306, |
| "rewards/mrr_reward": 0.6484374925494194, |
| "rewards/rank_analyze_format_reward": 0.6771043539047241, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.1875, |
| "epoch": 2.512, |
| "grad_norm": 0.041989874094724655, |
| "kl": 0.015058517456054688, |
| "learning_rate": 1.9971399370767e-05, |
| "loss": -0.0166, |
| "reward": 6.863955616950989, |
| "reward_std": 0.7592495381832123, |
| "rewards/mrr_reward": 0.565854400396347, |
| "rewards/rank_analyze_format_reward": 0.6161628141999245, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.0, |
| "epoch": 2.52, |
| "grad_norm": 0.041266754269599915, |
| "kl": 0.013032913208007812, |
| "learning_rate": 1.9971209109079752e-05, |
| "loss": -0.0229, |
| "reward": 7.460736155509949, |
| "reward_std": 1.0799484848976135, |
| "rewards/mrr_reward": 0.7114583253860474, |
| "rewards/rank_analyze_format_reward": 0.6832623034715652, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.28125, |
| "epoch": 2.528, |
| "grad_norm": 0.03333236649632454, |
| "kl": 0.009759902954101562, |
| "learning_rate": 1.9971018217556416e-05, |
| "loss": -0.0106, |
| "reward": 6.682798147201538, |
| "reward_std": 0.5989858657121658, |
| "rewards/mrr_reward": 0.4994109719991684, |
| "rewards/rank_analyze_format_reward": 0.7437479048967361, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.4375, |
| "epoch": 2.536, |
| "grad_norm": 0.0350511260330677, |
| "kl": 0.010562896728515625, |
| "learning_rate": 1.997082669620905e-05, |
| "loss": -0.0302, |
| "reward": 6.6315062046051025, |
| "reward_std": 1.0686845779418945, |
| "rewards/mrr_reward": 0.4913690462708473, |
| "rewards/rank_analyze_format_reward": 0.7528755962848663, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975927770137787, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9975927770137787, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.140625, |
| "epoch": 2.544, |
| "grad_norm": 0.03914601355791092, |
| "kl": 0.013078689575195312, |
| "learning_rate": 1.997063454504975e-05, |
| "loss": -0.0055, |
| "reward": 6.575037002563477, |
| "reward_std": 1.3988100588321686, |
| "rewards/mrr_reward": 0.514732152223587, |
| "rewards/rank_analyze_format_reward": 0.6743116676807404, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.71875, |
| "epoch": 2.552, |
| "grad_norm": 0.0380953773856163, |
| "kl": 0.013456344604492188, |
| "learning_rate": 1.9970441764090654e-05, |
| "loss": -0.0518, |
| "reward": 7.295857548713684, |
| "reward_std": 1.004029467701912, |
| "rewards/mrr_reward": 0.6721354275941849, |
| "rewards/rank_analyze_format_reward": 0.7370236366987228, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9878805130720139, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9878805130720139, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.4375, |
| "epoch": 2.56, |
| "grad_norm": 0.040197595953941345, |
| "kl": 0.013912200927734375, |
| "learning_rate": 1.9970248353343943e-05, |
| "loss": -0.0075, |
| "reward": 6.5366517305374146, |
| "reward_std": 1.0288221687078476, |
| "rewards/mrr_reward": 0.4604600891470909, |
| "rewards/rank_analyze_format_reward": 0.7944208830595016, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.296875, |
| "epoch": 2.568, |
| "grad_norm": 0.038607921451330185, |
| "kl": 0.013824462890625, |
| "learning_rate": 1.997005431282183e-05, |
| "loss": 0.0172, |
| "reward": 7.0922359228134155, |
| "reward_std": 1.0854482501745224, |
| "rewards/mrr_reward": 0.603298619389534, |
| "rewards/rank_analyze_format_reward": 0.7901396751403809, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.296875, |
| "epoch": 2.576, |
| "grad_norm": 0.03311832994222641, |
| "kl": 0.011646270751953125, |
| "learning_rate": 1.996985964253657e-05, |
| "loss": -0.0369, |
| "reward": 6.7459012269973755, |
| "reward_std": 0.9181017801165581, |
| "rewards/mrr_reward": 0.5021019279956818, |
| "rewards/rank_analyze_format_reward": 0.7544548064470291, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.34375, |
| "epoch": 2.584, |
| "grad_norm": 0.037428632378578186, |
| "kl": 0.012990951538085938, |
| "learning_rate": 1.996966434250046e-05, |
| "loss": -0.0228, |
| "reward": 7.209717512130737, |
| "reward_std": 1.1640962213277817, |
| "rewards/mrr_reward": 0.6627604216337204, |
| "rewards/rank_analyze_format_reward": 0.7012539207935333, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.28125, |
| "epoch": 2.592, |
| "grad_norm": 0.035629965364933014, |
| "kl": 0.011270523071289062, |
| "learning_rate": 1.996946841272584e-05, |
| "loss": -0.0126, |
| "reward": 6.940586090087891, |
| "reward_std": 1.4230458736419678, |
| "rewards/mrr_reward": 0.5958519503474236, |
| "rewards/rank_analyze_format_reward": 0.6704594492912292, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.984375, |
| "epoch": 2.6, |
| "grad_norm": 0.032520923763513565, |
| "kl": 0.011152267456054688, |
| "learning_rate": 1.9969271853225083e-05, |
| "loss": -0.0061, |
| "reward": 7.102632761001587, |
| "reward_std": 0.8966164737939835, |
| "rewards/mrr_reward": 0.6060329973697662, |
| "rewards/rank_analyze_format_reward": 0.717139944434166, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.765625, |
| "epoch": 2.608, |
| "grad_norm": 0.03493724763393402, |
| "kl": 0.011966705322265625, |
| "learning_rate": 1.9969074664010605e-05, |
| "loss": -0.0149, |
| "reward": 6.612971305847168, |
| "reward_std": 0.9198006242513657, |
| "rewards/mrr_reward": 0.479879729449749, |
| "rewards/rank_analyze_format_reward": 0.7714625149965286, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.78125, |
| "epoch": 2.616, |
| "grad_norm": 0.03426508232951164, |
| "kl": 0.010488510131835938, |
| "learning_rate": 1.9968876845094864e-05, |
| "loss": -0.0116, |
| "reward": 7.175417423248291, |
| "reward_std": 0.7358394265174866, |
| "rewards/mrr_reward": 0.6250000074505806, |
| "rewards/rank_analyze_format_reward": 0.764837920665741, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.21875, |
| "epoch": 2.624, |
| "grad_norm": 0.03677660971879959, |
| "kl": 0.01381683349609375, |
| "learning_rate": 1.996867839649035e-05, |
| "loss": -0.0066, |
| "reward": 7.328829765319824, |
| "reward_std": 0.97315713763237, |
| "rewards/mrr_reward": 0.6791852787137032, |
| "rewards/rank_analyze_format_reward": 0.7214639633893967, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.625, |
| "epoch": 2.632, |
| "grad_norm": 0.03748798742890358, |
| "kl": 0.01129150390625, |
| "learning_rate": 1.9968479318209603e-05, |
| "loss": 0.0107, |
| "reward": 7.366376042366028, |
| "reward_std": 0.7245956286787987, |
| "rewards/mrr_reward": 0.6915550529956818, |
| "rewards/rank_analyze_format_reward": 0.6485605537891388, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.265625, |
| "epoch": 2.64, |
| "grad_norm": 0.038382068276405334, |
| "kl": 0.014995574951171875, |
| "learning_rate": 1.9968279610265194e-05, |
| "loss": -0.0244, |
| "reward": 7.351204872131348, |
| "reward_std": 1.0787476003170013, |
| "rewards/mrr_reward": 0.6888020783662796, |
| "rewards/rank_analyze_format_reward": 0.7559229284524918, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.171875, |
| "epoch": 2.648, |
| "grad_norm": 0.036839522421360016, |
| "kl": 0.011720657348632812, |
| "learning_rate": 1.9968079272669744e-05, |
| "loss": 0.0057, |
| "reward": 6.830013751983643, |
| "reward_std": 1.1275426745414734, |
| "rewards/mrr_reward": 0.5580295100808144, |
| "rewards/rank_analyze_format_reward": 0.7516124844551086, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.265625, |
| "epoch": 2.656, |
| "grad_norm": 0.03564363345503807, |
| "kl": 0.012371063232421875, |
| "learning_rate": 1.9967878305435902e-05, |
| "loss": -0.0231, |
| "reward": 7.337198257446289, |
| "reward_std": 0.7928859405219555, |
| "rewards/mrr_reward": 0.6541852578520775, |
| "rewards/rank_analyze_format_reward": 0.8176662474870682, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9972937107086182, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9972937107086182, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.890625, |
| "epoch": 2.664, |
| "grad_norm": 0.038595810532569885, |
| "kl": 0.010858535766601562, |
| "learning_rate": 1.9967676708576362e-05, |
| "loss": -0.0045, |
| "reward": 6.599027991294861, |
| "reward_std": 0.9832871407270432, |
| "rewards/mrr_reward": 0.4508804567158222, |
| "rewards/rank_analyze_format_reward": 0.8443343043327332, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.796875, |
| "epoch": 2.672, |
| "grad_norm": 0.03759092092514038, |
| "kl": 0.0118408203125, |
| "learning_rate": 1.9967474482103863e-05, |
| "loss": -0.0121, |
| "reward": 6.94339394569397, |
| "reward_std": 0.9748950749635696, |
| "rewards/mrr_reward": 0.5725632309913635, |
| "rewards/rank_analyze_format_reward": 0.733219176530838, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.65625, |
| "epoch": 2.68, |
| "grad_norm": 0.03415609896183014, |
| "kl": 0.011899948120117188, |
| "learning_rate": 1.996727162603117e-05, |
| "loss": -0.0132, |
| "reward": 6.538380742073059, |
| "reward_std": 0.7016656026244164, |
| "rewards/mrr_reward": 0.44487228989601135, |
| "rewards/rank_analyze_format_reward": 0.8194384127855301, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.390625, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 0.03727827966213226, |
| "kl": 0.011407852172851562, |
| "learning_rate": 1.9967068140371103e-05, |
| "loss": 0.0018, |
| "reward": 7.043541312217712, |
| "reward_std": 0.7633183086290956, |
| "rewards/mrr_reward": 0.5886718779802322, |
| "rewards/rank_analyze_format_reward": 0.7822953313589096, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.71875, |
| "epoch": 2.6959999999999997, |
| "grad_norm": 0.03895430639386177, |
| "kl": 0.013332366943359375, |
| "learning_rate": 1.9966864025136518e-05, |
| "loss": -0.0042, |
| "reward": 6.765047073364258, |
| "reward_std": 0.8223965764045715, |
| "rewards/mrr_reward": 0.5205729305744171, |
| "rewards/rank_analyze_format_reward": 0.7120521813631058, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.328125, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 0.038571566343307495, |
| "kl": 0.012439727783203125, |
| "learning_rate": 1.99666592803403e-05, |
| "loss": -0.0154, |
| "reward": 6.975342035293579, |
| "reward_std": 0.840043693780899, |
| "rewards/mrr_reward": 0.5555741637945175, |
| "rewards/rank_analyze_format_reward": 0.811639130115509, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.015625, |
| "epoch": 2.7119999999999997, |
| "grad_norm": 0.03610292449593544, |
| "kl": 0.012050628662109375, |
| "learning_rate": 1.9966453905995386e-05, |
| "loss": -0.0219, |
| "reward": 6.419227600097656, |
| "reward_std": 1.1811564713716507, |
| "rewards/mrr_reward": 0.46623264998197556, |
| "rewards/rank_analyze_format_reward": 0.7026196420192719, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.953125, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.040995605289936066, |
| "kl": 0.010702133178710938, |
| "learning_rate": 1.996624790211475e-05, |
| "loss": 0.0069, |
| "reward": 7.764137506484985, |
| "reward_std": 0.872068215161562, |
| "rewards/mrr_reward": 0.7421006858348846, |
| "rewards/rank_analyze_format_reward": 0.8094066381454468, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.390625, |
| "epoch": 2.7279999999999998, |
| "grad_norm": 0.036672018468379974, |
| "kl": 0.010931015014648438, |
| "learning_rate": 1.9966041268711404e-05, |
| "loss": -0.0282, |
| "reward": 7.355572700500488, |
| "reward_std": 0.8193893283605576, |
| "rewards/mrr_reward": 0.6698970645666122, |
| "rewards/rank_analyze_format_reward": 0.7189528197050095, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.25, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 0.036827512085437775, |
| "kl": 0.011262893676757812, |
| "learning_rate": 1.9965834005798395e-05, |
| "loss": 0.0009, |
| "reward": 7.232412695884705, |
| "reward_std": 0.9624816030263901, |
| "rewards/mrr_reward": 0.6321304589509964, |
| "rewards/rank_analyze_format_reward": 0.776156485080719, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.015625, |
| "epoch": 2.7439999999999998, |
| "grad_norm": 0.041386183351278305, |
| "kl": 0.01598358154296875, |
| "learning_rate": 1.9965626113388823e-05, |
| "loss": -0.0151, |
| "reward": 7.414017677307129, |
| "reward_std": 1.145112544298172, |
| "rewards/mrr_reward": 0.7001488208770752, |
| "rewards/rank_analyze_format_reward": 0.7220657765865326, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.671875, |
| "epoch": 2.752, |
| "grad_norm": 0.03522395342588425, |
| "kl": 0.011472702026367188, |
| "learning_rate": 1.9965417591495813e-05, |
| "loss": -0.0021, |
| "reward": 6.261266589164734, |
| "reward_std": 0.648932583630085, |
| "rewards/mrr_reward": 0.4110739082098007, |
| "rewards/rank_analyze_format_reward": 0.6695905476808548, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 595.421875, |
| "epoch": 2.76, |
| "grad_norm": 0.036285560578107834, |
| "kl": 0.011045455932617188, |
| "learning_rate": 1.9965208440132538e-05, |
| "loss": -0.0084, |
| "reward": 7.684949636459351, |
| "reward_std": 0.6939431764185429, |
| "rewards/mrr_reward": 0.7300347238779068, |
| "rewards/rank_analyze_format_reward": 0.782451868057251, |
| "rewards/rank_answer_foramt_reward": 0.984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9989919364452362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9989919364452362, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.15625, |
| "epoch": 2.768, |
| "grad_norm": 0.03910991922020912, |
| "kl": 0.011774063110351562, |
| "learning_rate": 1.9964998659312212e-05, |
| "loss": -0.0189, |
| "reward": 6.8010218143463135, |
| "reward_std": 0.8553978726267815, |
| "rewards/mrr_reward": 0.5462859645485878, |
| "rewards/rank_analyze_format_reward": 0.7228547036647797, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9943632036447525, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9943632036447525, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.359375, |
| "epoch": 2.776, |
| "grad_norm": 0.038007643073797226, |
| "kl": 0.010667800903320312, |
| "learning_rate": 1.996478824904808e-05, |
| "loss": 0.003, |
| "reward": 7.355239748954773, |
| "reward_std": 0.9060100615024567, |
| "rewards/mrr_reward": 0.6795572899281979, |
| "rewards/rank_analyze_format_reward": 0.7034169733524323, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.8125, |
| "epoch": 2.784, |
| "grad_norm": 0.03934268653392792, |
| "kl": 0.01244354248046875, |
| "learning_rate": 1.9964577209353438e-05, |
| "loss": -0.0656, |
| "reward": 7.2533485889434814, |
| "reward_std": 1.1916275918483734, |
| "rewards/mrr_reward": 0.6880208402872086, |
| "rewards/rank_analyze_format_reward": 0.6423462107777596, |
| "rewards/rank_answer_foramt_reward": 0.92578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9821939468383789, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9821939468383789, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.140625, |
| "epoch": 2.792, |
| "grad_norm": 0.036522042006254196, |
| "kl": 0.013441085815429688, |
| "learning_rate": 1.9964365540241614e-05, |
| "loss": 0.0013, |
| "reward": 7.095219135284424, |
| "reward_std": 1.0741036236286163, |
| "rewards/mrr_reward": 0.6266059279441833, |
| "rewards/rank_analyze_format_reward": 0.6649671494960785, |
| "rewards/rank_answer_foramt_reward": 0.939453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.15625, |
| "epoch": 2.8, |
| "grad_norm": 0.03805486485362053, |
| "kl": 0.010654449462890625, |
| "learning_rate": 1.9964153241725984e-05, |
| "loss": -0.0168, |
| "reward": 7.228509426116943, |
| "reward_std": 0.9055161625146866, |
| "rewards/mrr_reward": 0.6221354231238365, |
| "rewards/rank_analyze_format_reward": 0.8102801889181137, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.125, |
| "epoch": 2.808, |
| "grad_norm": 0.036437440663576126, |
| "kl": 0.009984970092773438, |
| "learning_rate": 1.996394031381995e-05, |
| "loss": -0.0147, |
| "reward": 6.869751572608948, |
| "reward_std": 0.8186332434415817, |
| "rewards/mrr_reward": 0.5391058996319771, |
| "rewards/rank_analyze_format_reward": 0.77509605884552, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.75, |
| "epoch": 2.816, |
| "grad_norm": 0.04047045111656189, |
| "kl": 0.013484954833984375, |
| "learning_rate": 1.996372675653696e-05, |
| "loss": 0.0169, |
| "reward": 7.264615893363953, |
| "reward_std": 1.1256726384162903, |
| "rewards/mrr_reward": 0.6621279790997505, |
| "rewards/rank_analyze_format_reward": 0.7302107512950897, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9961873590946198, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.28125, |
| "epoch": 2.824, |
| "grad_norm": 0.03461950272321701, |
| "kl": 0.011775970458984375, |
| "learning_rate": 1.9963512569890512e-05, |
| "loss": -0.0006, |
| "reward": 6.854212045669556, |
| "reward_std": 0.9395613223314285, |
| "rewards/mrr_reward": 0.5478236600756645, |
| "rewards/rank_analyze_format_reward": 0.7212026119232178, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 601.59375, |
| "epoch": 2.832, |
| "grad_norm": 0.03836781159043312, |
| "kl": 0.01210784912109375, |
| "learning_rate": 1.9963297753894134e-05, |
| "loss": -0.0137, |
| "reward": 6.814990997314453, |
| "reward_std": 1.3405095338821411, |
| "rewards/mrr_reward": 0.5263392850756645, |
| "rewards/rank_analyze_format_reward": 0.7955712080001831, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.484375, |
| "epoch": 2.84, |
| "grad_norm": 0.03449360653758049, |
| "kl": 0.013011932373046875, |
| "learning_rate": 1.9963082308561386e-05, |
| "loss": -0.021, |
| "reward": 7.53871476650238, |
| "reward_std": 0.9666296392679214, |
| "rewards/mrr_reward": 0.7184895724058151, |
| "rewards/rank_analyze_format_reward": 0.7090613692998886, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.71875, |
| "epoch": 2.848, |
| "grad_norm": 0.04157762601971626, |
| "kl": 0.012449264526367188, |
| "learning_rate": 1.9962866233905887e-05, |
| "loss": -0.0148, |
| "reward": 7.414668679237366, |
| "reward_std": 0.9551695212721825, |
| "rewards/mrr_reward": 0.693489596247673, |
| "rewards/rank_analyze_format_reward": 0.6817260161042213, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.6875, |
| "epoch": 2.856, |
| "grad_norm": 0.034568723291158676, |
| "kl": 0.011356353759765625, |
| "learning_rate": 1.9962649529941283e-05, |
| "loss": -0.0159, |
| "reward": 7.724859952926636, |
| "reward_std": 0.819370448589325, |
| "rewards/mrr_reward": 0.7456287145614624, |
| "rewards/rank_analyze_format_reward": 0.7604811042547226, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977678656578064, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9977678656578064, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.8125, |
| "epoch": 2.864, |
| "grad_norm": 0.039391741156578064, |
| "kl": 0.012399673461914062, |
| "learning_rate": 1.996243219668126e-05, |
| "loss": -0.0153, |
| "reward": 5.852332949638367, |
| "reward_std": 1.0752842128276825, |
| "rewards/mrr_reward": 0.323691725730896, |
| "rewards/rank_analyze_format_reward": 0.7135076522827148, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.046875, |
| "epoch": 2.872, |
| "grad_norm": 0.040229834616184235, |
| "kl": 0.011371612548828125, |
| "learning_rate": 1.996221423413954e-05, |
| "loss": 0.0015, |
| "reward": 6.387848496437073, |
| "reward_std": 1.1234539598226547, |
| "rewards/mrr_reward": 0.4234747067093849, |
| "rewards/rank_analyze_format_reward": 0.7486371248960495, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.109375, |
| "epoch": 2.88, |
| "grad_norm": 0.0374617837369442, |
| "kl": 0.011615753173828125, |
| "learning_rate": 1.9961995642329905e-05, |
| "loss": 0.0084, |
| "reward": 7.307153582572937, |
| "reward_std": 1.3044872879981995, |
| "rewards/mrr_reward": 0.6740141361951828, |
| "rewards/rank_analyze_format_reward": 0.7395800352096558, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.71875, |
| "epoch": 2.888, |
| "grad_norm": 0.04277306795120239, |
| "kl": 0.014926910400390625, |
| "learning_rate": 1.996177642126615e-05, |
| "loss": -0.0085, |
| "reward": 7.5333287715911865, |
| "reward_std": 0.9014619141817093, |
| "rewards/mrr_reward": 0.6997581869363785, |
| "rewards/rank_analyze_format_reward": 0.7635927647352219, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.28125, |
| "epoch": 2.896, |
| "grad_norm": 0.041481465101242065, |
| "kl": 0.014026641845703125, |
| "learning_rate": 1.996155657096213e-05, |
| "loss": -0.0272, |
| "reward": 6.84517502784729, |
| "reward_std": 1.0781239420175552, |
| "rewards/mrr_reward": 0.5556175634264946, |
| "rewards/rank_analyze_format_reward": 0.7236873209476471, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.53125, |
| "epoch": 2.904, |
| "grad_norm": 0.04171738028526306, |
| "kl": 0.012874603271484375, |
| "learning_rate": 1.9961336091431728e-05, |
| "loss": -0.0004, |
| "reward": 7.211669564247131, |
| "reward_std": 0.8956931233406067, |
| "rewards/mrr_reward": 0.6445312649011612, |
| "rewards/rank_analyze_format_reward": 0.6827789545059204, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968671649694443, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9968671649694443, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.328125, |
| "epoch": 2.912, |
| "grad_norm": 0.03671007230877876, |
| "kl": 0.011234283447265625, |
| "learning_rate": 1.9961114982688868e-05, |
| "loss": -0.0257, |
| "reward": 7.139348030090332, |
| "reward_std": 0.8967479169368744, |
| "rewards/mrr_reward": 0.6116319298744202, |
| "rewards/rank_analyze_format_reward": 0.7958708107471466, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.640625, |
| "epoch": 2.92, |
| "grad_norm": 0.033731456845998764, |
| "kl": 0.010288238525390625, |
| "learning_rate": 1.9960893244747525e-05, |
| "loss": -0.0108, |
| "reward": 7.166544318199158, |
| "reward_std": 0.6106544919312, |
| "rewards/mrr_reward": 0.6010168790817261, |
| "rewards/rank_analyze_format_reward": 0.7859143763780594, |
| "rewards/rank_answer_foramt_reward": 0.984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.625, |
| "epoch": 2.928, |
| "grad_norm": 0.036414846777915955, |
| "kl": 0.012035369873046875, |
| "learning_rate": 1.9960670877621697e-05, |
| "loss": -0.0184, |
| "reward": 6.740770578384399, |
| "reward_std": 0.8052867725491524, |
| "rewards/mrr_reward": 0.5220424234867096, |
| "rewards/rank_analyze_format_reward": 0.7072883993387222, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.90625, |
| "epoch": 2.936, |
| "grad_norm": 0.03801536187529564, |
| "kl": 0.013200759887695312, |
| "learning_rate": 1.9960447881325433e-05, |
| "loss": -0.0308, |
| "reward": 6.5149757862091064, |
| "reward_std": 0.7093790546059608, |
| "rewards/mrr_reward": 0.44720981270074844, |
| "rewards/rank_analyze_format_reward": 0.7612926959991455, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.1875, |
| "epoch": 2.944, |
| "grad_norm": 0.040969040244817734, |
| "kl": 0.01461029052734375, |
| "learning_rate": 1.996022425587282e-05, |
| "loss": -0.0185, |
| "reward": 7.41820216178894, |
| "reward_std": 0.9369710832834244, |
| "rewards/mrr_reward": 0.6916666775941849, |
| "rewards/rank_analyze_format_reward": 0.7156801223754883, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.859375, |
| "epoch": 2.952, |
| "grad_norm": 0.03583036735653877, |
| "kl": 0.011312484741210938, |
| "learning_rate": 1.9960000001277985e-05, |
| "loss": -0.0276, |
| "reward": 7.153052568435669, |
| "reward_std": 0.6550789251923561, |
| "rewards/mrr_reward": 0.6043154746294022, |
| "rewards/rank_analyze_format_reward": 0.7746230661869049, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.65625, |
| "epoch": 2.96, |
| "grad_norm": 0.04044310748577118, |
| "kl": 0.012559890747070312, |
| "learning_rate": 1.9959775117555085e-05, |
| "loss": 0.0112, |
| "reward": 7.005048513412476, |
| "reward_std": 1.1625263132154942, |
| "rewards/mrr_reward": 0.5972346290946007, |
| "rewards/rank_analyze_format_reward": 0.7758757621049881, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992187470197678, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9679687470197678, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.0, |
| "epoch": 2.968, |
| "grad_norm": 0.03745017945766449, |
| "kl": 0.013088226318359375, |
| "learning_rate": 1.995954960471833e-05, |
| "loss": 0.0034, |
| "reward": 7.509567379951477, |
| "reward_std": 0.961163155734539, |
| "rewards/mrr_reward": 0.7000806033611298, |
| "rewards/rank_analyze_format_reward": 0.7853019386529922, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.859375, |
| "epoch": 2.976, |
| "grad_norm": 0.038514841347932816, |
| "kl": 0.012132644653320312, |
| "learning_rate": 1.995932346278197e-05, |
| "loss": -0.0071, |
| "reward": 7.772576689720154, |
| "reward_std": 0.6515968926250935, |
| "rewards/mrr_reward": 0.7575520798563957, |
| "rewards/rank_analyze_format_reward": 0.7793630510568619, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.09375, |
| "epoch": 2.984, |
| "grad_norm": 0.03822262957692146, |
| "kl": 0.013006210327148438, |
| "learning_rate": 1.9959096691760284e-05, |
| "loss": -0.0155, |
| "reward": 7.534856200218201, |
| "reward_std": 0.7668619826436043, |
| "rewards/mrr_reward": 0.7446614354848862, |
| "rewards/rank_analyze_format_reward": 0.685593493282795, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9773005694150925, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9773005694150925, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.109375, |
| "epoch": 2.992, |
| "grad_norm": 0.03631046786904335, |
| "kl": 0.011692047119140625, |
| "learning_rate": 1.995886929166759e-05, |
| "loss": 0.0136, |
| "reward": 7.2465866804122925, |
| "reward_std": 0.8428932726383209, |
| "rewards/mrr_reward": 0.6395833343267441, |
| "rewards/rank_analyze_format_reward": 0.7468471378087997, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.40625, |
| "epoch": 3.0, |
| "grad_norm": 0.03699138015508652, |
| "kl": 0.011791229248046875, |
| "learning_rate": 1.9958641262518263e-05, |
| "loss": 0.0192, |
| "reward": 7.813745975494385, |
| "reward_std": 0.7177924737334251, |
| "rewards/mrr_reward": 0.7415550798177719, |
| "rewards/rank_analyze_format_reward": 0.8690101951360703, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.921875, |
| "epoch": 3.008, |
| "grad_norm": 0.035117242485284805, |
| "kl": 0.013750076293945312, |
| "learning_rate": 3.4816627469912147e-06, |
| "loss": 0.0291, |
| "reward": 7.042345643043518, |
| "reward_std": 0.7293612845242023, |
| "rewards/mrr_reward": 0.5897755473852158, |
| "rewards/rank_analyze_format_reward": 0.7183997631072998, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.125, |
| "epoch": 3.016, |
| "grad_norm": 0.03519332408905029, |
| "kl": 0.01271820068359375, |
| "learning_rate": 3.4341424424704373e-06, |
| "loss": -0.0114, |
| "reward": 6.630066633224487, |
| "reward_std": 0.9696584269404411, |
| "rewards/mrr_reward": 0.4780319929122925, |
| "rewards/rank_analyze_format_reward": 0.7667666971683502, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.59375, |
| "epoch": 3.024, |
| "grad_norm": 0.036034248769283295, |
| "kl": 0.014505386352539062, |
| "learning_rate": 3.3868813467634833e-06, |
| "loss": -0.0026, |
| "reward": 7.198747515678406, |
| "reward_std": 1.1167692840099335, |
| "rewards/mrr_reward": 0.6099516302347183, |
| "rewards/rank_analyze_format_reward": 0.8406638205051422, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.28125, |
| "epoch": 3.032, |
| "grad_norm": 0.03498866409063339, |
| "kl": 0.01226043701171875, |
| "learning_rate": 3.3398813256574847e-06, |
| "loss": -0.0099, |
| "reward": 7.360252737998962, |
| "reward_std": 0.8017124682664871, |
| "rewards/mrr_reward": 0.6533172130584717, |
| "rewards/rank_analyze_format_reward": 0.8052692711353302, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.265625, |
| "epoch": 3.04, |
| "grad_norm": 0.0373137928545475, |
| "kl": 0.013418197631835938, |
| "learning_rate": 3.2931442346328e-06, |
| "loss": 0.0002, |
| "reward": 7.177944183349609, |
| "reward_std": 1.186311975121498, |
| "rewards/mrr_reward": 0.6419270783662796, |
| "rewards/rank_analyze_format_reward": 0.6922670155763626, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.390625, |
| "epoch": 3.048, |
| "grad_norm": 0.03675440698862076, |
| "kl": 0.0125885009765625, |
| "learning_rate": 3.2466719187897555e-06, |
| "loss": 0.0072, |
| "reward": 6.830274343490601, |
| "reward_std": 0.661302238702774, |
| "rewards/mrr_reward": 0.4951822906732559, |
| "rewards/rank_analyze_format_reward": 0.8651701956987381, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.34375, |
| "epoch": 3.056, |
| "grad_norm": 0.03555948659777641, |
| "kl": 0.01416015625, |
| "learning_rate": 3.200466212775808e-06, |
| "loss": -0.0196, |
| "reward": 7.550482988357544, |
| "reward_std": 1.0687852203845978, |
| "rewards/mrr_reward": 0.71484375, |
| "rewards/rank_analyze_format_reward": 0.7417741417884827, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.625, |
| "epoch": 3.064, |
| "grad_norm": 0.03907148540019989, |
| "kl": 0.013399124145507812, |
| "learning_rate": 3.1545289407131128e-06, |
| "loss": -0.0043, |
| "reward": 7.558589220046997, |
| "reward_std": 1.2266802489757538, |
| "rewards/mrr_reward": 0.7244791686534882, |
| "rewards/rank_analyze_format_reward": 0.7446569502353668, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.09375, |
| "epoch": 3.072, |
| "grad_norm": 0.038272228091955185, |
| "kl": 0.011919021606445312, |
| "learning_rate": 3.108861916126518e-06, |
| "loss": 0.002, |
| "reward": 8.19713008403778, |
| "reward_std": 0.7785622999072075, |
| "rewards/mrr_reward": 0.85546875, |
| "rewards/rank_analyze_format_reward": 0.8240830302238464, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.640625, |
| "epoch": 3.08, |
| "grad_norm": 0.041524823755025864, |
| "kl": 0.01570892333984375, |
| "learning_rate": 3.063466941871952e-06, |
| "loss": 0.0153, |
| "reward": 7.146573901176453, |
| "reward_std": 1.074029102921486, |
| "rewards/mrr_reward": 0.6067274287343025, |
| "rewards/rank_analyze_format_reward": 0.834898442029953, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.671875, |
| "epoch": 3.088, |
| "grad_norm": 0.03623941168189049, |
| "kl": 0.013017654418945312, |
| "learning_rate": 3.0183458100652752e-06, |
| "loss": -0.0022, |
| "reward": 7.186826229095459, |
| "reward_std": 0.6731258956715465, |
| "rewards/mrr_reward": 0.5905319899320602, |
| "rewards/rank_analyze_format_reward": 0.863645926117897, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.265625, |
| "epoch": 3.096, |
| "grad_norm": 0.03690262883901596, |
| "kl": 0.01531219482421875, |
| "learning_rate": 2.9735003020115095e-06, |
| "loss": 0.0131, |
| "reward": 7.618446707725525, |
| "reward_std": 0.5352663211524487, |
| "rewards/mrr_reward": 0.7252604365348816, |
| "rewards/rank_analyze_format_reward": 0.7408426254987717, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.4375, |
| "epoch": 3.104, |
| "grad_norm": 0.03921409696340561, |
| "kl": 0.014377593994140625, |
| "learning_rate": 2.9289321881345257e-06, |
| "loss": -0.0006, |
| "reward": 7.131399869918823, |
| "reward_std": 1.3833198249340057, |
| "rewards/mrr_reward": 0.6131696403026581, |
| "rewards/rank_analyze_format_reward": 0.8094245195388794, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981250017881393, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825000017881393, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.5625, |
| "epoch": 3.112, |
| "grad_norm": 0.03858316317200661, |
| "kl": 0.01251220703125, |
| "learning_rate": 2.884643227907147e-06, |
| "loss": 0.0078, |
| "reward": 6.986513733863831, |
| "reward_std": 1.1185480952262878, |
| "rewards/mrr_reward": 0.5550533309578896, |
| "rewards/rank_analyze_format_reward": 0.8381428718566895, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.0625, |
| "epoch": 3.12, |
| "grad_norm": 0.03950336202979088, |
| "kl": 0.01474761962890625, |
| "learning_rate": 2.840635169781688e-06, |
| "loss": -0.0229, |
| "reward": 6.151705384254456, |
| "reward_std": 1.3553853258490562, |
| "rewards/mrr_reward": 0.416666679084301, |
| "rewards/rank_analyze_format_reward": 0.666062742471695, |
| "rewards/rank_answer_foramt_reward": 0.822265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.796875, |
| "epoch": 3.128, |
| "grad_norm": 0.03854774311184883, |
| "kl": 0.013458251953125, |
| "learning_rate": 2.796909751120931e-06, |
| "loss": -0.007, |
| "reward": 7.251393556594849, |
| "reward_std": 1.445090800523758, |
| "rewards/mrr_reward": 0.6562500074505806, |
| "rewards/rank_analyze_format_reward": 0.7045186460018158, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.71875, |
| "epoch": 3.136, |
| "grad_norm": 0.03738197311758995, |
| "kl": 0.013525009155273438, |
| "learning_rate": 2.7534686981295335e-06, |
| "loss": -0.0034, |
| "reward": 6.909914255142212, |
| "reward_std": 1.123517245054245, |
| "rewards/mrr_reward": 0.5471974164247513, |
| "rewards/rank_analyze_format_reward": 0.779603436589241, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.96875, |
| "epoch": 3.144, |
| "grad_norm": 0.0402173288166523, |
| "kl": 0.010915756225585938, |
| "learning_rate": 2.7103137257858867e-06, |
| "loss": 0.0094, |
| "reward": 6.921466946601868, |
| "reward_std": 0.774784117937088, |
| "rewards/mrr_reward": 0.5484312921762466, |
| "rewards/rank_analyze_format_reward": 0.8028685003519058, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975927919149399, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9975927919149399, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.296875, |
| "epoch": 3.152, |
| "grad_norm": 0.04159383475780487, |
| "kl": 0.014692306518554688, |
| "learning_rate": 2.667446537774402e-06, |
| "loss": -0.0153, |
| "reward": 6.572040319442749, |
| "reward_std": 1.726172387599945, |
| "rewards/mrr_reward": 0.4993923604488373, |
| "rewards/rank_analyze_format_reward": 0.7517964094877243, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.012423780746757984, |
| "rewards/rank_initial_format_reward": 0.994969055056572, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.994969055056572, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.765625, |
| "epoch": 3.16, |
| "grad_norm": 0.042603906244039536, |
| "kl": 0.010896682739257812, |
| "learning_rate": 2.624868826418262e-06, |
| "loss": 0.0296, |
| "reward": 7.20228123664856, |
| "reward_std": 0.9538848847150803, |
| "rewards/mrr_reward": 0.6085069477558136, |
| "rewards/rank_analyze_format_reward": 0.8265387862920761, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.578125, |
| "epoch": 3.168, |
| "grad_norm": 0.03769225999712944, |
| "kl": 0.011178970336914062, |
| "learning_rate": 2.5825822726126095e-06, |
| "loss": 0.0099, |
| "reward": 7.425193428993225, |
| "reward_std": 1.2249933630228043, |
| "rewards/mrr_reward": 0.6672184988856316, |
| "rewards/rank_analyze_format_reward": 0.8207724988460541, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.90625, |
| "epoch": 3.176, |
| "grad_norm": 0.03580842167139053, |
| "kl": 0.01148223876953125, |
| "learning_rate": 2.5405885457581793e-06, |
| "loss": 0.0051, |
| "reward": 7.102351069450378, |
| "reward_std": 1.0760410577058792, |
| "rewards/mrr_reward": 0.5938120186328888, |
| "rewards/rank_analyze_format_reward": 0.7681185156106949, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.234375, |
| "epoch": 3.184, |
| "grad_norm": 0.03826119750738144, |
| "kl": 0.014284133911132812, |
| "learning_rate": 2.4988893036954045e-06, |
| "loss": 0.0084, |
| "reward": 7.598180770874023, |
| "reward_std": 1.0497512221336365, |
| "rewards/mrr_reward": 0.7157738208770752, |
| "rewards/rank_analyze_format_reward": 0.8209080398082733, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.640625, |
| "epoch": 3.192, |
| "grad_norm": 0.039424311369657516, |
| "kl": 0.012294769287109375, |
| "learning_rate": 2.4574861926389615e-06, |
| "loss": 0.0079, |
| "reward": 7.362653613090515, |
| "reward_std": 0.824449434876442, |
| "rewards/mrr_reward": 0.6566406339406967, |
| "rewards/rank_analyze_format_reward": 0.8023822903633118, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.015625, |
| "epoch": 3.2, |
| "grad_norm": 0.03927793353796005, |
| "kl": 0.01302337646484375, |
| "learning_rate": 2.4163808471127815e-06, |
| "loss": -0.0046, |
| "reward": 7.52125608921051, |
| "reward_std": 1.2097734808921814, |
| "rewards/mrr_reward": 0.7062934041023254, |
| "rewards/rank_analyze_format_reward": 0.7816215455532074, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.014070273377001286, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.1875, |
| "epoch": 3.208, |
| "grad_norm": 0.03897445276379585, |
| "kl": 0.013471603393554688, |
| "learning_rate": 2.37557488988552e-06, |
| "loss": -0.0031, |
| "reward": 6.832857847213745, |
| "reward_std": 1.2639935612678528, |
| "rewards/mrr_reward": 0.5474764406681061, |
| "rewards/rank_analyze_format_reward": 0.7873684614896774, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.421875, |
| "epoch": 3.216, |
| "grad_norm": 0.03820272535085678, |
| "kl": 0.012819290161132812, |
| "learning_rate": 2.335069931906503e-06, |
| "loss": -0.0068, |
| "reward": 7.258768320083618, |
| "reward_std": 1.3467806428670883, |
| "rewards/mrr_reward": 0.6381944566965103, |
| "rewards/rank_analyze_format_reward": 0.7626311928033829, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.78125, |
| "epoch": 3.224, |
| "grad_norm": 0.037081021815538406, |
| "kl": 0.013256072998046875, |
| "learning_rate": 2.2948675722421086e-06, |
| "loss": -0.0032, |
| "reward": 7.068072199821472, |
| "reward_std": 1.0497987121343613, |
| "rewards/mrr_reward": 0.5989149361848831, |
| "rewards/rank_analyze_format_reward": 0.7660476416349411, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.0625, |
| "epoch": 3.232, |
| "grad_norm": 0.0360063761472702, |
| "kl": 0.015211105346679688, |
| "learning_rate": 2.254969398012663e-06, |
| "loss": -0.0158, |
| "reward": 7.160408020019531, |
| "reward_std": 1.019886076450348, |
| "rewards/mrr_reward": 0.646112360060215, |
| "rewards/rank_analyze_format_reward": 0.7062013298273087, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.453125, |
| "epoch": 3.24, |
| "grad_norm": 0.03534236177802086, |
| "kl": 0.014194488525390625, |
| "learning_rate": 2.215376984329767e-06, |
| "loss": -0.0216, |
| "reward": 7.393091082572937, |
| "reward_std": 0.8330601751804352, |
| "rewards/mrr_reward": 0.6536458283662796, |
| "rewards/rank_analyze_format_reward": 0.7999920099973679, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 604.921875, |
| "epoch": 3.248, |
| "grad_norm": 0.038391463458538055, |
| "kl": 0.013559341430664062, |
| "learning_rate": 2.1760918942341193e-06, |
| "loss": -0.0178, |
| "reward": 7.6344475746154785, |
| "reward_std": 0.9299002774059772, |
| "rewards/mrr_reward": 0.7469618022441864, |
| "rewards/rank_analyze_format_reward": 0.7185573130846024, |
| "rewards/rank_answer_foramt_reward": 0.96875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.015625, |
| "epoch": 3.2560000000000002, |
| "grad_norm": 0.03509964421391487, |
| "kl": 0.01255035400390625, |
| "learning_rate": 2.1371156786338108e-06, |
| "loss": -0.0117, |
| "reward": 6.898256897926331, |
| "reward_std": 0.9209974706172943, |
| "rewards/mrr_reward": 0.5367559418082237, |
| "rewards/rank_analyze_format_reward": 0.8304647654294968, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965170323848724, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9808920323848724, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 608.5625, |
| "epoch": 3.2640000000000002, |
| "grad_norm": 0.03695489838719368, |
| "kl": 0.0167388916015625, |
| "learning_rate": 2.098449876243096e-06, |
| "loss": -0.0313, |
| "reward": 6.917726039886475, |
| "reward_std": 0.8459838628768921, |
| "rewards/mrr_reward": 0.5599144399166107, |
| "rewards/rank_analyze_format_reward": 0.7054120153188705, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.078125, |
| "epoch": 3.2720000000000002, |
| "grad_norm": 0.039599835872650146, |
| "kl": 0.014032363891601562, |
| "learning_rate": 2.0600960135216463e-06, |
| "loss": -0.0047, |
| "reward": 7.127155780792236, |
| "reward_std": 1.1092039048671722, |
| "rewards/mrr_reward": 0.6348276287317276, |
| "rewards/rank_analyze_format_reward": 0.7182396054267883, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.296875, |
| "epoch": 3.2800000000000002, |
| "grad_norm": 0.039675965905189514, |
| "kl": 0.013446807861328125, |
| "learning_rate": 2.022055604614289e-06, |
| "loss": -0.0103, |
| "reward": 6.730955481529236, |
| "reward_std": 0.9299670159816742, |
| "rewards/mrr_reward": 0.5159474387764931, |
| "rewards/rank_analyze_format_reward": 0.7542870342731476, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9945252537727356, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9945252537727356, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.3125, |
| "epoch": 3.288, |
| "grad_norm": 0.03656487911939621, |
| "kl": 0.012666702270507812, |
| "learning_rate": 1.984330151291233e-06, |
| "loss": -0.016, |
| "reward": 6.7821091413497925, |
| "reward_std": 0.9743772521615028, |
| "rewards/mrr_reward": 0.5557911694049835, |
| "rewards/rank_analyze_format_reward": 0.7064568400382996, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9819079041481018, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9662829041481018, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.171875, |
| "epoch": 3.296, |
| "grad_norm": 0.037856802344322205, |
| "kl": 0.01450347900390625, |
| "learning_rate": 1.9469211428887813e-06, |
| "loss": -0.0176, |
| "reward": 6.753392338752747, |
| "reward_std": 0.9574991762638092, |
| "rewards/mrr_reward": 0.5006696432828903, |
| "rewards/rank_analyze_format_reward": 0.8190732151269913, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.59375, |
| "epoch": 3.304, |
| "grad_norm": 0.040868956595659256, |
| "kl": 0.011432647705078125, |
| "learning_rate": 1.9098300562505266e-06, |
| "loss": 0.0035, |
| "reward": 6.817109823226929, |
| "reward_std": 0.7666200622916222, |
| "rewards/mrr_reward": 0.5467447973787785, |
| "rewards/rank_analyze_format_reward": 0.7559238225221634, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9966736733913422, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9966736733913422, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.671875, |
| "epoch": 3.312, |
| "grad_norm": 0.04021133482456207, |
| "kl": 0.013158798217773438, |
| "learning_rate": 1.8730583556690607e-06, |
| "loss": 0.0066, |
| "reward": 6.991716146469116, |
| "reward_std": 0.8390699215233326, |
| "rewards/mrr_reward": 0.5647073462605476, |
| "rewards/rank_analyze_format_reward": 0.8067970871925354, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.8125, |
| "epoch": 3.32, |
| "grad_norm": 0.037540923804044724, |
| "kl": 0.01172637939453125, |
| "learning_rate": 1.8366074928281608e-06, |
| "loss": 0.0074, |
| "reward": 7.544142961502075, |
| "reward_std": 1.008560985326767, |
| "rewards/mrr_reward": 0.6979166716337204, |
| "rewards/rank_analyze_format_reward": 0.8081740438938141, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.994612067937851, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.994612067937851, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.15625, |
| "epoch": 3.328, |
| "grad_norm": 0.04208629950881004, |
| "kl": 0.012399673461914062, |
| "learning_rate": 1.8004789067454763e-06, |
| "loss": -0.0386, |
| "reward": 7.588791251182556, |
| "reward_std": 1.3209501877427101, |
| "rewards/mrr_reward": 0.7485863268375397, |
| "rewards/rank_analyze_format_reward": 0.6915150880813599, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9924812018871307, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9924812018871307, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.203125, |
| "epoch": 3.336, |
| "grad_norm": 0.043379127979278564, |
| "kl": 0.012195587158203125, |
| "learning_rate": 1.7646740237157256e-06, |
| "loss": 0.0323, |
| "reward": 7.012084484100342, |
| "reward_std": 1.0143009573221207, |
| "rewards/mrr_reward": 0.5759734660387039, |
| "rewards/rank_analyze_format_reward": 0.8153042197227478, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.859375, |
| "epoch": 3.344, |
| "grad_norm": 0.04017612338066101, |
| "kl": 0.014421463012695312, |
| "learning_rate": 1.7291942572543806e-06, |
| "loss": -0.006, |
| "reward": 6.709989428520203, |
| "reward_std": 1.0495906621217728, |
| "rewards/mrr_reward": 0.529706098139286, |
| "rewards/rank_analyze_format_reward": 0.7245771586894989, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967704266309738, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9967704266309738, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.203125, |
| "epoch": 3.352, |
| "grad_norm": 0.040608614683151245, |
| "kl": 0.012868881225585938, |
| "learning_rate": 1.6940410080418723e-06, |
| "loss": -0.0019, |
| "reward": 7.201984643936157, |
| "reward_std": 0.7756945788860321, |
| "rewards/mrr_reward": 0.6367187723517418, |
| "rewards/rank_analyze_format_reward": 0.7234692126512527, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.3125, |
| "epoch": 3.36, |
| "grad_norm": 0.03586459904909134, |
| "kl": 0.011873245239257812, |
| "learning_rate": 1.6592156638682887e-06, |
| "loss": -0.0093, |
| "reward": 7.138599634170532, |
| "reward_std": 0.7431515604257584, |
| "rewards/mrr_reward": 0.6127170100808144, |
| "rewards/rank_analyze_format_reward": 0.7191510647535324, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9979619532823563, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9979619532823563, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.703125, |
| "epoch": 3.368, |
| "grad_norm": 0.03901754319667816, |
| "kl": 0.01136016845703125, |
| "learning_rate": 1.6247195995785836e-06, |
| "loss": 0.003, |
| "reward": 6.6844483613967896, |
| "reward_std": 0.7994736880064011, |
| "rewards/mrr_reward": 0.4797743149101734, |
| "rewards/rank_analyze_format_reward": 0.8219916969537735, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.625, |
| "epoch": 3.376, |
| "grad_norm": 0.03650727495551109, |
| "kl": 0.01389312744140625, |
| "learning_rate": 1.5905541770183096e-06, |
| "loss": -0.0195, |
| "reward": 6.6595494747161865, |
| "reward_std": 0.500478945672512, |
| "rewards/mrr_reward": 0.4967882111668587, |
| "rewards/rank_analyze_format_reward": 0.7016936540603638, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.78125, |
| "epoch": 3.384, |
| "grad_norm": 0.04294556751847267, |
| "kl": 0.012430191040039062, |
| "learning_rate": 1.5567207449798517e-06, |
| "loss": 0.0142, |
| "reward": 7.437200546264648, |
| "reward_std": 0.8200259059667587, |
| "rewards/mrr_reward": 0.687189981341362, |
| "rewards/rank_analyze_format_reward": 0.7644545584917068, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.013829787261784077, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.3125, |
| "epoch": 3.392, |
| "grad_norm": 0.03355753794312477, |
| "kl": 0.011930465698242188, |
| "learning_rate": 1.52322063914917e-06, |
| "loss": -0.0143, |
| "reward": 7.236422419548035, |
| "reward_std": 1.3275894522666931, |
| "rewards/mrr_reward": 0.6320932507514954, |
| "rewards/rank_analyze_format_reward": 0.7606691271066666, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.0, |
| "epoch": 3.4, |
| "grad_norm": 0.03680611029267311, |
| "kl": 0.0124969482421875, |
| "learning_rate": 1.490055182053083e-06, |
| "loss": -0.0241, |
| "reward": 7.067311525344849, |
| "reward_std": 0.7071668058633804, |
| "rewards/mrr_reward": 0.5894097089767456, |
| "rewards/rank_analyze_format_reward": 0.7389693707227707, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.25, |
| "epoch": 3.408, |
| "grad_norm": 0.03699398413300514, |
| "kl": 0.012582778930664062, |
| "learning_rate": 1.4572256830070497e-06, |
| "loss": 0.0013, |
| "reward": 7.260975360870361, |
| "reward_std": 0.6704662144184113, |
| "rewards/mrr_reward": 0.6398809552192688, |
| "rewards/rank_analyze_format_reward": 0.8204772174358368, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.4375, |
| "epoch": 3.416, |
| "grad_norm": 0.04159224405884743, |
| "kl": 0.014894485473632812, |
| "learning_rate": 1.4247334380634792e-06, |
| "loss": -0.0191, |
| "reward": 7.206877589225769, |
| "reward_std": 0.8614709973335266, |
| "rewards/mrr_reward": 0.6289062574505806, |
| "rewards/rank_analyze_format_reward": 0.765241265296936, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.125, |
| "epoch": 3.424, |
| "grad_norm": 0.036023661494255066, |
| "kl": 0.011859893798828125, |
| "learning_rate": 1.3925797299605649e-06, |
| "loss": -0.0067, |
| "reward": 6.5737926959991455, |
| "reward_std": 0.9088378921151161, |
| "rewards/mrr_reward": 0.4585689455270767, |
| "rewards/rank_analyze_format_reward": 0.7960425764322281, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.1875, |
| "epoch": 3.432, |
| "grad_norm": 0.047732554376125336, |
| "kl": 0.011541366577148438, |
| "learning_rate": 1.3607658280716474e-06, |
| "loss": -0.028, |
| "reward": 7.2520798444747925, |
| "reward_std": 1.1335118561983109, |
| "rewards/mrr_reward": 0.6308097690343857, |
| "rewards/rank_analyze_format_reward": 0.7853662818670273, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.40625, |
| "epoch": 3.44, |
| "grad_norm": 0.03683609887957573, |
| "kl": 0.013605117797851562, |
| "learning_rate": 1.3292929883550998e-06, |
| "loss": -0.0073, |
| "reward": 8.030953884124756, |
| "reward_std": 0.5688543245196342, |
| "rewards/mrr_reward": 0.8156249970197678, |
| "rewards/rank_analyze_format_reward": 0.7977506220340729, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.25, |
| "epoch": 3.448, |
| "grad_norm": 0.03646247833967209, |
| "kl": 0.013217926025390625, |
| "learning_rate": 1.2981624533047432e-06, |
| "loss": 0.0074, |
| "reward": 6.805210113525391, |
| "reward_std": 0.8861361294984818, |
| "rewards/mrr_reward": 0.5289062447845936, |
| "rewards/rank_analyze_format_reward": 0.7668732404708862, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9994419664144516, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9994419664144516, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.875, |
| "epoch": 3.456, |
| "grad_norm": 0.03660254180431366, |
| "kl": 0.011661529541015625, |
| "learning_rate": 1.2673754519008008e-06, |
| "loss": -0.0102, |
| "reward": 6.942854642868042, |
| "reward_std": 0.8882918208837509, |
| "rewards/mrr_reward": 0.5530754029750824, |
| "rewards/rank_analyze_format_reward": 0.7735218703746796, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.671875, |
| "epoch": 3.464, |
| "grad_norm": 0.037669289857149124, |
| "kl": 0.013912200927734375, |
| "learning_rate": 1.2369331995613664e-06, |
| "loss": -0.0091, |
| "reward": 7.411279797554016, |
| "reward_std": 1.1482711285352707, |
| "rewards/mrr_reward": 0.673480898141861, |
| "rewards/rank_analyze_format_reward": 0.7954811006784439, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.984375, |
| "epoch": 3.472, |
| "grad_norm": 0.037749458104372025, |
| "kl": 0.01103973388671875, |
| "learning_rate": 1.206836898094439e-06, |
| "loss": -0.0045, |
| "reward": 7.326077461242676, |
| "reward_std": 0.8330521434545517, |
| "rewards/mrr_reward": 0.6379278004169464, |
| "rewards/rank_analyze_format_reward": 0.7898766249418259, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.78125, |
| "epoch": 3.48, |
| "grad_norm": 0.035765353590250015, |
| "kl": 0.012155532836914062, |
| "learning_rate": 1.1770877356504684e-06, |
| "loss": -0.02, |
| "reward": 7.883293986320496, |
| "reward_std": 0.8871591687202454, |
| "rewards/mrr_reward": 0.7798177152872086, |
| "rewards/rank_analyze_format_reward": 0.7656676918268204, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.78125, |
| "epoch": 3.488, |
| "grad_norm": 0.03705098107457161, |
| "kl": 0.012918472290039062, |
| "learning_rate": 1.1476868866754488e-06, |
| "loss": -0.0083, |
| "reward": 6.660637736320496, |
| "reward_std": 0.9346826821565628, |
| "rewards/mrr_reward": 0.500713050365448, |
| "rewards/rank_analyze_format_reward": 0.7608778774738312, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.71875, |
| "epoch": 3.496, |
| "grad_norm": 0.03969808667898178, |
| "kl": 0.015727996826171875, |
| "learning_rate": 1.1186355118645552e-06, |
| "loss": -0.0132, |
| "reward": 7.052313804626465, |
| "reward_std": 1.0558638274669647, |
| "rewards/mrr_reward": 0.5923363342881203, |
| "rewards/rank_analyze_format_reward": 0.7863690704107285, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.859375, |
| "epoch": 3.504, |
| "grad_norm": 0.037416357547044754, |
| "kl": 0.012334823608398438, |
| "learning_rate": 1.0899347581163222e-06, |
| "loss": -0.0176, |
| "reward": 7.955611228942871, |
| "reward_std": 0.7251264750957489, |
| "rewards/mrr_reward": 0.796651765704155, |
| "rewards/rank_analyze_format_reward": 0.8157641887664795, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.5, |
| "epoch": 3.512, |
| "grad_norm": 0.03700832277536392, |
| "kl": 0.011600494384765625, |
| "learning_rate": 1.0615857584873624e-06, |
| "loss": 0.0115, |
| "reward": 7.665433883666992, |
| "reward_std": 0.595935083925724, |
| "rewards/mrr_reward": 0.7211123704910278, |
| "rewards/rank_analyze_format_reward": 0.8138794153928757, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.5, |
| "epoch": 3.52, |
| "grad_norm": 0.038077887147665024, |
| "kl": 0.01373291015625, |
| "learning_rate": 1.0335896321476413e-06, |
| "loss": -0.0342, |
| "reward": 7.015413165092468, |
| "reward_std": 1.31123448908329, |
| "rewards/mrr_reward": 0.5999503880739212, |
| "rewards/rank_analyze_format_reward": 0.7072935104370117, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.0625, |
| "epoch": 3.528, |
| "grad_norm": 0.03242430090904236, |
| "kl": 0.013689041137695312, |
| "learning_rate": 1.0059474843362893e-06, |
| "loss": -0.0198, |
| "reward": 6.429423809051514, |
| "reward_std": 0.8046993911266327, |
| "rewards/mrr_reward": 0.43027032166719437, |
| "rewards/rank_analyze_format_reward": 0.7727955877780914, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.890625, |
| "epoch": 3.536, |
| "grad_norm": 0.034074753522872925, |
| "kl": 0.0110321044921875, |
| "learning_rate": 9.786604063179728e-07, |
| "loss": -0.0013, |
| "reward": 7.255928158760071, |
| "reward_std": 0.7968212515115738, |
| "rewards/mrr_reward": 0.6222656294703484, |
| "rewards/rank_analyze_format_reward": 0.8351100534200668, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.109375, |
| "epoch": 3.544, |
| "grad_norm": 0.040858954191207886, |
| "kl": 0.0142974853515625, |
| "learning_rate": 9.517294753398066e-07, |
| "loss": 0.0039, |
| "reward": 7.048562169075012, |
| "reward_std": 0.9897879660129547, |
| "rewards/mrr_reward": 0.5950024798512459, |
| "rewards/rank_analyze_format_reward": 0.7603489309549332, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.3125, |
| "epoch": 3.552, |
| "grad_norm": 0.033798523247241974, |
| "kl": 0.01203155517578125, |
| "learning_rate": 9.251557545888312e-07, |
| "loss": -0.0221, |
| "reward": 7.666335582733154, |
| "reward_std": 0.6299453526735306, |
| "rewards/mrr_reward": 0.7138020843267441, |
| "rewards/rank_analyze_format_reward": 0.8006909340620041, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.01411290280520916, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.46875, |
| "epoch": 3.56, |
| "grad_norm": 0.039384886622428894, |
| "kl": 0.012357711791992188, |
| "learning_rate": 8.989402931500434e-07, |
| "loss": -0.0138, |
| "reward": 8.181223034858704, |
| "reward_std": 0.7046910002827644, |
| "rewards/mrr_reward": 0.875, |
| "rewards/rank_analyze_format_reward": 0.6948948577046394, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.15625, |
| "epoch": 3.568, |
| "grad_norm": 0.03874967247247696, |
| "kl": 0.015522003173828125, |
| "learning_rate": 8.730841259649725e-07, |
| "loss": -0.0254, |
| "reward": 7.0217931270599365, |
| "reward_std": 0.8901334404945374, |
| "rewards/mrr_reward": 0.6117807626724243, |
| "rewards/rank_analyze_format_reward": 0.633263885974884, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.296875, |
| "epoch": 3.576, |
| "grad_norm": 0.03479482978582382, |
| "kl": 0.011865615844726562, |
| "learning_rate": 8.475882737908248e-07, |
| "loss": 0.0008, |
| "reward": 7.554774522781372, |
| "reward_std": 0.9437515586614609, |
| "rewards/mrr_reward": 0.7163194417953491, |
| "rewards/rank_analyze_format_reward": 0.7170282900333405, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0127108134329319, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.0625, |
| "epoch": 3.584, |
| "grad_norm": 0.03798670321702957, |
| "kl": 0.012453079223632812, |
| "learning_rate": 8.224537431601886e-07, |
| "loss": 0.0001, |
| "reward": 6.237062215805054, |
| "reward_std": 0.5429144222289324, |
| "rewards/mrr_reward": 0.3848772421479225, |
| "rewards/rank_analyze_format_reward": 0.7522407919168472, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.234375, |
| "epoch": 3.592, |
| "grad_norm": 0.036835283041000366, |
| "kl": 0.012708663940429688, |
| "learning_rate": 7.976815263412963e-07, |
| "loss": -0.0548, |
| "reward": 6.972110390663147, |
| "reward_std": 1.0418353527784348, |
| "rewards/mrr_reward": 0.6005208343267441, |
| "rewards/rank_analyze_format_reward": 0.6950270235538483, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.84375, |
| "epoch": 3.6, |
| "grad_norm": 0.03548385202884674, |
| "kl": 0.01288604736328125, |
| "learning_rate": 7.732726012988512e-07, |
| "loss": -0.0231, |
| "reward": 7.123287677764893, |
| "reward_std": 0.9100038930773735, |
| "rewards/mrr_reward": 0.6060701757669449, |
| "rewards/rank_analyze_format_reward": 0.749788224697113, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.828125, |
| "epoch": 3.608, |
| "grad_norm": 0.03627340868115425, |
| "kl": 0.011266708374023438, |
| "learning_rate": 7.492279316554207e-07, |
| "loss": -0.0177, |
| "reward": 7.946985602378845, |
| "reward_std": 0.7550379931926727, |
| "rewards/mrr_reward": 0.80078125, |
| "rewards/rank_analyze_format_reward": 0.7886674106121063, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.265625, |
| "epoch": 3.616, |
| "grad_norm": 0.04377419501543045, |
| "kl": 0.0131683349609375, |
| "learning_rate": 7.255484666533874e-07, |
| "loss": -0.0026, |
| "reward": 6.993055105209351, |
| "reward_std": 1.2321006208658218, |
| "rewards/mrr_reward": 0.6157738119363785, |
| "rewards/rank_analyze_format_reward": 0.705741174519062, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 572.984375, |
| "epoch": 3.624, |
| "grad_norm": 0.04109544679522514, |
| "kl": 0.01351165771484375, |
| "learning_rate": 7.022351411174866e-07, |
| "loss": 0.005, |
| "reward": 7.255419611930847, |
| "reward_std": 1.0537290424108505, |
| "rewards/mrr_reward": 0.6588541716337204, |
| "rewards/rank_analyze_format_reward": 0.7091151028871536, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.953125, |
| "epoch": 3.632, |
| "grad_norm": 0.03640174865722656, |
| "kl": 0.012472152709960938, |
| "learning_rate": 6.792888754178906e-07, |
| "loss": -0.0046, |
| "reward": 7.461974620819092, |
| "reward_std": 0.8970663994550705, |
| "rewards/mrr_reward": 0.688616082072258, |
| "rewards/rank_analyze_format_reward": 0.7289945930242538, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.234375, |
| "epoch": 3.64, |
| "grad_norm": 0.03923455998301506, |
| "kl": 0.0132598876953125, |
| "learning_rate": 6.567105754338798e-07, |
| "loss": -0.0055, |
| "reward": 7.089033126831055, |
| "reward_std": 1.0927991718053818, |
| "rewards/mrr_reward": 0.6148189604282379, |
| "rewards/rank_analyze_format_reward": 0.7315036952495575, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.013373362831771374, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.09375, |
| "epoch": 3.648, |
| "grad_norm": 0.03714577481150627, |
| "kl": 0.012269973754882812, |
| "learning_rate": 6.345011325180772e-07, |
| "loss": -0.006, |
| "reward": 6.888863801956177, |
| "reward_std": 0.7847508117556572, |
| "rewards/mrr_reward": 0.5322854816913605, |
| "rewards/rank_analyze_format_reward": 0.8160540610551834, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 576.171875, |
| "epoch": 3.656, |
| "grad_norm": 0.03968933969736099, |
| "kl": 0.014535903930664062, |
| "learning_rate": 6.126614234612593e-07, |
| "loss": -0.0031, |
| "reward": 6.883362054824829, |
| "reward_std": 1.1338584274053574, |
| "rewards/mrr_reward": 0.5649925693869591, |
| "rewards/rank_analyze_format_reward": 0.7210482209920883, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.625, |
| "epoch": 3.664, |
| "grad_norm": 0.0411507710814476, |
| "kl": 0.014348983764648438, |
| "learning_rate": 5.911923104577455e-07, |
| "loss": -0.017, |
| "reward": 6.8970195055007935, |
| "reward_std": 0.7303311824798584, |
| "rewards/mrr_reward": 0.544177807867527, |
| "rewards/rank_analyze_format_reward": 0.7589471489191055, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.109375, |
| "epoch": 3.672, |
| "grad_norm": 0.03855804726481438, |
| "kl": 0.013666152954101562, |
| "learning_rate": 5.700946410713548e-07, |
| "loss": 0.0051, |
| "reward": 7.619644403457642, |
| "reward_std": 0.8956380970776081, |
| "rewards/mrr_reward": 0.7256944477558136, |
| "rewards/rank_analyze_format_reward": 0.7556206434965134, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.765625, |
| "epoch": 3.68, |
| "grad_norm": 0.04349582642316818, |
| "kl": 0.011423110961914062, |
| "learning_rate": 5.49369248201953e-07, |
| "loss": 0.0175, |
| "reward": 6.194160223007202, |
| "reward_std": 0.9866833090782166, |
| "rewards/mrr_reward": 0.3742373511195183, |
| "rewards/rank_analyze_format_reward": 0.8241638392210007, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.609375, |
| "epoch": 3.6879999999999997, |
| "grad_norm": 0.035147525370121, |
| "kl": 0.011508941650390625, |
| "learning_rate": 5.290169500525577e-07, |
| "loss": -0.0124, |
| "reward": 7.419980049133301, |
| "reward_std": 0.46903695818036795, |
| "rewards/mrr_reward": 0.7058593779802322, |
| "rewards/rank_analyze_format_reward": 0.5965423956513405, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.328125, |
| "epoch": 3.6959999999999997, |
| "grad_norm": 0.038031551986932755, |
| "kl": 0.0143585205078125, |
| "learning_rate": 5.090385500970551e-07, |
| "loss": -0.0282, |
| "reward": 7.524974226951599, |
| "reward_std": 0.7495295517146587, |
| "rewards/mrr_reward": 0.7476562410593033, |
| "rewards/rank_analyze_format_reward": 0.616380512714386, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.5625, |
| "epoch": 3.7039999999999997, |
| "grad_norm": 0.03687411919236183, |
| "kl": 0.01457977294921875, |
| "learning_rate": 4.894348370484648e-07, |
| "loss": -0.0306, |
| "reward": 6.9998191595077515, |
| "reward_std": 1.1533474028110504, |
| "rewards/mrr_reward": 0.5960689634084702, |
| "rewards/rank_analyze_format_reward": 0.72789466381073, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9828869104385376, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.578125, |
| "epoch": 3.7119999999999997, |
| "grad_norm": 0.035118553787469864, |
| "kl": 0.010381698608398438, |
| "learning_rate": 4.702065848278126e-07, |
| "loss": 0.001, |
| "reward": 7.487505078315735, |
| "reward_std": 1.193233162164688, |
| "rewards/mrr_reward": 0.7095052152872086, |
| "rewards/rank_analyze_format_reward": 0.7606973052024841, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.640625, |
| "epoch": 3.7199999999999998, |
| "grad_norm": 0.039265792816877365, |
| "kl": 0.012542724609375, |
| "learning_rate": 4.5135455253357053e-07, |
| "loss": -0.02, |
| "reward": 6.985211730003357, |
| "reward_std": 0.8345009088516235, |
| "rewards/mrr_reward": 0.5719804167747498, |
| "rewards/rank_analyze_format_reward": 0.7383057624101639, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.15625, |
| "epoch": 3.7279999999999998, |
| "grad_norm": 0.03676972910761833, |
| "kl": 0.013540267944335938, |
| "learning_rate": 4.3287948441169457e-07, |
| "loss": -0.0256, |
| "reward": 7.533303260803223, |
| "reward_std": 0.6752141863107681, |
| "rewards/mrr_reward": 0.7332217246294022, |
| "rewards/rank_analyze_format_reward": 0.6644462794065475, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.875, |
| "epoch": 3.7359999999999998, |
| "grad_norm": 0.03843839094042778, |
| "kl": 0.012653350830078125, |
| "learning_rate": 4.1478210982624055e-07, |
| "loss": -0.0137, |
| "reward": 7.101514220237732, |
| "reward_std": 0.712131037376821, |
| "rewards/mrr_reward": 0.6059895902872086, |
| "rewards/rank_analyze_format_reward": 0.7317783385515213, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.5, |
| "epoch": 3.7439999999999998, |
| "grad_norm": 0.038445886224508286, |
| "kl": 0.0115203857421875, |
| "learning_rate": 3.9706314323056936e-07, |
| "loss": -0.001, |
| "reward": 7.2100324630737305, |
| "reward_std": 0.8441106081008911, |
| "rewards/mrr_reward": 0.6200706958770752, |
| "rewards/rank_analyze_format_reward": 0.8110490888357162, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.046875, |
| "epoch": 3.752, |
| "grad_norm": 0.0397077351808548, |
| "kl": 0.012897491455078125, |
| "learning_rate": 3.7972328413914074e-07, |
| "loss": -0.0162, |
| "reward": 7.725077509880066, |
| "reward_std": 1.1218221932649612, |
| "rewards/mrr_reward": 0.7771391421556473, |
| "rewards/rank_analyze_format_reward": 0.6673022508621216, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.859375, |
| "epoch": 3.76, |
| "grad_norm": 0.035067442804574966, |
| "kl": 0.01100921630859375, |
| "learning_rate": 3.627632170999029e-07, |
| "loss": -0.0112, |
| "reward": 7.772274732589722, |
| "reward_std": 0.4194560647010803, |
| "rewards/mrr_reward": 0.7544270902872086, |
| "rewards/rank_analyze_format_reward": 0.75864277780056, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9979619532823563, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9979619532823563, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.296875, |
| "epoch": 3.768, |
| "grad_norm": 0.03988838940858841, |
| "kl": 0.013881683349609375, |
| "learning_rate": 3.4618361166726123e-07, |
| "loss": 0.0089, |
| "reward": 6.951045513153076, |
| "reward_std": 0.9670315980911255, |
| "rewards/mrr_reward": 0.5557911768555641, |
| "rewards/rank_analyze_format_reward": 0.811864972114563, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.359375, |
| "epoch": 3.776, |
| "grad_norm": 0.04017311707139015, |
| "kl": 0.012922286987304688, |
| "learning_rate": 3.2998512237565005e-07, |
| "loss": 0.0021, |
| "reward": 6.655686378479004, |
| "reward_std": 0.8838780298829079, |
| "rewards/mrr_reward": 0.49754463881254196, |
| "rewards/rank_analyze_format_reward": 0.7454710304737091, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.546875, |
| "epoch": 3.784, |
| "grad_norm": 0.04061814397573471, |
| "kl": 0.015371322631835938, |
| "learning_rate": 3.1416838871368925e-07, |
| "loss": 0.0023, |
| "reward": 6.667187333106995, |
| "reward_std": 1.6450905501842499, |
| "rewards/mrr_reward": 0.546354167163372, |
| "rewards/rank_analyze_format_reward": 0.6706438362598419, |
| "rewards/rank_answer_foramt_reward": 0.822265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.578125, |
| "epoch": 3.792, |
| "grad_norm": 0.03757631406188011, |
| "kl": 0.011842727661132812, |
| "learning_rate": 2.987340350989421e-07, |
| "loss": -0.0277, |
| "reward": 7.062578439712524, |
| "reward_std": 0.7171650826931, |
| "rewards/mrr_reward": 0.5821800529956818, |
| "rewards/rank_analyze_format_reward": 0.8331592828035355, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.390625, |
| "epoch": 3.8, |
| "grad_norm": 0.0385047122836113, |
| "kl": 0.013031005859375, |
| "learning_rate": 2.836826708532603e-07, |
| "loss": -0.0071, |
| "reward": 6.774095058441162, |
| "reward_std": 1.0064187571406364, |
| "rewards/mrr_reward": 0.5209139287471771, |
| "rewards/rank_analyze_format_reward": 0.80762679874897, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.0, |
| "epoch": 3.808, |
| "grad_norm": 0.03585473448038101, |
| "kl": 0.013561248779296875, |
| "learning_rate": 2.6901489017873375e-07, |
| "loss": 0.0157, |
| "reward": 7.464797139167786, |
| "reward_std": 0.975187674164772, |
| "rewards/mrr_reward": 0.7072048783302307, |
| "rewards/rank_analyze_format_reward": 0.7355870008468628, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.921875, |
| "epoch": 3.816, |
| "grad_norm": 0.039436955004930496, |
| "kl": 0.013492584228515625, |
| "learning_rate": 2.547312721342277e-07, |
| "loss": -0.0445, |
| "reward": 6.666959643363953, |
| "reward_std": 0.9370896592736244, |
| "rewards/mrr_reward": 0.5154203921556473, |
| "rewards/rank_analyze_format_reward": 0.6805618405342102, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.997514471411705, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.997514471411705, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.046875, |
| "epoch": 3.824, |
| "grad_norm": 0.03860106319189072, |
| "kl": 0.013734817504882812, |
| "learning_rate": 2.4083238061252565e-07, |
| "loss": 0.0084, |
| "reward": 6.7049055099487305, |
| "reward_std": 0.43066950887441635, |
| "rewards/mrr_reward": 0.4952381029725075, |
| "rewards/rank_analyze_format_reward": 0.7649687975645065, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.34375, |
| "epoch": 3.832, |
| "grad_norm": 0.041024912148714066, |
| "kl": 0.012483596801757812, |
| "learning_rate": 2.273187643180652e-07, |
| "loss": -0.023, |
| "reward": 6.57793653011322, |
| "reward_std": 1.0732970535755157, |
| "rewards/mrr_reward": 0.4735739082098007, |
| "rewards/rank_analyze_format_reward": 0.7500470578670502, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.578125, |
| "epoch": 3.84, |
| "grad_norm": 0.034815359860658646, |
| "kl": 0.011325836181640625, |
| "learning_rate": 2.1419095674527934e-07, |
| "loss": 0.0153, |
| "reward": 6.932149052619934, |
| "reward_std": 1.0536329746246338, |
| "rewards/mrr_reward": 0.5726066380739212, |
| "rewards/rank_analyze_format_reward": 0.731004387140274, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9953981041908264, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9953981041908264, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.40625, |
| "epoch": 3.848, |
| "grad_norm": 0.036056023091077805, |
| "kl": 0.01219940185546875, |
| "learning_rate": 2.014494761575314e-07, |
| "loss": -0.0102, |
| "reward": 7.459859132766724, |
| "reward_std": 0.681392565369606, |
| "rewards/mrr_reward": 0.6882998645305634, |
| "rewards/rank_analyze_format_reward": 0.7418159544467926, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.6875, |
| "epoch": 3.856, |
| "grad_norm": 0.038142506033182144, |
| "kl": 0.011157989501953125, |
| "learning_rate": 1.8909482556666026e-07, |
| "loss": -0.0052, |
| "reward": 6.9656277894973755, |
| "reward_std": 1.1684068441390991, |
| "rewards/mrr_reward": 0.5538752377033234, |
| "rewards/rank_analyze_format_reward": 0.8277124911546707, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9953869134187698, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9953869134187698, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.96875, |
| "epoch": 3.864, |
| "grad_norm": 0.03848228603601456, |
| "kl": 0.01366424560546875, |
| "learning_rate": 1.7712749271311392e-07, |
| "loss": -0.0115, |
| "reward": 7.90727972984314, |
| "reward_std": 0.9489937871694565, |
| "rewards/mrr_reward": 0.8121279776096344, |
| "rewards/rank_analyze_format_reward": 0.7349397465586662, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.140625, |
| "epoch": 3.872, |
| "grad_norm": 0.03674139454960823, |
| "kl": 0.01291656494140625, |
| "learning_rate": 1.6554795004670389e-07, |
| "loss": -0.0121, |
| "reward": 7.151305317878723, |
| "reward_std": 1.3574425652623177, |
| "rewards/mrr_reward": 0.6450272798538208, |
| "rewards/rank_analyze_format_reward": 0.7020555436611176, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.875, |
| "epoch": 3.88, |
| "grad_norm": 0.03965899720788002, |
| "kl": 0.012401580810546875, |
| "learning_rate": 1.543566547079467e-07, |
| "loss": -0.0167, |
| "reward": 6.482243657112122, |
| "reward_std": 0.920079916715622, |
| "rewards/mrr_reward": 0.4476686418056488, |
| "rewards/rank_analyze_format_reward": 0.7362185418605804, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.013944223523139954, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.71875, |
| "epoch": 3.888, |
| "grad_norm": 0.038730841130018234, |
| "kl": 0.015628814697265625, |
| "learning_rate": 1.4355404851001953e-07, |
| "loss": -0.0013, |
| "reward": 7.664029955863953, |
| "reward_std": 0.8843671232461929, |
| "rewards/mrr_reward": 0.7207217365503311, |
| "rewards/rank_analyze_format_reward": 0.8122782856225967, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 584.703125, |
| "epoch": 3.896, |
| "grad_norm": 0.03808212652802467, |
| "kl": 0.013837814331054688, |
| "learning_rate": 1.3314055792131964e-07, |
| "loss": 0.0003, |
| "reward": 7.29535174369812, |
| "reward_std": 0.6776691898703575, |
| "rewards/mrr_reward": 0.6445312649011612, |
| "rewards/rank_analyze_format_reward": 0.7601954787969589, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.71875, |
| "epoch": 3.904, |
| "grad_norm": 0.04019862040877342, |
| "kl": 0.012102127075195312, |
| "learning_rate": 1.231165940486234e-07, |
| "loss": 0.0098, |
| "reward": 7.2379196882247925, |
| "reward_std": 1.0804044008255005, |
| "rewards/mrr_reward": 0.6402343809604645, |
| "rewards/rank_analyze_format_reward": 0.7469863891601562, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.21875, |
| "epoch": 3.912, |
| "grad_norm": 0.04155530408024788, |
| "kl": 0.014692306518554688, |
| "learning_rate": 1.134825526208605e-07, |
| "loss": -0.0172, |
| "reward": 6.462707042694092, |
| "reward_std": 1.3991620540618896, |
| "rewards/mrr_reward": 0.4669705033302307, |
| "rewards/rank_analyze_format_reward": 0.7217782437801361, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.140625, |
| "epoch": 3.92, |
| "grad_norm": 0.038728028535842896, |
| "kl": 0.011913299560546875, |
| "learning_rate": 1.0423881397349067e-07, |
| "loss": -0.0274, |
| "reward": 6.584546685218811, |
| "reward_std": 0.987204298377037, |
| "rewards/mrr_reward": 0.47931547462940216, |
| "rewards/rank_analyze_format_reward": 0.7864252328872681, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.890625, |
| "epoch": 3.928, |
| "grad_norm": 0.037255771458148956, |
| "kl": 0.012048721313476562, |
| "learning_rate": 9.538574303348813e-08, |
| "loss": -0.0003, |
| "reward": 7.248134255409241, |
| "reward_std": 0.609087161719799, |
| "rewards/mrr_reward": 0.6106956899166107, |
| "rewards/rank_analyze_format_reward": 0.8346483111381531, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 607.09375, |
| "epoch": 3.936, |
| "grad_norm": 0.03653496131300926, |
| "kl": 0.014324188232421875, |
| "learning_rate": 8.692368930493522e-08, |
| "loss": -0.0078, |
| "reward": 7.079232692718506, |
| "reward_std": 0.7762657403945923, |
| "rewards/mrr_reward": 0.5865451470017433, |
| "rewards/rank_analyze_format_reward": 0.7700468897819519, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.9375, |
| "epoch": 3.944, |
| "grad_norm": 0.03887813538312912, |
| "kl": 0.01277923583984375, |
| "learning_rate": 7.885298685522235e-08, |
| "loss": 0.0019, |
| "reward": 6.853779196739197, |
| "reward_std": 0.5883737578988075, |
| "rewards/mrr_reward": 0.5194568485021591, |
| "rewards/rank_analyze_format_reward": 0.8206439018249512, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.890625, |
| "epoch": 3.952, |
| "grad_norm": 0.03760422766208649, |
| "kl": 0.012228012084960938, |
| "learning_rate": 7.117395430186414e-08, |
| "loss": -0.0105, |
| "reward": 7.296409845352173, |
| "reward_std": 1.1353522688150406, |
| "rewards/mrr_reward": 0.6470052301883698, |
| "rewards/rank_analyze_format_reward": 0.7630764245986938, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.453125, |
| "epoch": 3.96, |
| "grad_norm": 0.04088395833969116, |
| "kl": 0.011974334716796875, |
| "learning_rate": 6.388689479991606e-08, |
| "loss": -0.014, |
| "reward": 6.792389988899231, |
| "reward_std": 0.9225097447633743, |
| "rewards/mrr_reward": 0.5218750275671482, |
| "rewards/rank_analyze_format_reward": 0.7651283890008926, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 586.046875, |
| "epoch": 3.968, |
| "grad_norm": 0.038039859384298325, |
| "kl": 0.012334823608398438, |
| "learning_rate": 5.699209603001077e-08, |
| "loss": -0.0243, |
| "reward": 7.0821181535720825, |
| "reward_std": 1.2305900156497955, |
| "rewards/mrr_reward": 0.6263020932674408, |
| "rewards/rank_analyze_format_reward": 0.676519088447094, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.65625, |
| "epoch": 3.976, |
| "grad_norm": 0.042080074548721313, |
| "kl": 0.012048721313476562, |
| "learning_rate": 5.048983018699827e-08, |
| "loss": 0.0112, |
| "reward": 7.104416489601135, |
| "reward_std": 1.2093525528907776, |
| "rewards/mrr_reward": 0.5863157212734222, |
| "rewards/rank_analyze_format_reward": 0.7825910001993179, |
| "rewards/rank_answer_foramt_reward": 0.984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.734375, |
| "epoch": 3.984, |
| "grad_norm": 0.038315799087285995, |
| "kl": 0.010751724243164062, |
| "learning_rate": 4.438035396920004e-08, |
| "loss": -0.0141, |
| "reward": 6.625136733055115, |
| "reward_std": 0.9687140211462975, |
| "rewards/mrr_reward": 0.4734809100627899, |
| "rewards/rank_analyze_format_reward": 0.7722287178039551, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.96875, |
| "epoch": 3.992, |
| "grad_norm": 0.03733893856406212, |
| "kl": 0.01275634765625, |
| "learning_rate": 3.866390856827495e-08, |
| "loss": -0.0313, |
| "reward": 7.275609493255615, |
| "reward_std": 0.918476015329361, |
| "rewards/mrr_reward": 0.6309213787317276, |
| "rewards/rank_analyze_format_reward": 0.8395065367221832, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.34375, |
| "epoch": 4.0, |
| "grad_norm": 0.03825841844081879, |
| "kl": 0.015573501586914062, |
| "learning_rate": 3.3340719659701315e-08, |
| "loss": -0.0178, |
| "reward": 7.2012619972229, |
| "reward_std": 0.7794432565569878, |
| "rewards/mrr_reward": 0.6161458268761635, |
| "rewards/rank_analyze_format_reward": 0.795157715678215, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.0, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": -0.0018289514125790446, |
| "train_runtime": 36870.3642, |
| "train_samples_per_second": 0.868, |
| "train_steps_per_second": 0.014 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|