jnian's picture
Model save
e0c00cc verified
raw
history blame
365 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 455.8125,
"epoch": 0.008,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0618,
"reward": 4.689006567001343,
"reward_std": 1.78610560297966,
"rewards/mrr_reward": 0.2938988097012043,
"rewards/rank_analyze_format_reward": 0.11466514505445957,
"rewards/rank_answer_foramt_reward": 0.501953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 486.734375,
"epoch": 0.016,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": 0.0063,
"reward": 3.9557588696479797,
"reward_std": 1.5732559561729431,
"rewards/mrr_reward": 0.169766865670681,
"rewards/rank_analyze_format_reward": 0.07681952975690365,
"rewards/rank_answer_foramt_reward": 0.36328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9807952791452408,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9807952791452408,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 441.203125,
"epoch": 0.024,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0565,
"reward": 4.4016576409339905,
"reward_std": 1.6944840550422668,
"rewards/mrr_reward": 0.2554253488779068,
"rewards/rank_analyze_format_reward": 0.11000172607600689,
"rewards/rank_answer_foramt_reward": 0.4140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 473.03125,
"epoch": 0.032,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0361,
"reward": 5.027822136878967,
"reward_std": 1.9866893887519836,
"rewards/mrr_reward": 0.3529265820980072,
"rewards/rank_analyze_format_reward": 0.15089312940835953,
"rewards/rank_answer_foramt_reward": 0.5546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998236283659935,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.998236283659935,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 488.078125,
"epoch": 0.04,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0246,
"reward": 4.68894362449646,
"reward_std": 1.7762902677059174,
"rewards/mrr_reward": 0.3031250014901161,
"rewards/rank_analyze_format_reward": 0.2197269294410944,
"rewards/rank_answer_foramt_reward": 0.474609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9808974117040634,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.9808974117040634,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 476.78125,
"epoch": 0.048,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0422,
"reward": 4.237152338027954,
"reward_std": 1.5055316388607025,
"rewards/mrr_reward": 0.19459325820207596,
"rewards/rank_analyze_format_reward": 0.14743656385689974,
"rewards/rank_answer_foramt_reward": 0.46875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9994212985038757,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9837962985038757,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 456.6875,
"epoch": 0.056,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.023,
"reward": 4.391666889190674,
"reward_std": 1.7478849291801453,
"rewards/mrr_reward": 0.27126736007630825,
"rewards/rank_analyze_format_reward": 0.13924695551395416,
"rewards/rank_answer_foramt_reward": 0.4375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977376908063889,
"rewards/rank_overall_format_reward_more": 0.78125,
"rewards/rank_verify_format_reward": 0.9508626908063889,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 438.890625,
"epoch": 0.064,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0161,
"reward": 4.737706661224365,
"reward_std": 1.8800698816776276,
"rewards/mrr_reward": 0.328125,
"rewards/rank_analyze_format_reward": 0.08576204627752304,
"rewards/rank_answer_foramt_reward": 0.484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.997436136007309,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 440.546875,
"epoch": 0.072,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.026,
"reward": 4.544053554534912,
"reward_std": 2.140646994113922,
"rewards/mrr_reward": 0.3108258917927742,
"rewards/rank_analyze_format_reward": 0.09371883049607277,
"rewards/rank_answer_foramt_reward": 0.44140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8125,
"rewards/rank_verify_format_reward": 0.953125,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 486.078125,
"epoch": 0.08,
"grad_norm": 0.02036167122423649,
"kl": 0.0,
"learning_rate": 1.9999999684172664e-05,
"loss": -0.0462,
"reward": 4.728065490722656,
"reward_std": 1.9379011690616608,
"rewards/mrr_reward": 0.3036644458770752,
"rewards/rank_analyze_format_reward": 0.2416149042546749,
"rewards/rank_answer_foramt_reward": 0.4921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.828125,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 478.84375,
"epoch": 0.088,
"grad_norm": 0.02036167122423649,
"kl": -5.602836608886719e-06,
"learning_rate": 1.9999999684172664e-05,
"loss": -0.0299,
"reward": 4.586392045021057,
"reward_std": 1.808391511440277,
"rewards/mrr_reward": 0.26946303993463516,
"rewards/rank_analyze_format_reward": 0.178153439424932,
"rewards/rank_answer_foramt_reward": 0.52734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 455.078125,
"epoch": 0.096,
"grad_norm": 0.020598648115992546,
"kl": -6.273388862609863e-06,
"learning_rate": 1.9999998736690666e-05,
"loss": -0.019,
"reward": 4.161486208438873,
"reward_std": 1.7841115891933441,
"rewards/mrr_reward": 0.21319444477558136,
"rewards/rank_analyze_format_reward": 0.09823539853096008,
"rewards/rank_answer_foramt_reward": 0.396484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968380630016327,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9812130630016327,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 459.578125,
"epoch": 0.104,
"grad_norm": 0.021508827805519104,
"kl": -4.723668098449707e-06,
"learning_rate": 1.999999715755407e-05,
"loss": -0.0384,
"reward": 4.695295810699463,
"reward_std": 1.5369611978530884,
"rewards/mrr_reward": 0.3081597238779068,
"rewards/rank_analyze_format_reward": 0.11086451821029186,
"rewards/rank_answer_foramt_reward": 0.48046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 464.015625,
"epoch": 0.112,
"grad_norm": 0.022063156589865685,
"kl": -5.081295967102051e-06,
"learning_rate": 1.9999994946762974e-05,
"loss": -0.0454,
"reward": 4.200581610202789,
"reward_std": 1.8469471633434296,
"rewards/mrr_reward": 0.20451389625668526,
"rewards/rank_analyze_format_reward": 0.16768221091479063,
"rewards/rank_answer_foramt_reward": 0.41015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.984375,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 452.5,
"epoch": 0.12,
"grad_norm": 0.022208023816347122,
"kl": -4.26173210144043e-06,
"learning_rate": 1.999999210431752e-05,
"loss": -0.0243,
"reward": 4.085702300071716,
"reward_std": 1.512882336974144,
"rewards/mrr_reward": 0.17906746454536915,
"rewards/rank_analyze_format_reward": 0.14385094121098518,
"rewards/rank_answer_foramt_reward": 0.4375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9956032931804657,
"rewards/rank_overall_format_reward_more": 0.796875,
"rewards/rank_verify_format_reward": 0.9956032931804657,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 467.296875,
"epoch": 0.128,
"grad_norm": 0.020399712026119232,
"kl": -4.0084123611450195e-06,
"learning_rate": 1.9999988630217885e-05,
"loss": -0.0316,
"reward": 5.109304070472717,
"reward_std": 1.985443890094757,
"rewards/mrr_reward": 0.3867187425494194,
"rewards/rank_analyze_format_reward": 0.17149577103555202,
"rewards/rank_answer_foramt_reward": 0.580078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8125,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 473.90625,
"epoch": 0.136,
"grad_norm": 0.02226601168513298,
"kl": -2.7865171432495117e-06,
"learning_rate": 1.999998452446429e-05,
"loss": -0.032,
"reward": 4.289996266365051,
"reward_std": 1.757462590932846,
"rewards/mrr_reward": 0.2313119969330728,
"rewards/rank_analyze_format_reward": 0.15154925920069218,
"rewards/rank_answer_foramt_reward": 0.44140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 486.328125,
"epoch": 0.144,
"grad_norm": 0.02119840867817402,
"kl": -2.086162567138672e-07,
"learning_rate": 1.9999979787056998e-05,
"loss": -0.0259,
"reward": 4.4557565450668335,
"reward_std": 1.1966679394245148,
"rewards/mrr_reward": 0.22187501564621925,
"rewards/rank_analyze_format_reward": 0.1767062321305275,
"rewards/rank_answer_foramt_reward": 0.521484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975329041481018,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9975329041481018,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 446.53125,
"epoch": 0.152,
"grad_norm": 0.02239903435111046,
"kl": -2.2351741790771484e-07,
"learning_rate": 1.9999974417996303e-05,
"loss": -0.0161,
"reward": 4.088248610496521,
"reward_std": 1.54827019572258,
"rewards/mrr_reward": 0.18916791677474976,
"rewards/rank_analyze_format_reward": 0.09350559022277594,
"rewards/rank_answer_foramt_reward": 0.42578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959887713193893,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9803637713193893,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 467.40625,
"epoch": 0.16,
"grad_norm": 0.02158363349735737,
"kl": 2.995133399963379e-06,
"learning_rate": 1.9999968417282542e-05,
"loss": -0.0394,
"reward": 5.011839747428894,
"reward_std": 1.7887286245822906,
"rewards/mrr_reward": 0.35902776941657066,
"rewards/rank_analyze_format_reward": 0.1226036436855793,
"rewards/rank_answer_foramt_reward": 0.59375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 1.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 512.265625,
"epoch": 0.168,
"grad_norm": 0.019902769476175308,
"kl": 2.086162567138672e-06,
"learning_rate": 1.99999617849161e-05,
"loss": -0.007,
"reward": 4.999041318893433,
"reward_std": 2.092874825000763,
"rewards/mrr_reward": 0.33280009776353836,
"rewards/rank_analyze_format_reward": 0.3474135100841522,
"rewards/rank_answer_foramt_reward": 0.564453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9678308814764023,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 462.078125,
"epoch": 0.176,
"grad_norm": 0.0211955476552248,
"kl": 6.16908073425293e-06,
"learning_rate": 1.9999954520897394e-05,
"loss": 0.0067,
"reward": 4.904757022857666,
"reward_std": 1.5794726610183716,
"rewards/mrr_reward": 0.35468750447034836,
"rewards/rank_analyze_format_reward": 0.09507373627275229,
"rewards/rank_answer_foramt_reward": 0.611328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8125,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 466.828125,
"epoch": 0.184,
"grad_norm": 0.020976468920707703,
"kl": 6.943941116333008e-06,
"learning_rate": 1.999994662522688e-05,
"loss": -0.0219,
"reward": 5.450310587882996,
"reward_std": 1.9311817586421967,
"rewards/mrr_reward": 0.44487228244543076,
"rewards/rank_analyze_format_reward": 0.20164816547185183,
"rewards/rank_answer_foramt_reward": 0.66796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9670085161924362,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 467.34375,
"epoch": 0.192,
"grad_norm": 0.0208530742675066,
"kl": 1.093745231628418e-05,
"learning_rate": 1.9999938097905064e-05,
"loss": -0.0345,
"reward": 4.764381527900696,
"reward_std": 1.8450036644935608,
"rewards/mrr_reward": 0.31850818172097206,
"rewards/rank_analyze_format_reward": 0.1051585366949439,
"rewards/rank_answer_foramt_reward": 0.546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9972826093435287,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9816576093435287,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 477.484375,
"epoch": 0.2,
"grad_norm": 0.02045305259525776,
"kl": 1.1593103408813477e-05,
"learning_rate": 1.9999928938932473e-05,
"loss": -0.0176,
"reward": 4.7958372831344604,
"reward_std": 1.7617928981781006,
"rewards/mrr_reward": 0.2876054085791111,
"rewards/rank_analyze_format_reward": 0.28027439024299383,
"rewards/rank_answer_foramt_reward": 0.48828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9970238208770752,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9970238208770752,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 445.140625,
"epoch": 0.208,
"grad_norm": 0.02045305259525776,
"kl": 1.919269561767578e-05,
"learning_rate": 1.9999928938932473e-05,
"loss": -0.002,
"reward": 4.298715710639954,
"reward_std": 1.676234632730484,
"rewards/mrr_reward": 0.2303757481276989,
"rewards/rank_analyze_format_reward": 0.11518567334860563,
"rewards/rank_answer_foramt_reward": 0.458984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9679276347160339,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 483.71875,
"epoch": 0.216,
"grad_norm": 0.02045305259525776,
"kl": 1.0028481483459473e-05,
"learning_rate": 1.9999928938932473e-05,
"loss": -0.0122,
"reward": 4.261886656284332,
"reward_std": 1.7420227527618408,
"rewards/mrr_reward": 0.19882812350988388,
"rewards/rank_analyze_format_reward": 0.19866678677499294,
"rewards/rank_answer_foramt_reward": 0.44921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9837500005960464,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9990011900663376,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 494.4375,
"epoch": 0.224,
"grad_norm": 0.020556651055812836,
"kl": 1.0758638381958008e-05,
"learning_rate": 1.99999191483097e-05,
"loss": -0.0292,
"reward": 4.516226172447205,
"reward_std": 1.9960070848464966,
"rewards/mrr_reward": 0.28723958507180214,
"rewards/rank_analyze_format_reward": 0.14337314292788506,
"rewards/rank_answer_foramt_reward": 0.470703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9820645451545715,
"rewards/rank_overall_format_reward_more": 0.7890625,
"rewards/rank_verify_format_reward": 0.9820645451545715,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 477.453125,
"epoch": 0.232,
"grad_norm": 0.019716233015060425,
"kl": 1.9982457160949707e-05,
"learning_rate": 1.999990872603735e-05,
"loss": -0.017,
"reward": 4.805420398712158,
"reward_std": 1.657298356294632,
"rewards/mrr_reward": 0.3268229216337204,
"rewards/rank_analyze_format_reward": 0.14590902999043465,
"rewards/rank_answer_foramt_reward": 0.490234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9934926480054855,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9934926480054855,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 492.375,
"epoch": 0.24,
"grad_norm": 0.02322639897465706,
"kl": 1.965463161468506e-05,
"learning_rate": 1.999989767211609e-05,
"loss": -0.0386,
"reward": 4.979418992996216,
"reward_std": 1.6339992135763168,
"rewards/mrr_reward": 0.3164062537252903,
"rewards/rank_analyze_format_reward": 0.31370767019689083,
"rewards/rank_answer_foramt_reward": 0.51953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9988712668418884,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9988712668418884,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 464.875,
"epoch": 0.248,
"grad_norm": 0.020231744274497032,
"kl": 2.7373433113098145e-05,
"learning_rate": 1.9999885986546613e-05,
"loss": -0.0448,
"reward": 4.7086580991744995,
"reward_std": 1.7371686697006226,
"rewards/mrr_reward": 0.2849392406642437,
"rewards/rank_analyze_format_reward": 0.16265114955604076,
"rewards/rank_answer_foramt_reward": 0.4921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 487.484375,
"epoch": 0.256,
"grad_norm": 0.01962853968143463,
"kl": 3.975629806518555e-05,
"learning_rate": 1.999987366932966e-05,
"loss": -0.0411,
"reward": 4.679190993309021,
"reward_std": 1.5342676639556885,
"rewards/mrr_reward": 0.27297867834568024,
"rewards/rank_analyze_format_reward": 0.16313984990119934,
"rewards/rank_answer_foramt_reward": 0.56640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 454.515625,
"epoch": 0.264,
"grad_norm": 0.02267865277826786,
"kl": 3.771483898162842e-05,
"learning_rate": 1.9999860720466007e-05,
"loss": -0.0034,
"reward": 4.1931135058403015,
"reward_std": 1.5233525335788727,
"rewards/mrr_reward": 0.19470486417412758,
"rewards/rank_analyze_format_reward": 0.10926186013966799,
"rewards/rank_answer_foramt_reward": 0.443359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9972426444292068,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.9972426444292068,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 461.953125,
"epoch": 0.272,
"grad_norm": 0.02176724746823311,
"kl": 5.410611629486084e-05,
"learning_rate": 1.9999847139956477e-05,
"loss": -0.0314,
"reward": 4.550845384597778,
"reward_std": 1.958255022764206,
"rewards/mrr_reward": 0.30027903243899345,
"rewards/rank_analyze_format_reward": 0.0568907568231225,
"rewards/rank_answer_foramt_reward": 0.529296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9677083343267441,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.9520833343267441,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 474.65625,
"epoch": 0.28,
"grad_norm": 0.0221868809312582,
"kl": 5.4582953453063965e-05,
"learning_rate": 1.9999832927801922e-05,
"loss": -0.0057,
"reward": 4.710769176483154,
"reward_std": 1.6857908964157104,
"rewards/mrr_reward": 0.3105034828186035,
"rewards/rank_analyze_format_reward": 0.1985221654176712,
"rewards/rank_answer_foramt_reward": 0.544921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9798430800437927,
"rewards/rank_overall_format_reward_more": 0.78125,
"rewards/rank_verify_format_reward": 0.9642180800437927,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 496.78125,
"epoch": 0.288,
"grad_norm": 0.02121078222990036,
"kl": 6.13182783126831e-05,
"learning_rate": 1.9999818084003243e-05,
"loss": -0.0368,
"reward": 5.009979605674744,
"reward_std": 1.9656108021736145,
"rewards/mrr_reward": 0.32831721380352974,
"rewards/rank_analyze_format_reward": 0.24603652395308018,
"rewards/rank_answer_foramt_reward": 0.58984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9811964929103851,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.9811964929103851,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 489.40625,
"epoch": 0.296,
"grad_norm": 0.022762347012758255,
"kl": 7.359683513641357e-05,
"learning_rate": 1.999980260856137e-05,
"loss": 0.0164,
"reward": 4.261849403381348,
"reward_std": 1.6020236611366272,
"rewards/mrr_reward": 0.20416666939854622,
"rewards/rank_analyze_format_reward": 0.16004161350429058,
"rewards/rank_answer_foramt_reward": 0.4375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9980392158031464,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.9980392158031464,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 473.59375,
"epoch": 0.304,
"grad_norm": 0.02300061471760273,
"kl": 6.565451622009277e-05,
"learning_rate": 1.9999786501477298e-05,
"loss": -0.0407,
"reward": 4.600297033786774,
"reward_std": 1.597813993692398,
"rewards/mrr_reward": 0.2838975712656975,
"rewards/rank_analyze_format_reward": 0.10455834865570068,
"rewards/rank_answer_foramt_reward": 0.50390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 468.828125,
"epoch": 0.312,
"grad_norm": 0.022656837478280067,
"kl": 9.85860824584961e-05,
"learning_rate": 1.9999769762752024e-05,
"loss": -0.0421,
"reward": 4.96368944644928,
"reward_std": 1.8781414777040482,
"rewards/mrr_reward": 0.33848586305975914,
"rewards/rank_analyze_format_reward": 0.16822483576834202,
"rewards/rank_answer_foramt_reward": 0.654296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 503.1875,
"epoch": 0.32,
"grad_norm": 0.023156002163887024,
"kl": 0.0001109689474105835,
"learning_rate": 1.999975239238662e-05,
"loss": -0.0188,
"reward": 5.189586162567139,
"reward_std": 2.028193384408951,
"rewards/mrr_reward": 0.36250000447034836,
"rewards/rank_analyze_format_reward": 0.3480729628354311,
"rewards/rank_answer_foramt_reward": 0.560546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.981889471411705,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.981889471411705,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 461.5625,
"epoch": 0.328,
"grad_norm": 0.021381191909313202,
"kl": 0.00012908875942230225,
"learning_rate": 1.999973439038218e-05,
"loss": -0.0281,
"reward": 4.959184765815735,
"reward_std": 2.1065359711647034,
"rewards/mrr_reward": 0.36927083879709244,
"rewards/rank_analyze_format_reward": 0.1428538914769888,
"rewards/rank_answer_foramt_reward": 0.48828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9918892979621887,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9762642979621887,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 490.0,
"epoch": 0.336,
"grad_norm": 0.02237197570502758,
"kl": 0.00011831521987915039,
"learning_rate": 1.9999715756739833e-05,
"loss": -0.0379,
"reward": 4.825831055641174,
"reward_std": 1.8668445944786072,
"rewards/mrr_reward": 0.3253224194049835,
"rewards/rank_analyze_format_reward": 0.163555265404284,
"rewards/rank_answer_foramt_reward": 0.568359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9900633096694946,
"rewards/rank_overall_format_reward_more": 0.828125,
"rewards/rank_verify_format_reward": 0.9744383096694946,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 473.34375,
"epoch": 0.344,
"grad_norm": 0.022024238482117653,
"kl": 0.00014644861221313477,
"learning_rate": 1.9999696491460764e-05,
"loss": -0.0215,
"reward": 4.890589237213135,
"reward_std": 1.6738486886024475,
"rewards/mrr_reward": 0.3286830335855484,
"rewards/rank_analyze_format_reward": 0.15786650124937296,
"rewards/rank_answer_foramt_reward": 0.56640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9961046874523163,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9961046874523163,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 467.234375,
"epoch": 0.352,
"grad_norm": 0.02275724522769451,
"kl": 0.00016573071479797363,
"learning_rate": 1.9999676594546187e-05,
"loss": -0.0215,
"reward": 5.033377289772034,
"reward_std": 1.8407581448554993,
"rewards/mrr_reward": 0.3557477742433548,
"rewards/rank_analyze_format_reward": 0.14718732610344887,
"rewards/rank_answer_foramt_reward": 0.59765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 487.625,
"epoch": 0.36,
"grad_norm": 0.023730719462037086,
"kl": 0.00015407800674438477,
"learning_rate": 1.999965606599736e-05,
"loss": -0.0031,
"reward": 5.316616773605347,
"reward_std": 1.5850826501846313,
"rewards/mrr_reward": 0.4290550574660301,
"rewards/rank_analyze_format_reward": 0.08148389589041471,
"rewards/rank_answer_foramt_reward": 0.697265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9928547292947769,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.9772297292947769,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 509.9375,
"epoch": 0.368,
"grad_norm": 0.021739846095442772,
"kl": 0.00018548965454101562,
"learning_rate": 1.999963490581558e-05,
"loss": -0.0254,
"reward": 5.217623829841614,
"reward_std": 1.4084790647029877,
"rewards/mrr_reward": 0.33927951753139496,
"rewards/rank_analyze_format_reward": 0.3564212815836072,
"rewards/rank_answer_foramt_reward": 0.6953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998135969042778,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.966885969042778,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 473.734375,
"epoch": 0.376,
"grad_norm": 0.023394783958792686,
"kl": 0.00021630525588989258,
"learning_rate": 1.9999613114002184e-05,
"loss": -0.0309,
"reward": 4.08813738822937,
"reward_std": 1.2790243327617645,
"rewards/mrr_reward": 0.14723462983965874,
"rewards/rank_analyze_format_reward": 0.15144313033670187,
"rewards/rank_answer_foramt_reward": 0.431640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9971200972795486,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9971200972795486,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 472.421875,
"epoch": 0.384,
"grad_norm": 0.027028290554881096,
"kl": 0.00026175379753112793,
"learning_rate": 1.9999590690558545e-05,
"loss": -0.054,
"reward": 5.350240349769592,
"reward_std": 1.9697438478469849,
"rewards/mrr_reward": 0.42695312947034836,
"rewards/rank_analyze_format_reward": 0.21482349652796984,
"rewards/rank_answer_foramt_reward": 0.625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9989583343267441,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9677083343267441,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 476.34375,
"epoch": 0.392,
"grad_norm": 0.021585488691926003,
"kl": 0.0002930760383605957,
"learning_rate": 1.9999567635486086e-05,
"loss": -0.0243,
"reward": 4.152051568031311,
"reward_std": 1.6824184954166412,
"rewards/mrr_reward": 0.18816964142024517,
"rewards/rank_analyze_format_reward": 0.12541021592915058,
"rewards/rank_answer_foramt_reward": 0.39453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983095824718475,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9983095824718475,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 483.484375,
"epoch": 0.4,
"grad_norm": 0.022128406912088394,
"kl": 0.00023129582405090332,
"learning_rate": 1.9999543948786258e-05,
"loss": -0.0018,
"reward": 4.990848183631897,
"reward_std": 1.9261715412139893,
"rewards/mrr_reward": 0.3342633992433548,
"rewards/rank_analyze_format_reward": 0.1260274900123477,
"rewards/rank_answer_foramt_reward": 0.609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 492.078125,
"epoch": 0.408,
"grad_norm": 0.023543158546090126,
"kl": 0.0002911984920501709,
"learning_rate": 1.9999519630460554e-05,
"loss": -0.0076,
"reward": 5.144826769828796,
"reward_std": 1.6632727682590485,
"rewards/mrr_reward": 0.3661458343267441,
"rewards/rank_analyze_format_reward": 0.16852473467588425,
"rewards/rank_answer_foramt_reward": 0.59765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 1.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 466.515625,
"epoch": 0.416,
"grad_norm": 0.024417538195848465,
"kl": 0.0004246234893798828,
"learning_rate": 1.999949468051052e-05,
"loss": -0.0313,
"reward": 5.0145174860954285,
"reward_std": 1.8828826546669006,
"rewards/mrr_reward": 0.38802083767950535,
"rewards/rank_analyze_format_reward": 0.10110596101731062,
"rewards/rank_answer_foramt_reward": 0.556640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.96875,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 494.75,
"epoch": 0.424,
"grad_norm": 0.024848150089383125,
"kl": 0.0002892911434173584,
"learning_rate": 1.9999469098937726e-05,
"loss": -0.0361,
"reward": 4.832870543003082,
"reward_std": 1.565253883600235,
"rewards/mrr_reward": 0.2958891298621893,
"rewards/rank_analyze_format_reward": 0.1942360121756792,
"rewards/rank_answer_foramt_reward": 0.611328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 504.5625,
"epoch": 0.432,
"grad_norm": 0.02211805246770382,
"kl": 0.00029155611991882324,
"learning_rate": 1.9999442885743785e-05,
"loss": -0.016,
"reward": 4.681830644607544,
"reward_std": 1.6615483164787292,
"rewards/mrr_reward": 0.28389756940305233,
"rewards/rank_analyze_format_reward": 0.1718399478122592,
"rewards/rank_answer_foramt_reward": 0.568359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9811454266309738,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9655204266309738,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 471.765625,
"epoch": 0.44,
"grad_norm": 0.02444814145565033,
"kl": 0.0004519224166870117,
"learning_rate": 1.9999416040930354e-05,
"loss": -0.0462,
"reward": 5.167219042778015,
"reward_std": 1.9449047446250916,
"rewards/mrr_reward": 0.3921875059604645,
"rewards/rank_analyze_format_reward": 0.1719050519168377,
"rewards/rank_answer_foramt_reward": 0.513671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9955085963010788,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9955085963010788,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 501.21875,
"epoch": 0.448,
"grad_norm": 0.024404721334576607,
"kl": 0.00047457218170166016,
"learning_rate": 1.9999388564499135e-05,
"loss": -0.047,
"reward": 5.111963272094727,
"reward_std": 1.9699311256408691,
"rewards/mrr_reward": 0.340104166418314,
"rewards/rank_analyze_format_reward": 0.30795731022953987,
"rewards/rank_answer_foramt_reward": 0.650390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9669117629528046,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9669117629528046,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 481.421875,
"epoch": 0.456,
"grad_norm": 0.024884849786758423,
"kl": 0.0005426406860351562,
"learning_rate": 1.999936045645186e-05,
"loss": -0.0116,
"reward": 4.459952890872955,
"reward_std": 1.6162844747304916,
"rewards/mrr_reward": 0.24435143917798996,
"rewards/rank_analyze_format_reward": 0.10262943152338266,
"rewards/rank_answer_foramt_reward": 0.5234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985526353120804,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.9673026353120804,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 476.625,
"epoch": 0.464,
"grad_norm": 0.02534506469964981,
"kl": 0.0007425546646118164,
"learning_rate": 1.9999331716790303e-05,
"loss": -0.0169,
"reward": 4.837222576141357,
"reward_std": 1.9827671647071838,
"rewards/mrr_reward": 0.33585068956017494,
"rewards/rank_analyze_format_reward": 0.20996354706585407,
"rewards/rank_answer_foramt_reward": 0.470703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9807952791452408,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.9807952791452408,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 530.046875,
"epoch": 0.472,
"grad_norm": 0.022839965298771858,
"kl": 0.0004405379295349121,
"learning_rate": 1.9999302345516278e-05,
"loss": -0.0295,
"reward": 5.279780864715576,
"reward_std": 1.9629344046115875,
"rewards/mrr_reward": 0.36336806416511536,
"rewards/rank_analyze_format_reward": 0.2832249477505684,
"rewards/rank_answer_foramt_reward": 0.654296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 516.65625,
"epoch": 0.48,
"grad_norm": 0.0263381227850914,
"kl": 0.0005091428756713867,
"learning_rate": 1.9999272342631644e-05,
"loss": -0.0381,
"reward": 6.471034526824951,
"reward_std": 1.9417240023612976,
"rewards/mrr_reward": 0.6197172403335571,
"rewards/rank_analyze_format_reward": 0.26364994794130325,
"rewards/rank_answer_foramt_reward": 0.791015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 1.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 480.421875,
"epoch": 0.488,
"grad_norm": 0.02566557377576828,
"kl": 0.0005975961685180664,
"learning_rate": 1.9999241708138296e-05,
"loss": -0.0056,
"reward": 5.077809810638428,
"reward_std": 1.307851292192936,
"rewards/mrr_reward": 0.35500991344451904,
"rewards/rank_analyze_format_reward": 0.10372397117316723,
"rewards/rank_answer_foramt_reward": 0.6328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9957729876041412,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9957729876041412,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 472.875,
"epoch": 0.496,
"grad_norm": 0.027827711775898933,
"kl": 0.000952601432800293,
"learning_rate": 1.9999210442038164e-05,
"loss": -0.0339,
"reward": 4.869051575660706,
"reward_std": 1.8942435383796692,
"rewards/mrr_reward": 0.3203125074505806,
"rewards/rank_analyze_format_reward": 0.16581160761415958,
"rewards/rank_answer_foramt_reward": 0.548828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 485.609375,
"epoch": 0.504,
"grad_norm": 0.024270422756671906,
"kl": 0.000729680061340332,
"learning_rate": 1.9999178544333228e-05,
"loss": 0.0064,
"reward": 5.877958178520203,
"reward_std": 1.8244962692260742,
"rewards/mrr_reward": 0.5174479112029076,
"rewards/rank_analyze_format_reward": 0.19235198944807053,
"rewards/rank_answer_foramt_reward": 0.736328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 515.46875,
"epoch": 0.512,
"grad_norm": 0.022133484482765198,
"kl": 0.0008175373077392578,
"learning_rate": 1.9999146015025503e-05,
"loss": 0.0092,
"reward": 5.555278539657593,
"reward_std": 1.9869469702243805,
"rewards/mrr_reward": 0.45848215371370316,
"rewards/rank_analyze_format_reward": 0.22101733088493347,
"rewards/rank_answer_foramt_reward": 0.666015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9913771450519562,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9601271450519562,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 511.625,
"epoch": 0.52,
"grad_norm": 0.024551959708333015,
"kl": 0.0007832050323486328,
"learning_rate": 1.999911285411704e-05,
"loss": -0.0049,
"reward": 5.41889089345932,
"reward_std": 1.9643912464380264,
"rewards/mrr_reward": 0.43281250074505806,
"rewards/rank_analyze_format_reward": 0.2138027586042881,
"rewards/rank_answer_foramt_reward": 0.62890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9927783608436584,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9771533608436584,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 528.5,
"epoch": 0.528,
"grad_norm": 0.02290545031428337,
"kl": 0.0008490085601806641,
"learning_rate": 1.9999079061609933e-05,
"loss": -0.021,
"reward": 4.910151720046997,
"reward_std": 1.064635694026947,
"rewards/mrr_reward": 0.2832217253744602,
"rewards/rank_analyze_format_reward": 0.2712905704975128,
"rewards/rank_answer_foramt_reward": 0.5859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 514.546875,
"epoch": 0.536,
"grad_norm": 0.024759415537118912,
"kl": 0.0009107589721679688,
"learning_rate": 1.999904463750632e-05,
"loss": 0.0076,
"reward": 4.854610323905945,
"reward_std": 1.8393707275390625,
"rewards/mrr_reward": 0.30915798619389534,
"rewards/rank_analyze_format_reward": 0.23711884673684835,
"rewards/rank_answer_foramt_reward": 0.576171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8203125,
"rewards/rank_verify_format_reward": 0.984375,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 499.859375,
"epoch": 0.544,
"grad_norm": 0.024759415537118912,
"kl": 0.0008776187896728516,
"learning_rate": 1.999904463750632e-05,
"loss": -0.0246,
"reward": 5.42217218875885,
"reward_std": 1.3200950622558594,
"rewards/mrr_reward": 0.42010788805782795,
"rewards/rank_analyze_format_reward": 0.19172357022762299,
"rewards/rank_answer_foramt_reward": 0.654296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998641312122345,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.983016312122345,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 496.140625,
"epoch": 0.552,
"grad_norm": 0.02841918356716633,
"kl": 0.0010838508605957031,
"learning_rate": 1.999900958180838e-05,
"loss": -0.0281,
"reward": 5.81439483165741,
"reward_std": 1.740799367427826,
"rewards/mrr_reward": 0.5312500074505806,
"rewards/rank_analyze_format_reward": 0.1757229631766677,
"rewards/rank_answer_foramt_reward": 0.677734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.96875,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 466.078125,
"epoch": 0.56,
"grad_norm": 0.02891196869313717,
"kl": 0.001157999038696289,
"learning_rate": 1.9998973894518318e-05,
"loss": -0.0123,
"reward": 5.705892205238342,
"reward_std": 2.0529025495052338,
"rewards/mrr_reward": 0.4973958432674408,
"rewards/rank_analyze_format_reward": 0.15696396678686142,
"rewards/rank_answer_foramt_reward": 0.638671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9993990361690521,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9993990361690521,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 492.296875,
"epoch": 0.568,
"grad_norm": 0.024859309196472168,
"kl": 0.0010205507278442383,
"learning_rate": 1.999893757563839e-05,
"loss": 0.0114,
"reward": 5.464065313339233,
"reward_std": 1.7677285969257355,
"rewards/mrr_reward": 0.446893610060215,
"rewards/rank_analyze_format_reward": 0.10556165501475334,
"rewards/rank_answer_foramt_reward": 0.68359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 531.109375,
"epoch": 0.576,
"grad_norm": 0.026754125952720642,
"kl": 0.001056671142578125,
"learning_rate": 1.9998900625170897e-05,
"loss": -0.0067,
"reward": 6.407280087471008,
"reward_std": 1.8228637278079987,
"rewards/mrr_reward": 0.5859375298023224,
"rewards/rank_analyze_format_reward": 0.30571743845939636,
"rewards/rank_answer_foramt_reward": 0.828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 1.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 508.484375,
"epoch": 0.584,
"grad_norm": 0.028836144134402275,
"kl": 0.00142669677734375,
"learning_rate": 1.9998863043118163e-05,
"loss": -0.0076,
"reward": 4.505983591079712,
"reward_std": 1.231943815946579,
"rewards/mrr_reward": 0.2062872126698494,
"rewards/rank_analyze_format_reward": 0.20817857421934605,
"rewards/rank_answer_foramt_reward": 0.56640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 511.453125,
"epoch": 0.592,
"grad_norm": 0.025052759796380997,
"kl": 0.0013051033020019531,
"learning_rate": 1.999882482948257e-05,
"loss": -0.0097,
"reward": 5.300284147262573,
"reward_std": 1.6650860607624054,
"rewards/mrr_reward": 0.38593750447034836,
"rewards/rank_analyze_format_reward": 0.15106541197746992,
"rewards/rank_answer_foramt_reward": 0.66015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 1.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 504.625,
"epoch": 0.6,
"grad_norm": 0.026283830404281616,
"kl": 0.0021805763244628906,
"learning_rate": 1.999878598426653e-05,
"loss": -0.0317,
"reward": 5.158125400543213,
"reward_std": 1.4547997415065765,
"rewards/mrr_reward": 0.3474578373134136,
"rewards/rank_analyze_format_reward": 0.2175126150250435,
"rewards/rank_answer_foramt_reward": 0.58203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 516.265625,
"epoch": 0.608,
"grad_norm": 0.027127476409077644,
"kl": 0.001390218734741211,
"learning_rate": 1.9998746507472493e-05,
"loss": -0.0426,
"reward": 5.807446002960205,
"reward_std": 1.929233893752098,
"rewards/mrr_reward": 0.4895833358168602,
"rewards/rank_analyze_format_reward": 0.3203737363219261,
"rewards/rank_answer_foramt_reward": 0.689453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9821428656578064,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.9665178656578064,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 515.015625,
"epoch": 0.616,
"grad_norm": 0.026926733553409576,
"kl": 0.001764059066772461,
"learning_rate": 1.999870639910296e-05,
"loss": -0.0223,
"reward": 5.370245575904846,
"reward_std": 1.9943826496601105,
"rewards/mrr_reward": 0.3968749977648258,
"rewards/rank_analyze_format_reward": 0.2607038579881191,
"rewards/rank_answer_foramt_reward": 0.607421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9924661070108414,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9924661070108414,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 481.015625,
"epoch": 0.624,
"grad_norm": 0.027938006445765495,
"kl": 0.0017654895782470703,
"learning_rate": 1.9998665659160453e-05,
"loss": -0.0188,
"reward": 5.412413477897644,
"reward_std": 1.896736979484558,
"rewards/mrr_reward": 0.41354167461395264,
"rewards/rank_analyze_format_reward": 0.24247420020401478,
"rewards/rank_answer_foramt_reward": 0.63671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9942144006490707,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9785894006490707,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 499.15625,
"epoch": 0.632,
"grad_norm": 0.024667983874678612,
"kl": 0.0013856887817382812,
"learning_rate": 1.999862428764756e-05,
"loss": -0.0076,
"reward": 6.024145722389221,
"reward_std": 1.524814635515213,
"rewards/mrr_reward": 0.5302269533276558,
"rewards/rank_analyze_format_reward": 0.23562652617692947,
"rewards/rank_answer_foramt_reward": 0.794921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9680059552192688,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 501.484375,
"epoch": 0.64,
"grad_norm": 0.028410576283931732,
"kl": 0.0016107559204101562,
"learning_rate": 1.9998582284566878e-05,
"loss": 0.0072,
"reward": 5.220240831375122,
"reward_std": 1.5586610436439514,
"rewards/mrr_reward": 0.35975322872400284,
"rewards/rank_analyze_format_reward": 0.2044668523594737,
"rewards/rank_answer_foramt_reward": 0.640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9954117387533188,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9954117387533188,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 541.9375,
"epoch": 0.648,
"grad_norm": 0.024989139288663864,
"kl": 0.002213001251220703,
"learning_rate": 1.999853964992107e-05,
"loss": -0.0076,
"reward": 5.271288990974426,
"reward_std": 1.666042000055313,
"rewards/mrr_reward": 0.3229972794651985,
"rewards/rank_analyze_format_reward": 0.38286497443914413,
"rewards/rank_answer_foramt_reward": 0.65625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 446.703125,
"epoch": 0.656,
"grad_norm": 0.03217592090368271,
"kl": 0.0023698806762695312,
"learning_rate": 1.9998496383712828e-05,
"loss": -0.0122,
"reward": 5.724093914031982,
"reward_std": 1.503628522157669,
"rewards/mrr_reward": 0.4970238097012043,
"rewards/rank_analyze_format_reward": 0.05476433038711548,
"rewards/rank_answer_foramt_reward": 0.75,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968671798706055,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9968671798706055,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 511.421875,
"epoch": 0.664,
"grad_norm": 0.026632074266672134,
"kl": 0.001974821090698242,
"learning_rate": 1.999845248594489e-05,
"loss": -0.0378,
"reward": 5.284371018409729,
"reward_std": 1.7509951293468475,
"rewards/mrr_reward": 0.37621527537703514,
"rewards/rank_analyze_format_reward": 0.1659046746790409,
"rewards/rank_answer_foramt_reward": 0.693359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9952791333198547,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9796541333198547,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 497.40625,
"epoch": 0.672,
"grad_norm": 0.028008421882987022,
"kl": 0.002154827117919922,
"learning_rate": 1.9998407956620017e-05,
"loss": -0.0174,
"reward": 5.500829696655273,
"reward_std": 1.7818693816661835,
"rewards/mrr_reward": 0.46015624701976776,
"rewards/rank_analyze_format_reward": 0.18122385442256927,
"rewards/rank_answer_foramt_reward": 0.69140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9836309552192688,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.9523809552192688,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 504.109375,
"epoch": 0.68,
"grad_norm": 0.02929595857858658,
"kl": 0.0015263557434082031,
"learning_rate": 1.9998362795741027e-05,
"loss": -0.0149,
"reward": 4.848661541938782,
"reward_std": 1.5195987075567245,
"rewards/mrr_reward": 0.27783359214663506,
"rewards/rank_analyze_format_reward": 0.19695308804512024,
"rewards/rank_answer_foramt_reward": 0.658203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9801479876041412,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9645229876041412,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 520.03125,
"epoch": 0.688,
"grad_norm": 0.028656957671046257,
"kl": 0.0018339157104492188,
"learning_rate": 1.9998317003310775e-05,
"loss": 0.0018,
"reward": 6.04072630405426,
"reward_std": 1.6347778737545013,
"rewards/mrr_reward": 0.5263826847076416,
"rewards/rank_analyze_format_reward": 0.2711330959573388,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.96875,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 514.953125,
"epoch": 0.696,
"grad_norm": 0.0294534619897604,
"kl": 0.0030837059020996094,
"learning_rate": 1.9998270579332154e-05,
"loss": -0.0213,
"reward": 5.602773904800415,
"reward_std": 1.9321411848068237,
"rewards/mrr_reward": 0.45494791865348816,
"rewards/rank_analyze_format_reward": 0.22951603773981333,
"rewards/rank_answer_foramt_reward": 0.65234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9818111509084702,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 517.953125,
"epoch": 0.704,
"grad_norm": 0.02795729972422123,
"kl": 0.0021820068359375,
"learning_rate": 1.9998223523808092e-05,
"loss": -0.005,
"reward": 5.259730100631714,
"reward_std": 1.7037486732006073,
"rewards/mrr_reward": 0.384002972394228,
"rewards/rank_analyze_format_reward": 0.18732355255633593,
"rewards/rank_answer_foramt_reward": 0.642578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9976895451545715,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.9976895451545715,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 572.234375,
"epoch": 0.712,
"grad_norm": 0.025803212076425552,
"kl": 0.0023365020751953125,
"learning_rate": 1.9998175836741564e-05,
"loss": -0.0233,
"reward": 5.643940687179565,
"reward_std": 2.12572318315506,
"rewards/mrr_reward": 0.41141493432223797,
"rewards/rank_analyze_format_reward": 0.43535757809877396,
"rewards/rank_answer_foramt_reward": 0.65234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 518.875,
"epoch": 0.72,
"grad_norm": 0.027814343571662903,
"kl": 0.0020236968994140625,
"learning_rate": 1.999812751813558e-05,
"loss": -0.051,
"reward": 5.96042013168335,
"reward_std": 1.2770089283585548,
"rewards/mrr_reward": 0.4757130518555641,
"rewards/rank_analyze_format_reward": 0.3149571679532528,
"rewards/rank_answer_foramt_reward": 0.79296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 495.8125,
"epoch": 0.728,
"grad_norm": 0.031177863478660583,
"kl": 0.002357959747314453,
"learning_rate": 1.9998078567993197e-05,
"loss": -0.0346,
"reward": 5.881357431411743,
"reward_std": 1.7492572218179703,
"rewards/mrr_reward": 0.5347842201590538,
"rewards/rank_analyze_format_reward": 0.09312985371798277,
"rewards/rank_answer_foramt_reward": 0.7265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9807952791452408,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 511.15625,
"epoch": 0.736,
"grad_norm": 0.028669161722064018,
"kl": 0.002231597900390625,
"learning_rate": 1.9998028986317504e-05,
"loss": -0.0145,
"reward": 5.656317114830017,
"reward_std": 1.6166883707046509,
"rewards/mrr_reward": 0.44843751192092896,
"rewards/rank_analyze_format_reward": 0.20554364286363125,
"rewards/rank_answer_foramt_reward": 0.76953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9828085899353027,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 522.484375,
"epoch": 0.744,
"grad_norm": 0.027386236935853958,
"kl": 0.0021200180053710938,
"learning_rate": 1.999797877311163e-05,
"loss": -0.0246,
"reward": 5.928924918174744,
"reward_std": 1.48914834856987,
"rewards/mrr_reward": 0.4644097238779068,
"rewards/rank_analyze_format_reward": 0.3623017445206642,
"rewards/rank_answer_foramt_reward": 0.802734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 497.1875,
"epoch": 0.752,
"grad_norm": 0.0282078068703413,
"kl": 0.003062725067138672,
"learning_rate": 1.9997927928378753e-05,
"loss": 0.0186,
"reward": 6.396650433540344,
"reward_std": 1.9676957428455353,
"rewards/mrr_reward": 0.6083333343267441,
"rewards/rank_analyze_format_reward": 0.25528283044695854,
"rewards/rank_answer_foramt_reward": 0.76171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965953528881073,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9965953528881073,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 517.390625,
"epoch": 0.76,
"grad_norm": 0.03063831850886345,
"kl": 0.002585887908935547,
"learning_rate": 1.999787645212208e-05,
"loss": -0.0102,
"reward": 6.281728744506836,
"reward_std": 1.7576136887073517,
"rewards/mrr_reward": 0.5565538108348846,
"rewards/rank_analyze_format_reward": 0.30067696794867516,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 522.171875,
"epoch": 0.768,
"grad_norm": 0.02829769253730774,
"kl": 0.0035347938537597656,
"learning_rate": 1.999782434434486e-05,
"loss": 0.0108,
"reward": 5.318088173866272,
"reward_std": 1.6170280575752258,
"rewards/mrr_reward": 0.3567398265004158,
"rewards/rank_analyze_format_reward": 0.24012142419815063,
"rewards/rank_answer_foramt_reward": 0.708984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 537.703125,
"epoch": 0.776,
"grad_norm": 0.026142382994294167,
"kl": 0.002566814422607422,
"learning_rate": 1.999777160505039e-05,
"loss": -0.0223,
"reward": 5.818326234817505,
"reward_std": 1.489253669977188,
"rewards/mrr_reward": 0.45615699887275696,
"rewards/rank_analyze_format_reward": 0.23597807995975018,
"rewards/rank_answer_foramt_reward": 0.822265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9872584789991379,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9872584789991379,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 513.171875,
"epoch": 0.784,
"grad_norm": 0.03225073963403702,
"kl": 0.0032796859741210938,
"learning_rate": 1.9997718234242e-05,
"loss": -0.0376,
"reward": 5.64834451675415,
"reward_std": 1.8088513016700745,
"rewards/mrr_reward": 0.4361979216337204,
"rewards/rank_analyze_format_reward": 0.2531745582818985,
"rewards/rank_answer_foramt_reward": 0.6953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9931579083204269,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9931579083204269,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 513.203125,
"epoch": 0.792,
"grad_norm": 0.028324192389845848,
"kl": 0.002949237823486328,
"learning_rate": 1.999766423192306e-05,
"loss": -0.0073,
"reward": 5.801540851593018,
"reward_std": 1.364225059747696,
"rewards/mrr_reward": 0.45811013877391815,
"rewards/rank_analyze_format_reward": 0.2562095895409584,
"rewards/rank_answer_foramt_reward": 0.736328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 522.625,
"epoch": 0.8,
"grad_norm": 0.03095146454870701,
"kl": 0.0033349990844726562,
"learning_rate": 1.9997609598096982e-05,
"loss": -0.0571,
"reward": 5.498512506484985,
"reward_std": 1.5701228380203247,
"rewards/mrr_reward": 0.38402778655290604,
"rewards/rank_analyze_format_reward": 0.31815899908542633,
"rewards/rank_answer_foramt_reward": 0.720703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9891133904457092,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9734883904457092,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 505.75,
"epoch": 0.808,
"grad_norm": 0.030804021283984184,
"kl": 0.003856658935546875,
"learning_rate": 1.9997554332767214e-05,
"loss": -0.0226,
"reward": 6.090959072113037,
"reward_std": 1.7257481813430786,
"rewards/mrr_reward": 0.5503038242459297,
"rewards/rank_analyze_format_reward": 0.20266878511756659,
"rewards/rank_answer_foramt_reward": 0.748046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9929515719413757,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9929515719413757,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 486.828125,
"epoch": 0.816,
"grad_norm": 0.03461439907550812,
"kl": 0.0033884048461914062,
"learning_rate": 1.9997498435937254e-05,
"loss": -0.0362,
"reward": 5.366485238075256,
"reward_std": 1.3259476721286774,
"rewards/mrr_reward": 0.3723524361848831,
"rewards/rank_analyze_format_reward": 0.1968497335910797,
"rewards/rank_answer_foramt_reward": 0.732421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9973393976688385,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 533.84375,
"epoch": 0.824,
"grad_norm": 0.028838761150836945,
"kl": 0.0028543472290039062,
"learning_rate": 1.9997441907610624e-05,
"loss": -0.0262,
"reward": 5.746440768241882,
"reward_std": 1.2651481330394745,
"rewards/mrr_reward": 0.41945064067840576,
"rewards/rank_analyze_format_reward": 0.30614617466926575,
"rewards/rank_answer_foramt_reward": 0.828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 534.390625,
"epoch": 0.832,
"grad_norm": 0.03118244931101799,
"kl": 0.0032520294189453125,
"learning_rate": 1.9997384747790903e-05,
"loss": -0.0115,
"reward": 5.606603145599365,
"reward_std": 1.3689128905534744,
"rewards/mrr_reward": 0.4183097779750824,
"rewards/rank_analyze_format_reward": 0.1789928413927555,
"rewards/rank_answer_foramt_reward": 0.841796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 541.140625,
"epoch": 0.84,
"grad_norm": 0.0289431344717741,
"kl": 0.004002094268798828,
"learning_rate": 1.9997326956481693e-05,
"loss": 0.0299,
"reward": 5.412080824375153,
"reward_std": 1.5776411294937134,
"rewards/mrr_reward": 0.40980901941657066,
"rewards/rank_analyze_format_reward": 0.2225375398993492,
"rewards/rank_answer_foramt_reward": 0.6484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9704661071300507,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9704661071300507,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 546.625,
"epoch": 0.848,
"grad_norm": 0.02917851321399212,
"kl": 0.003693103790283203,
"learning_rate": 1.999726853368665e-05,
"loss": -0.0132,
"reward": 6.237439870834351,
"reward_std": 1.6906473636627197,
"rewards/mrr_reward": 0.5492559522390366,
"rewards/rank_analyze_format_reward": 0.32810740265995264,
"rewards/rank_answer_foramt_reward": 0.744140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9958027005195618,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9958027005195618,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 552.84375,
"epoch": 0.856,
"grad_norm": 0.02737216092646122,
"kl": 0.0034399032592773438,
"learning_rate": 1.9997209479409464e-05,
"loss": -0.0087,
"reward": 5.876426458358765,
"reward_std": 1.4228278696537018,
"rewards/mrr_reward": 0.4435453861951828,
"rewards/rank_analyze_format_reward": 0.38482026010751724,
"rewards/rank_answer_foramt_reward": 0.79296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9895716160535812,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9895716160535812,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 508.390625,
"epoch": 0.864,
"grad_norm": 0.028911437839269638,
"kl": 0.0033540725708007812,
"learning_rate": 1.9997149793653862e-05,
"loss": -0.0094,
"reward": 6.699026107788086,
"reward_std": 1.3455817177891731,
"rewards/mrr_reward": 0.6710069477558136,
"rewards/rank_analyze_format_reward": 0.18851793929934502,
"rewards/rank_answer_foramt_reward": 0.8359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 486.1875,
"epoch": 0.872,
"grad_norm": 0.03616398200392723,
"kl": 0.004034519195556641,
"learning_rate": 1.9997089476423617e-05,
"loss": 0.0287,
"reward": 6.017909646034241,
"reward_std": 1.8136086165904999,
"rewards/mrr_reward": 0.5158420130610466,
"rewards/rank_analyze_format_reward": 0.26905644312500954,
"rewards/rank_answer_foramt_reward": 0.74609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9853207767009735,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9853207767009735,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 513.421875,
"epoch": 0.88,
"grad_norm": 0.029927760362625122,
"kl": 0.003938198089599609,
"learning_rate": 1.999702852772254e-05,
"loss": 0.0003,
"reward": 5.642710447311401,
"reward_std": 1.466000735759735,
"rewards/mrr_reward": 0.3956349194049835,
"rewards/rank_analyze_format_reward": 0.3304584436118603,
"rewards/rank_answer_foramt_reward": 0.814453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.981067106127739,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.981067106127739,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 527.78125,
"epoch": 0.888,
"grad_norm": 0.03488059714436531,
"kl": 0.0036568641662597656,
"learning_rate": 1.9996966947554476e-05,
"loss": -0.0217,
"reward": 6.343585729598999,
"reward_std": 1.7120259702205658,
"rewards/mrr_reward": 0.5808779820799828,
"rewards/rank_analyze_format_reward": 0.3235646188259125,
"rewards/rank_answer_foramt_reward": 0.75,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.996692106127739,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.981067106127739,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 508.9375,
"epoch": 0.896,
"grad_norm": 0.0328449085354805,
"kl": 0.0038404464721679688,
"learning_rate": 1.9996904735923325e-05,
"loss": -0.0289,
"reward": 6.122893452644348,
"reward_std": 1.4696560502052307,
"rewards/mrr_reward": 0.5161458402872086,
"rewards/rank_analyze_format_reward": 0.31539197266101837,
"rewards/rank_answer_foramt_reward": 0.806640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9915762841701508,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9915762841701508,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 541.578125,
"epoch": 0.904,
"grad_norm": 0.031444139778614044,
"kl": 0.0041828155517578125,
"learning_rate": 1.9996841892833e-05,
"loss": -0.0134,
"reward": 6.206910610198975,
"reward_std": 1.6045927107334137,
"rewards/mrr_reward": 0.5138206705451012,
"rewards/rank_analyze_format_reward": 0.36523886024951935,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9947571158409119,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9947571158409119,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 511.28125,
"epoch": 0.912,
"grad_norm": 0.034729719161987305,
"kl": 0.004141807556152344,
"learning_rate": 1.9996778418287486e-05,
"loss": 0.0052,
"reward": 5.282190442085266,
"reward_std": 1.5597249567508698,
"rewards/mrr_reward": 0.35017360746860504,
"rewards/rank_analyze_format_reward": 0.25228141248226166,
"rewards/rank_answer_foramt_reward": 0.685546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 531.9375,
"epoch": 0.92,
"grad_norm": 0.03240945562720299,
"kl": 0.003923892974853516,
"learning_rate": 1.9996714312290784e-05,
"loss": -0.0297,
"reward": 5.8050724267959595,
"reward_std": 1.4556776583194733,
"rewards/mrr_reward": 0.4047433137893677,
"rewards/rank_analyze_format_reward": 0.3966461531817913,
"rewards/rank_answer_foramt_reward": 0.830078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992187470197678,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9992187470197678,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 545.4375,
"epoch": 0.928,
"grad_norm": 0.033669959753751755,
"kl": 0.0044651031494140625,
"learning_rate": 1.9996649574846948e-05,
"loss": -0.0214,
"reward": 6.237725496292114,
"reward_std": 1.6311175972223282,
"rewards/mrr_reward": 0.5039434656500816,
"rewards/rank_analyze_format_reward": 0.40749866887927055,
"rewards/rank_answer_foramt_reward": 0.830078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 541.984375,
"epoch": 0.936,
"grad_norm": 0.03145231306552887,
"kl": 0.004774570465087891,
"learning_rate": 1.9996584205960063e-05,
"loss": -0.0014,
"reward": 5.562940955162048,
"reward_std": 1.5884797871112823,
"rewards/mrr_reward": 0.40140748769044876,
"rewards/rank_analyze_format_reward": 0.33169008791446686,
"rewards/rank_answer_foramt_reward": 0.689453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9954276382923126,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9798026382923126,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 527.875,
"epoch": 0.944,
"grad_norm": 0.035210635513067245,
"kl": 0.0044097900390625,
"learning_rate": 1.999651820563426e-05,
"loss": -0.0421,
"reward": 5.673676252365112,
"reward_std": 1.3604719787836075,
"rewards/mrr_reward": 0.3857142850756645,
"rewards/rank_analyze_format_reward": 0.39724233001470566,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9956946671009064,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9800696671009064,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 534.984375,
"epoch": 0.952,
"grad_norm": 0.03147454559803009,
"kl": 0.0076541900634765625,
"learning_rate": 1.999645157387371e-05,
"loss": -0.0133,
"reward": 6.33061408996582,
"reward_std": 1.298683062195778,
"rewards/mrr_reward": 0.553689256310463,
"rewards/rank_analyze_format_reward": 0.3629737161099911,
"rewards/rank_answer_foramt_reward": 0.787109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 551.703125,
"epoch": 0.96,
"grad_norm": 0.031304825097322464,
"kl": 0.004342079162597656,
"learning_rate": 1.9996384310682615e-05,
"loss": -0.0365,
"reward": 5.393260598182678,
"reward_std": 1.5107265412807465,
"rewards/mrr_reward": 0.31743552163243294,
"rewards/rank_analyze_format_reward": 0.3929348886013031,
"rewards/rank_answer_foramt_reward": 0.763671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 501.890625,
"epoch": 0.968,
"grad_norm": 0.037323277443647385,
"kl": 0.004292488098144531,
"learning_rate": 1.999631641606523e-05,
"loss": -0.0058,
"reward": 6.189559578895569,
"reward_std": 1.2323561608791351,
"rewards/mrr_reward": 0.5643229335546494,
"rewards/rank_analyze_format_reward": 0.11282643768936396,
"rewards/rank_answer_foramt_reward": 0.87109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9819862246513367,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9976112246513367,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 545.25,
"epoch": 0.976,
"grad_norm": 0.034240033477544785,
"kl": 0.005130767822265625,
"learning_rate": 1.9996247890025845e-05,
"loss": -0.0263,
"reward": 5.799539566040039,
"reward_std": 1.6965691149234772,
"rewards/mrr_reward": 0.4192212335765362,
"rewards/rank_analyze_format_reward": 0.41699668765068054,
"rewards/rank_answer_foramt_reward": 0.748046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 522.9375,
"epoch": 0.984,
"grad_norm": 0.03087456338107586,
"kl": 0.004132270812988281,
"learning_rate": 1.9996178732568784e-05,
"loss": -0.0128,
"reward": 5.433954238891602,
"reward_std": 1.3873755782842636,
"rewards/mrr_reward": 0.35366444662213326,
"rewards/rank_analyze_format_reward": 0.3041442818939686,
"rewards/rank_answer_foramt_reward": 0.802734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 548.90625,
"epoch": 0.992,
"grad_norm": 0.03247498720884323,
"kl": 0.004190921783447266,
"learning_rate": 1.9996108943698412e-05,
"loss": -0.02,
"reward": 6.039711356163025,
"reward_std": 1.745398223400116,
"rewards/mrr_reward": 0.49064359068870544,
"rewards/rank_analyze_format_reward": 0.3508656769990921,
"rewards/rank_answer_foramt_reward": 0.734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959480613470078,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9959480613470078,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 502.75,
"epoch": 1.0,
"grad_norm": 0.031171714887022972,
"kl": 0.00469207763671875,
"learning_rate": 1.9996038523419148e-05,
"loss": -0.0226,
"reward": 5.955172896385193,
"reward_std": 1.2524618208408356,
"rewards/mrr_reward": 0.460627481341362,
"rewards/rank_analyze_format_reward": 0.33911067247390747,
"rewards/rank_answer_foramt_reward": 0.806640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 549.53125,
"epoch": 1.008,
"grad_norm": 0.03288634493947029,
"kl": 0.00435638427734375,
"learning_rate": 1.9995967471735433e-05,
"loss": -0.0184,
"reward": 6.16729462146759,
"reward_std": 1.4908590912818909,
"rewards/mrr_reward": 0.5096540227532387,
"rewards/rank_analyze_format_reward": 0.39567676931619644,
"rewards/rank_answer_foramt_reward": 0.775390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 533.65625,
"epoch": 1.016,
"grad_norm": 0.03470674157142639,
"kl": 0.005002021789550781,
"learning_rate": 1.9995895788651753e-05,
"loss": -0.0254,
"reward": 6.5735520124435425,
"reward_std": 1.469813510775566,
"rewards/mrr_reward": 0.621657982468605,
"rewards/rank_analyze_format_reward": 0.31275077164173126,
"rewards/rank_answer_foramt_reward": 0.818359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 546.34375,
"epoch": 1.024,
"grad_norm": 0.03493339568376541,
"kl": 0.004825592041015625,
"learning_rate": 1.9995823474172644e-05,
"loss": -0.0097,
"reward": 5.744642496109009,
"reward_std": 1.9137973487377167,
"rewards/mrr_reward": 0.4302021265029907,
"rewards/rank_analyze_format_reward": 0.3747362494468689,
"rewards/rank_answer_foramt_reward": 0.6953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9925176054239273,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9925176054239273,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 522.65625,
"epoch": 1.032,
"grad_norm": 0.03346191346645355,
"kl": 0.0044116973876953125,
"learning_rate": 1.9995750528302668e-05,
"loss": -0.0069,
"reward": 6.34830904006958,
"reward_std": 1.5635737180709839,
"rewards/mrr_reward": 0.5352182611823082,
"rewards/rank_analyze_format_reward": 0.3498992621898651,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 518.90625,
"epoch": 1.04,
"grad_norm": 0.027642706409096718,
"kl": 0.0034575462341308594,
"learning_rate": 1.999567695104643e-05,
"loss": -0.0083,
"reward": 6.863049745559692,
"reward_std": 0.995637645944953,
"rewards/mrr_reward": 0.6565104126930237,
"rewards/rank_analyze_format_reward": 0.3502893391996622,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 546.265625,
"epoch": 1.048,
"grad_norm": 0.03320496901869774,
"kl": 0.00482177734375,
"learning_rate": 1.9995602742408584e-05,
"loss": -0.0297,
"reward": 5.561194658279419,
"reward_std": 1.0441379398107529,
"rewards/mrr_reward": 0.3508804552257061,
"rewards/rank_analyze_format_reward": 0.3411516472697258,
"rewards/rank_answer_foramt_reward": 0.857421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 538.40625,
"epoch": 1.056,
"grad_norm": 0.03059810772538185,
"kl": 0.004889488220214844,
"learning_rate": 1.9995527902393814e-05,
"loss": -0.031,
"reward": 6.096472501754761,
"reward_std": 1.3641002774238586,
"rewards/mrr_reward": 0.4807477816939354,
"rewards/rank_analyze_format_reward": 0.329423014074564,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 530.125,
"epoch": 1.064,
"grad_norm": 0.03329962119460106,
"kl": 0.004602909088134766,
"learning_rate": 1.9995452431006844e-05,
"loss": -0.0196,
"reward": 5.331088542938232,
"reward_std": 0.830422654747963,
"rewards/mrr_reward": 0.3011222556233406,
"rewards/rank_analyze_format_reward": 0.27261858060956,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 546.765625,
"epoch": 1.072,
"grad_norm": 0.03147532418370247,
"kl": 0.004273891448974609,
"learning_rate": 1.999537632825245e-05,
"loss": -0.0228,
"reward": 5.826651930809021,
"reward_std": 1.0173790007829666,
"rewards/mrr_reward": 0.4183593764901161,
"rewards/rank_analyze_format_reward": 0.37596380710601807,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.997023805975914,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.965773805975914,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 540.265625,
"epoch": 1.08,
"grad_norm": 0.03314289450645447,
"kl": 0.0045490264892578125,
"learning_rate": 1.9995299594135434e-05,
"loss": -0.0181,
"reward": 6.364633798599243,
"reward_std": 1.2888767421245575,
"rewards/mrr_reward": 0.5286644101142883,
"rewards/rank_analyze_format_reward": 0.40576110780239105,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 537.515625,
"epoch": 1.088,
"grad_norm": 0.03256648778915405,
"kl": 0.005957603454589844,
"learning_rate": 1.999522222866064e-05,
"loss": -0.0253,
"reward": 6.410487055778503,
"reward_std": 1.080582246184349,
"rewards/mrr_reward": 0.5400917902588844,
"rewards/rank_analyze_format_reward": 0.3477761074900627,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 565.78125,
"epoch": 1.096,
"grad_norm": 0.03460180386900902,
"kl": 0.012660980224609375,
"learning_rate": 1.999514423183296e-05,
"loss": -0.0144,
"reward": 6.023637771606445,
"reward_std": 1.5978916585445404,
"rewards/mrr_reward": 0.44875992834568024,
"rewards/rank_analyze_format_reward": 0.44508665800094604,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 515.0,
"epoch": 1.104,
"grad_norm": 0.030293360352516174,
"kl": 0.005436897277832031,
"learning_rate": 1.9995065603657317e-05,
"loss": -0.0128,
"reward": 6.0460041761398315,
"reward_std": 1.0893033295869827,
"rewards/mrr_reward": 0.48198164254426956,
"rewards/rank_analyze_format_reward": 0.2604257594794035,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 560.9375,
"epoch": 1.112,
"grad_norm": 0.034913912415504456,
"kl": 0.0055980682373046875,
"learning_rate": 1.999498634413868e-05,
"loss": -0.009,
"reward": 6.033616900444031,
"reward_std": 1.7777451276779175,
"rewards/mrr_reward": 0.4813368245959282,
"rewards/rank_analyze_format_reward": 0.3849602974951267,
"rewards/rank_answer_foramt_reward": 0.74609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 519.21875,
"epoch": 1.12,
"grad_norm": 0.03727564588189125,
"kl": 0.004870414733886719,
"learning_rate": 1.9994906453282055e-05,
"loss": -0.0243,
"reward": 6.689180135726929,
"reward_std": 1.181299865245819,
"rewards/mrr_reward": 0.6408420205116272,
"rewards/rank_analyze_format_reward": 0.2992168888449669,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 556.578125,
"epoch": 1.1280000000000001,
"grad_norm": 0.035206038504838943,
"kl": 0.006304740905761719,
"learning_rate": 1.9994825931092486e-05,
"loss": -0.0367,
"reward": 6.255677342414856,
"reward_std": 1.9543142914772034,
"rewards/mrr_reward": 0.5096974149346352,
"rewards/rank_analyze_format_reward": 0.4908938556909561,
"rewards/rank_answer_foramt_reward": 0.75390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9938564151525497,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9938564151525497,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 531.640625,
"epoch": 1.1360000000000001,
"grad_norm": 0.031349923461675644,
"kl": 0.005738258361816406,
"learning_rate": 1.9994744777575064e-05,
"loss": 0.0027,
"reward": 6.044549226760864,
"reward_std": 1.1382797956466675,
"rewards/mrr_reward": 0.48732637614011765,
"rewards/rank_analyze_format_reward": 0.32322094589471817,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9817143976688385,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9817143976688385,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 566.484375,
"epoch": 1.144,
"grad_norm": 0.03165869414806366,
"kl": 0.00574493408203125,
"learning_rate": 1.999466299273491e-05,
"loss": 0.0042,
"reward": 6.553520321846008,
"reward_std": 1.627190262079239,
"rewards/mrr_reward": 0.5967448204755783,
"rewards/rank_analyze_format_reward": 0.44682280719280243,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9907185882329941,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9438435882329941,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 531.203125,
"epoch": 1.152,
"grad_norm": 0.03519825637340546,
"kl": 0.0061855316162109375,
"learning_rate": 1.9994580576577193e-05,
"loss": -0.0129,
"reward": 5.729455947875977,
"reward_std": 1.3913188576698303,
"rewards/mrr_reward": 0.41431671380996704,
"rewards/rank_analyze_format_reward": 0.32957829907536507,
"rewards/rank_answer_foramt_reward": 0.76171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 524.3125,
"epoch": 1.16,
"grad_norm": 0.03151266649365425,
"kl": 0.006320953369140625,
"learning_rate": 1.9994497529107118e-05,
"loss": -0.0148,
"reward": 6.072369456291199,
"reward_std": 1.2223908305168152,
"rewards/mrr_reward": 0.4943700544536114,
"rewards/rank_analyze_format_reward": 0.23118487000465393,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 536.890625,
"epoch": 1.168,
"grad_norm": 0.03530154004693031,
"kl": 0.005878448486328125,
"learning_rate": 1.999441385032993e-05,
"loss": -0.0308,
"reward": 6.625038385391235,
"reward_std": 1.2249456346035004,
"rewards/mrr_reward": 0.6044022962450981,
"rewards/rank_analyze_format_reward": 0.34935425966978073,
"rewards/rank_answer_foramt_reward": 0.880859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 520.6875,
"epoch": 1.176,
"grad_norm": 0.035086773335933685,
"kl": 0.007404327392578125,
"learning_rate": 1.9994329540250918e-05,
"loss": -0.0321,
"reward": 6.542271018028259,
"reward_std": 1.3827645033597946,
"rewards/mrr_reward": 0.5946986712515354,
"rewards/rank_analyze_format_reward": 0.292152963578701,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 533.8125,
"epoch": 1.184,
"grad_norm": 0.032674577087163925,
"kl": 0.00643157958984375,
"learning_rate": 1.99942445988754e-05,
"loss": -0.033,
"reward": 6.091751337051392,
"reward_std": 1.2941071689128876,
"rewards/mrr_reward": 0.479445680975914,
"rewards/rank_analyze_format_reward": 0.35560934245586395,
"rewards/rank_answer_foramt_reward": 0.818359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 551.0625,
"epoch": 1.192,
"grad_norm": 0.03282872214913368,
"kl": 0.005850791931152344,
"learning_rate": 1.999415902620875e-05,
"loss": -0.025,
"reward": 6.667526364326477,
"reward_std": 1.0924562439322472,
"rewards/mrr_reward": 0.604253463447094,
"rewards/rank_analyze_format_reward": 0.3833247348666191,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 564.640625,
"epoch": 1.2,
"grad_norm": 0.032991521060466766,
"kl": 0.0060176849365234375,
"learning_rate": 1.999407282225637e-05,
"loss": 0.0052,
"reward": 5.798678278923035,
"reward_std": 1.1788080930709839,
"rewards/mrr_reward": 0.402951393276453,
"rewards/rank_analyze_format_reward": 0.38478153944015503,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9938189834356308,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9781939834356308,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 575.6875,
"epoch": 1.208,
"grad_norm": 0.02944212593138218,
"kl": 0.005663871765136719,
"learning_rate": 1.9993985987023703e-05,
"loss": -0.0115,
"reward": 6.4621899127960205,
"reward_std": 1.2538374364376068,
"rewards/mrr_reward": 0.5166728720068932,
"rewards/rank_analyze_format_reward": 0.4970608651638031,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 583.15625,
"epoch": 1.216,
"grad_norm": 0.03144199773669243,
"kl": 0.005504608154296875,
"learning_rate": 1.9993898520516233e-05,
"loss": 0.0178,
"reward": 7.210927963256836,
"reward_std": 1.4847297072410583,
"rewards/mrr_reward": 0.7067708373069763,
"rewards/rank_analyze_format_reward": 0.5710361748933792,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 569.125,
"epoch": 1.224,
"grad_norm": 0.03476106375455856,
"kl": 0.006566047668457031,
"learning_rate": 1.9993810422739496e-05,
"loss": -0.0255,
"reward": 5.501855969429016,
"reward_std": 1.1074179112911224,
"rewards/mrr_reward": 0.2869729772210121,
"rewards/rank_analyze_format_reward": 0.5554128363728523,
"rewards/rank_answer_foramt_reward": 0.8359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9969318807125092,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9969318807125092,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 559.140625,
"epoch": 1.232,
"grad_norm": 0.0336473323404789,
"kl": 0.006374359130859375,
"learning_rate": 1.999372169369904e-05,
"loss": -0.0304,
"reward": 7.210146188735962,
"reward_std": 1.3585944771766663,
"rewards/mrr_reward": 0.7406249940395355,
"rewards/rank_analyze_format_reward": 0.4076874777674675,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 565.015625,
"epoch": 1.24,
"grad_norm": 0.03243670612573624,
"kl": 0.0064258575439453125,
"learning_rate": 1.999363233340048e-05,
"loss": 0.0124,
"reward": 6.775290489196777,
"reward_std": 1.6960014998912811,
"rewards/mrr_reward": 0.6491319388151169,
"rewards/rank_analyze_format_reward": 0.3369657965376973,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 567.671875,
"epoch": 1.248,
"grad_norm": 0.03587043285369873,
"kl": 0.008923530578613281,
"learning_rate": 1.9993542341849462e-05,
"loss": -0.0172,
"reward": 6.484335541725159,
"reward_std": 1.4846598207950592,
"rewards/mrr_reward": 0.545331098139286,
"rewards/rank_analyze_format_reward": 0.5122500844299793,
"rewards/rank_answer_foramt_reward": 0.822265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959664940834045,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9959664940834045,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 541.015625,
"epoch": 1.256,
"grad_norm": 0.03247671574354172,
"kl": 0.006220817565917969,
"learning_rate": 1.9993451719051663e-05,
"loss": -0.0057,
"reward": 6.91133987903595,
"reward_std": 1.076777160167694,
"rewards/mrr_reward": 0.6208333373069763,
"rewards/rank_analyze_format_reward": 0.4648074358701706,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 536.0,
"epoch": 1.264,
"grad_norm": 0.033995021134614944,
"kl": 0.006260871887207031,
"learning_rate": 1.999336046501281e-05,
"loss": -0.0107,
"reward": 6.491420269012451,
"reward_std": 1.1248966604471207,
"rewards/mrr_reward": 0.5537760369479656,
"rewards/rank_analyze_format_reward": 0.38733571022748947,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 537.09375,
"epoch": 1.272,
"grad_norm": 0.03796149790287018,
"kl": 0.008052825927734375,
"learning_rate": 1.999326857973867e-05,
"loss": -0.0482,
"reward": 7.359132528305054,
"reward_std": 1.4535967111587524,
"rewards/mrr_reward": 0.7499999850988388,
"rewards/rank_analyze_format_reward": 0.4489763230085373,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 564.671875,
"epoch": 1.28,
"grad_norm": 0.036241207271814346,
"kl": 0.0068416595458984375,
"learning_rate": 1.9993176063235046e-05,
"loss": -0.0176,
"reward": 7.0445317029953,
"reward_std": 1.626471757888794,
"rewards/mrr_reward": 0.662822425365448,
"rewards/rank_analyze_format_reward": 0.5297309085726738,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 579.453125,
"epoch": 1.288,
"grad_norm": 0.037075724452733994,
"kl": 0.006366729736328125,
"learning_rate": 1.9993082915507776e-05,
"loss": -0.0144,
"reward": 6.376061797142029,
"reward_std": 1.2733474969863892,
"rewards/mrr_reward": 0.5289496555924416,
"rewards/rank_analyze_format_reward": 0.42237265408039093,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 1.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 577.296875,
"epoch": 1.296,
"grad_norm": 0.03217633441090584,
"kl": 0.006763458251953125,
"learning_rate": 1.999298913656275e-05,
"loss": -0.0085,
"reward": 6.5947242975234985,
"reward_std": 1.2779672592878342,
"rewards/mrr_reward": 0.5658544301986694,
"rewards/rank_analyze_format_reward": 0.4991602599620819,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 527.328125,
"epoch": 1.304,
"grad_norm": 0.03425245359539986,
"kl": 0.007500648498535156,
"learning_rate": 1.9992894726405894e-05,
"loss": -0.0124,
"reward": 6.557482957839966,
"reward_std": 1.4475017786026,
"rewards/mrr_reward": 0.5980902686715126,
"rewards/rank_analyze_format_reward": 0.2979341112077236,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 564.59375,
"epoch": 1.312,
"grad_norm": 0.03478895127773285,
"kl": 0.007842063903808594,
"learning_rate": 1.9992799685043165e-05,
"loss": -0.0553,
"reward": 6.300098657608032,
"reward_std": 1.0886222496628761,
"rewards/mrr_reward": 0.48148561269044876,
"rewards/rank_analyze_format_reward": 0.5164258703589439,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 568.953125,
"epoch": 1.32,
"grad_norm": 0.0446629598736763,
"kl": 0.006804466247558594,
"learning_rate": 1.999270401248057e-05,
"loss": -0.0217,
"reward": 6.716991662979126,
"reward_std": 1.4165300726890564,
"rewards/mrr_reward": 0.5955481305718422,
"rewards/rank_analyze_format_reward": 0.43624673783779144,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 584.984375,
"epoch": 1.328,
"grad_norm": 0.03580395132303238,
"kl": 0.008753776550292969,
"learning_rate": 1.999260770872415e-05,
"loss": 0.0004,
"reward": 5.964340448379517,
"reward_std": 1.1115762144327164,
"rewards/mrr_reward": 0.3914806619286537,
"rewards/rank_analyze_format_reward": 0.5389279127120972,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 571.453125,
"epoch": 1.336,
"grad_norm": 0.03663269430398941,
"kl": 0.007559776306152344,
"learning_rate": 1.999251077377999e-05,
"loss": -0.0458,
"reward": 6.374191999435425,
"reward_std": 1.2914250791072845,
"rewards/mrr_reward": 0.4921874962747097,
"rewards/rank_analyze_format_reward": 0.5749406069517136,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 573.96875,
"epoch": 1.3439999999999999,
"grad_norm": 0.033142536878585815,
"kl": 0.0076465606689453125,
"learning_rate": 1.999241320765421e-05,
"loss": -0.0188,
"reward": 6.285339713096619,
"reward_std": 1.369349867105484,
"rewards/mrr_reward": 0.4851934462785721,
"rewards/rank_analyze_format_reward": 0.479331374168396,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 547.984375,
"epoch": 1.3519999999999999,
"grad_norm": 0.03498915210366249,
"kl": 0.008561134338378906,
"learning_rate": 1.9992315010352978e-05,
"loss": -0.0274,
"reward": 6.904844880104065,
"reward_std": 1.2574369013309479,
"rewards/mrr_reward": 0.6432291716337204,
"rewards/rank_analyze_format_reward": 0.44520963728427887,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 576.765625,
"epoch": 1.3599999999999999,
"grad_norm": 0.03831981122493744,
"kl": 0.010663986206054688,
"learning_rate": 1.9992216181882492e-05,
"loss": -0.0089,
"reward": 6.317743182182312,
"reward_std": 1.1524057537317276,
"rewards/mrr_reward": 0.4661892428994179,
"rewards/rank_analyze_format_reward": 0.5786355137825012,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977221935987473,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9977221935987473,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 549.5625,
"epoch": 1.3679999999999999,
"grad_norm": 0.03543487936258316,
"kl": 0.008349418640136719,
"learning_rate": 1.9992116722248997e-05,
"loss": 0.009,
"reward": 6.215874433517456,
"reward_std": 1.5494773089885712,
"rewards/mrr_reward": 0.5115203410387039,
"rewards/rank_analyze_format_reward": 0.3291758671402931,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 563.40625,
"epoch": 1.376,
"grad_norm": 0.03732339292764664,
"kl": 0.00754547119140625,
"learning_rate": 1.9992016631458774e-05,
"loss": -0.0044,
"reward": 6.333064913749695,
"reward_std": 1.5073265135288239,
"rewards/mrr_reward": 0.5336371585726738,
"rewards/rank_analyze_format_reward": 0.373980063945055,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9962525367736816,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9962525367736816,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 565.109375,
"epoch": 1.384,
"grad_norm": 0.03858262300491333,
"kl": 0.0084381103515625,
"learning_rate": 1.9991915909518146e-05,
"loss": -0.0484,
"reward": 6.43630588054657,
"reward_std": 1.1248457580804825,
"rewards/mrr_reward": 0.5476128421723843,
"rewards/rank_analyze_format_reward": 0.5014840885996819,
"rewards/rank_answer_foramt_reward": 0.779296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 603.625,
"epoch": 1.392,
"grad_norm": 0.031945351511240005,
"kl": 0.008295059204101562,
"learning_rate": 1.9991814556433475e-05,
"loss": -0.0415,
"reward": 6.396109580993652,
"reward_std": 1.257490947842598,
"rewards/mrr_reward": 0.5233692973852158,
"rewards/rank_analyze_format_reward": 0.5277140513062477,
"rewards/rank_answer_foramt_reward": 0.830078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9919514656066895,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9919514656066895,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 547.390625,
"epoch": 1.4,
"grad_norm": 0.03459803760051727,
"kl": 0.009899139404296875,
"learning_rate": 1.9991712572211163e-05,
"loss": -0.0283,
"reward": 6.693902850151062,
"reward_std": 1.5040415227413177,
"rewards/mrr_reward": 0.6218750029802322,
"rewards/rank_analyze_format_reward": 0.38910815864801407,
"rewards/rank_answer_foramt_reward": 0.822265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.997514471411705,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.997514471411705,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 596.953125,
"epoch": 1.408,
"grad_norm": 0.03564726933836937,
"kl": 0.008817672729492188,
"learning_rate": 1.999160995685765e-05,
"loss": 0.0041,
"reward": 6.393037676811218,
"reward_std": 1.5305506885051727,
"rewards/mrr_reward": 0.4989583343267441,
"rewards/rank_analyze_format_reward": 0.5883020609617233,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 610.8125,
"epoch": 1.416,
"grad_norm": 0.0317256897687912,
"kl": 0.007786750793457031,
"learning_rate": 1.9991506710379424e-05,
"loss": -0.0038,
"reward": 6.927413702011108,
"reward_std": 1.1728498041629791,
"rewards/mrr_reward": 0.5874070003628731,
"rewards/rank_analyze_format_reward": 0.7316593676805496,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 614.015625,
"epoch": 1.424,
"grad_norm": 0.03363263979554176,
"kl": 0.006999015808105469,
"learning_rate": 1.9991402832783e-05,
"loss": -0.0222,
"reward": 6.334396123886108,
"reward_std": 1.1969702541828156,
"rewards/mrr_reward": 0.49082961305975914,
"rewards/rank_analyze_format_reward": 0.532570406794548,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9671052694320679,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 639.0625,
"epoch": 1.432,
"grad_norm": 0.0324893482029438,
"kl": 0.0071563720703125,
"learning_rate": 1.9991298324074942e-05,
"loss": -0.0215,
"reward": 6.33887255191803,
"reward_std": 1.0472588911652565,
"rewards/mrr_reward": 0.4644531235098839,
"rewards/rank_analyze_format_reward": 0.6554184406995773,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9817143976688385,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.996271014213562,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 597.53125,
"epoch": 1.44,
"grad_norm": 0.038431741297245026,
"kl": 0.0085906982421875,
"learning_rate": 1.999119318426185e-05,
"loss": -0.0425,
"reward": 5.978406071662903,
"reward_std": 1.4375847578048706,
"rewards/mrr_reward": 0.37400173395872116,
"rewards/rank_analyze_format_reward": 0.654976025223732,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9937897026538849,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9937897026538849,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 644.21875,
"epoch": 1.448,
"grad_norm": 0.03540867194533348,
"kl": 0.008702278137207031,
"learning_rate": 1.9991087413350367e-05,
"loss": 0.0273,
"reward": 7.00466001033783,
"reward_std": 1.6190518736839294,
"rewards/mrr_reward": 0.6167968884110451,
"rewards/rank_analyze_format_reward": 0.7251099199056625,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.012996495701372623,
"rewards/rank_initial_format_reward": 0.9817143976688385,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9817143976688385,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 575.96875,
"epoch": 1.456,
"grad_norm": 0.03581292927265167,
"kl": 0.009832382202148438,
"learning_rate": 1.9990981011347172e-05,
"loss": -0.0048,
"reward": 5.947044134140015,
"reward_std": 0.9751862585544586,
"rewards/mrr_reward": 0.3696366660296917,
"rewards/rank_analyze_format_reward": 0.6148670166730881,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 584.296875,
"epoch": 1.464,
"grad_norm": 0.0367310456931591,
"kl": 0.008490562438964844,
"learning_rate": 1.999087397825899e-05,
"loss": -0.0219,
"reward": 6.547907114028931,
"reward_std": 0.9392938762903214,
"rewards/mrr_reward": 0.5391679182648659,
"rewards/rank_analyze_format_reward": 0.5218650847673416,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 584.265625,
"epoch": 1.472,
"grad_norm": 0.036771222949028015,
"kl": 0.00975799560546875,
"learning_rate": 1.9990766314092575e-05,
"loss": 0.0093,
"reward": 7.504821062088013,
"reward_std": 1.017032966017723,
"rewards/mrr_reward": 0.7345609813928604,
"rewards/rank_analyze_format_reward": 0.6810796558856964,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 570.015625,
"epoch": 1.48,
"grad_norm": 0.035612449049949646,
"kl": 0.009767532348632812,
"learning_rate": 1.9990658018854737e-05,
"loss": -0.0192,
"reward": 6.572237730026245,
"reward_std": 1.1024248152971268,
"rewards/mrr_reward": 0.5541418492794037,
"rewards/rank_analyze_format_reward": 0.48548950254917145,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9966137856245041,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9809887856245041,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 610.625,
"epoch": 1.488,
"grad_norm": 0.03268010914325714,
"kl": 0.007953643798828125,
"learning_rate": 1.9990549092552307e-05,
"loss": -0.0163,
"reward": 7.752923250198364,
"reward_std": 1.1804132461547852,
"rewards/mrr_reward": 0.7671007066965103,
"rewards/rank_analyze_format_reward": 0.7255359292030334,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 618.6875,
"epoch": 1.496,
"grad_norm": 0.033477578312158585,
"kl": 0.010374069213867188,
"learning_rate": 1.999043953519217e-05,
"loss": -0.0446,
"reward": 6.951627135276794,
"reward_std": 1.142410233616829,
"rewards/mrr_reward": 0.5984498858451843,
"rewards/rank_analyze_format_reward": 0.6071917712688446,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977788031101227,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9977788031101227,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 591.640625,
"epoch": 1.504,
"grad_norm": 0.03295287489891052,
"kl": 0.008535385131835938,
"learning_rate": 1.999032934678125e-05,
"loss": -0.0228,
"reward": 6.217561841011047,
"reward_std": 0.885568305850029,
"rewards/mrr_reward": 0.4311321973800659,
"rewards/rank_analyze_format_reward": 0.5961254388093948,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 595.359375,
"epoch": 1.512,
"grad_norm": 0.03543277829885483,
"kl": 0.008122444152832031,
"learning_rate": 1.99902185273265e-05,
"loss": -0.0164,
"reward": 6.661153793334961,
"reward_std": 0.7280477955937386,
"rewards/mrr_reward": 0.5164248645305634,
"rewards/rank_analyze_format_reward": 0.623047724366188,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9969455003738403,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9969455003738403,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 626.109375,
"epoch": 1.52,
"grad_norm": 0.037641286849975586,
"kl": 0.008847236633300781,
"learning_rate": 1.999010707683492e-05,
"loss": -0.0658,
"reward": 6.347493886947632,
"reward_std": 0.9116277098655701,
"rewards/mrr_reward": 0.43297991901636124,
"rewards/rank_analyze_format_reward": 0.666047140955925,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 610.25,
"epoch": 1.528,
"grad_norm": 0.03490091487765312,
"kl": 0.009138107299804688,
"learning_rate": 1.998999499531356e-05,
"loss": -0.0516,
"reward": 7.269640564918518,
"reward_std": 0.6211766228079796,
"rewards/mrr_reward": 0.6727616675198078,
"rewards/rank_analyze_format_reward": 0.6237920597195625,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.011442550458014011,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 581.203125,
"epoch": 1.536,
"grad_norm": 0.03623748943209648,
"kl": 0.010578155517578125,
"learning_rate": 1.9989882282769485e-05,
"loss": -0.0328,
"reward": 6.117859721183777,
"reward_std": 1.3281791657209396,
"rewards/mrr_reward": 0.4266369119286537,
"rewards/rank_analyze_format_reward": 0.5988272428512573,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9980392158031464,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9980392158031464,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 631.859375,
"epoch": 1.544,
"grad_norm": 0.03717571124434471,
"kl": 0.012277603149414062,
"learning_rate": 1.9989768939209826e-05,
"loss": -0.0291,
"reward": 6.472392678260803,
"reward_std": 1.0950042307376862,
"rewards/mrr_reward": 0.4958333298563957,
"rewards/rank_analyze_format_reward": 0.7097622603178024,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 1.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 603.59375,
"epoch": 1.552,
"grad_norm": 0.031889960169792175,
"kl": 0.0109710693359375,
"learning_rate": 1.9989654964641737e-05,
"loss": -0.0297,
"reward": 6.880647420883179,
"reward_std": 0.8547341153025627,
"rewards/mrr_reward": 0.580071933567524,
"rewards/rank_analyze_format_reward": 0.6654053032398224,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 599.234375,
"epoch": 1.56,
"grad_norm": 0.036006029695272446,
"kl": 0.012359619140625,
"learning_rate": 1.998954035907242e-05,
"loss": -0.0148,
"reward": 6.577338814735413,
"reward_std": 1.2951306998729706,
"rewards/mrr_reward": 0.5316840335726738,
"rewards/rank_analyze_format_reward": 0.5455324053764343,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 631.953125,
"epoch": 1.568,
"grad_norm": 0.030870715156197548,
"kl": 0.0106658935546875,
"learning_rate": 1.9989425122509113e-05,
"loss": -0.0305,
"reward": 6.851738214492798,
"reward_std": 0.7111386805772781,
"rewards/mrr_reward": 0.5270833075046539,
"rewards/rank_analyze_format_reward": 0.784420520067215,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 620.390625,
"epoch": 1.576,
"grad_norm": 0.03541827201843262,
"kl": 0.011089324951171875,
"learning_rate": 1.9989309254959096e-05,
"loss": -0.0172,
"reward": 7.087416887283325,
"reward_std": 1.3555363416671753,
"rewards/mrr_reward": 0.6345486119389534,
"rewards/rank_analyze_format_reward": 0.7166584730148315,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992897808551788,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9836647808551788,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 621.65625,
"epoch": 1.584,
"grad_norm": 0.0363469123840332,
"kl": 0.011362075805664062,
"learning_rate": 1.998919275642968e-05,
"loss": 0.0444,
"reward": 6.63647723197937,
"reward_std": 1.5355401635169983,
"rewards/mrr_reward": 0.537413202226162,
"rewards/rank_analyze_format_reward": 0.7207763195037842,
"rewards/rank_answer_foramt_reward": 0.81640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 619.609375,
"epoch": 1.592,
"grad_norm": 0.0350794680416584,
"kl": 0.010587692260742188,
"learning_rate": 1.9989075626928237e-05,
"loss": -0.0324,
"reward": 7.593704700469971,
"reward_std": 1.2587448060512543,
"rewards/mrr_reward": 0.7476562410593033,
"rewards/rank_analyze_format_reward": 0.6766816079616547,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 641.125,
"epoch": 1.6,
"grad_norm": 0.03762039542198181,
"kl": 0.011552810668945312,
"learning_rate": 1.9988957866462155e-05,
"loss": 0.0012,
"reward": 6.556584358215332,
"reward_std": 0.7802992425858974,
"rewards/mrr_reward": 0.4782552234828472,
"rewards/rank_analyze_format_reward": 0.7131441533565521,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 615.515625,
"epoch": 1.608,
"grad_norm": 0.03529913350939751,
"kl": 0.011404037475585938,
"learning_rate": 1.998883947503888e-05,
"loss": -0.0285,
"reward": 6.747278928756714,
"reward_std": 0.8986479938030243,
"rewards/mrr_reward": 0.5536458566784859,
"rewards/rank_analyze_format_reward": 0.6645828187465668,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975329041481018,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9975329041481018,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 636.9375,
"epoch": 1.616,
"grad_norm": 0.03680592030286789,
"kl": 0.011167526245117188,
"learning_rate": 1.9988720452665885e-05,
"loss": -0.0142,
"reward": 7.523893117904663,
"reward_std": 1.5109763741493225,
"rewards/mrr_reward": 0.7254092246294022,
"rewards/rank_analyze_format_reward": 0.6940822452306747,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.013573232106864452,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9973393976688385,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 654.671875,
"epoch": 1.624,
"grad_norm": 0.031202631071209908,
"kl": 0.011119842529296875,
"learning_rate": 1.9988600799350685e-05,
"loss": -0.011,
"reward": 7.5892653465271,
"reward_std": 0.8718039393424988,
"rewards/mrr_reward": 0.7219122052192688,
"rewards/rank_analyze_format_reward": 0.7621632516384125,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 624.46875,
"epoch": 1.6320000000000001,
"grad_norm": 0.03513794392347336,
"kl": 0.011323928833007812,
"learning_rate": 1.998848051510085e-05,
"loss": -0.0116,
"reward": 7.873760461807251,
"reward_std": 0.9668747493997216,
"rewards/mrr_reward": 0.8035590276122093,
"rewards/rank_analyze_format_reward": 0.738672748208046,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 616.203125,
"epoch": 1.6400000000000001,
"grad_norm": 0.03514819219708443,
"kl": 0.012950897216796875,
"learning_rate": 1.9988359599923964e-05,
"loss": -0.0071,
"reward": 6.787094712257385,
"reward_std": 1.260214388370514,
"rewards/mrr_reward": 0.5561384037137032,
"rewards/rank_analyze_format_reward": 0.7126235961914062,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 650.375,
"epoch": 1.6480000000000001,
"grad_norm": 0.03228963539004326,
"kl": 0.012664794921875,
"learning_rate": 1.9988238053827677e-05,
"loss": -0.0375,
"reward": 7.256770491600037,
"reward_std": 0.48313772678375244,
"rewards/mrr_reward": 0.6615699455142021,
"rewards/rank_analyze_format_reward": 0.6667077392339706,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 667.546875,
"epoch": 1.6560000000000001,
"grad_norm": 0.03247380256652832,
"kl": 0.011510848999023438,
"learning_rate": 1.9988115876819654e-05,
"loss": -0.0066,
"reward": 7.226160883903503,
"reward_std": 0.7291913609951735,
"rewards/mrr_reward": 0.6235677003860474,
"rewards/rank_analyze_format_reward": 0.7994760870933533,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 617.203125,
"epoch": 1.6640000000000001,
"grad_norm": 0.03485206514596939,
"kl": 0.012271881103515625,
"learning_rate": 1.9987993068907624e-05,
"loss": -0.0256,
"reward": 6.819635629653931,
"reward_std": 1.4911159574985504,
"rewards/mrr_reward": 0.5687500089406967,
"rewards/rank_analyze_format_reward": 0.6510217636823654,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.014248084276914597,
"rewards/rank_initial_format_reward": 0.9963235259056091,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9963235259056091,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 640.609375,
"epoch": 1.6720000000000002,
"grad_norm": 0.03412061929702759,
"kl": 0.011951446533203125,
"learning_rate": 1.9987869630099333e-05,
"loss": -0.0183,
"reward": 7.066570281982422,
"reward_std": 1.0215441137552261,
"rewards/mrr_reward": 0.6142113208770752,
"rewards/rank_analyze_format_reward": 0.7064512819051743,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 618.9375,
"epoch": 1.6800000000000002,
"grad_norm": 0.035708099603652954,
"kl": 0.011415481567382812,
"learning_rate": 1.998774556040259e-05,
"loss": 0.0207,
"reward": 7.148289203643799,
"reward_std": 0.40048687532544136,
"rewards/mrr_reward": 0.6233135014772415,
"rewards/rank_analyze_format_reward": 0.6780079305171967,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 642.90625,
"epoch": 1.688,
"grad_norm": 0.03594611957669258,
"kl": 0.012083053588867188,
"learning_rate": 1.9987620859825225e-05,
"loss": 0.007,
"reward": 7.130272626876831,
"reward_std": 1.0038132444024086,
"rewards/mrr_reward": 0.5943328440189362,
"rewards/rank_analyze_format_reward": 0.8232535421848297,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 632.3125,
"epoch": 1.696,
"grad_norm": 0.03679274767637253,
"kl": 0.012105941772460938,
"learning_rate": 1.9987495528375115e-05,
"loss": 0.0071,
"reward": 7.324402451515198,
"reward_std": 1.0858530811965466,
"rewards/mrr_reward": 0.6619791686534882,
"rewards/rank_analyze_format_reward": 0.7507043033838272,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 633.578125,
"epoch": 1.704,
"grad_norm": 0.032590728253126144,
"kl": 0.011951446533203125,
"learning_rate": 1.998736956606018e-05,
"loss": -0.0204,
"reward": 7.353400826454163,
"reward_std": 1.2198131084442139,
"rewards/mrr_reward": 0.6886904761195183,
"rewards/rank_analyze_format_reward": 0.7275451272726059,
"rewards/rank_answer_foramt_reward": 0.92578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 648.125,
"epoch": 1.712,
"grad_norm": 0.03516772761940956,
"kl": 0.011865615844726562,
"learning_rate": 1.9987242972888368e-05,
"loss": -0.0256,
"reward": 6.390246629714966,
"reward_std": 1.2206433862447739,
"rewards/mrr_reward": 0.4318266250193119,
"rewards/rank_analyze_format_reward": 0.744662880897522,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 626.515625,
"epoch": 1.72,
"grad_norm": 0.034917186945676804,
"kl": 0.010568618774414062,
"learning_rate": 1.9987115748867685e-05,
"loss": -0.0075,
"reward": 7.013459086418152,
"reward_std": 1.1758202761411667,
"rewards/mrr_reward": 0.6146267428994179,
"rewards/rank_analyze_format_reward": 0.6679250225424767,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 645.75,
"epoch": 1.728,
"grad_norm": 0.03526122495532036,
"kl": 0.011056900024414062,
"learning_rate": 1.9986987894006164e-05,
"loss": -0.0348,
"reward": 7.004386067390442,
"reward_std": 1.013509213924408,
"rewards/mrr_reward": 0.6190104186534882,
"rewards/rank_analyze_format_reward": 0.6921346038579941,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.014774133451282978,
"rewards/rank_initial_format_reward": 0.9898194670677185,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9898194670677185,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 587.296875,
"epoch": 1.736,
"grad_norm": 0.03770997375249863,
"kl": 0.013156890869140625,
"learning_rate": 1.9986859408311878e-05,
"loss": -0.0243,
"reward": 7.723721385002136,
"reward_std": 1.2577708065509796,
"rewards/mrr_reward": 0.8122829794883728,
"rewards/rank_analyze_format_reward": 0.5527143776416779,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 625.390625,
"epoch": 1.744,
"grad_norm": 0.03594063222408295,
"kl": 0.01361083984375,
"learning_rate": 1.9986730291792945e-05,
"loss": -0.0125,
"reward": 6.763970732688904,
"reward_std": 1.1345993727445602,
"rewards/mrr_reward": 0.5692894533276558,
"rewards/rank_analyze_format_reward": 0.6561181470751762,
"rewards/rank_answer_foramt_reward": 0.857421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 595.8125,
"epoch": 1.752,
"grad_norm": 0.03884103149175644,
"kl": 0.012781143188476562,
"learning_rate": 1.9986600544457524e-05,
"loss": -0.0204,
"reward": 6.09786331653595,
"reward_std": 1.128006488084793,
"rewards/mrr_reward": 0.4502604268491268,
"rewards/rank_analyze_format_reward": 0.48041532188653946,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 611.0625,
"epoch": 1.76,
"grad_norm": 0.03623896837234497,
"kl": 0.01174163818359375,
"learning_rate": 1.9986470166313805e-05,
"loss": 0.0022,
"reward": 6.999427080154419,
"reward_std": 0.6746486648917198,
"rewards/mrr_reward": 0.608004704117775,
"rewards/rank_analyze_format_reward": 0.6940975040197372,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991554021835327,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835304021835327,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 640.984375,
"epoch": 1.768,
"grad_norm": 0.03681391850113869,
"kl": 0.01168060302734375,
"learning_rate": 1.9986339157370026e-05,
"loss": 0.0156,
"reward": 6.224501371383667,
"reward_std": 1.1424128413200378,
"rewards/mrr_reward": 0.4014260917901993,
"rewards/rank_analyze_format_reward": 0.7514945864677429,
"rewards/rank_answer_foramt_reward": 0.884765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 675.609375,
"epoch": 1.776,
"grad_norm": 0.035715728998184204,
"kl": 0.011419296264648438,
"learning_rate": 1.9986207517634466e-05,
"loss": -0.0075,
"reward": 6.838769316673279,
"reward_std": 1.138169839978218,
"rewards/mrr_reward": 0.5326946973800659,
"rewards/rank_analyze_format_reward": 0.8114955276250839,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9951225072145462,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9951225072145462,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 650.0625,
"epoch": 1.784,
"grad_norm": 0.03682604804635048,
"kl": 0.012578964233398438,
"learning_rate": 1.998607524711543e-05,
"loss": -0.024,
"reward": 6.9665446281433105,
"reward_std": 1.341919094324112,
"rewards/mrr_reward": 0.5829861015081406,
"rewards/rank_analyze_format_reward": 0.8279595226049423,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.984375,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 664.046875,
"epoch": 1.792,
"grad_norm": 0.03405497223138809,
"kl": 0.012273788452148438,
"learning_rate": 1.9985942345821285e-05,
"loss": 0.0101,
"reward": 7.542881608009338,
"reward_std": 0.9405869543552399,
"rewards/mrr_reward": 0.70331721752882,
"rewards/rank_analyze_format_reward": 0.8331284523010254,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 641.34375,
"epoch": 1.8,
"grad_norm": 0.03588543459773064,
"kl": 0.010999679565429688,
"learning_rate": 1.998580881376042e-05,
"loss": 0.0182,
"reward": 7.186712980270386,
"reward_std": 1.0480735301971436,
"rewards/mrr_reward": 0.6471106112003326,
"rewards/rank_analyze_format_reward": 0.7620245963335037,
"rewards/rank_answer_foramt_reward": 0.869140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 679.96875,
"epoch": 1.808,
"grad_norm": 0.03298197686672211,
"kl": 0.011339187622070312,
"learning_rate": 1.9985674650941265e-05,
"loss": -0.0075,
"reward": 6.580728888511658,
"reward_std": 1.171303242444992,
"rewards/mrr_reward": 0.49358879029750824,
"rewards/rank_analyze_format_reward": 0.7704363465309143,
"rewards/rank_answer_foramt_reward": 0.8984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.984375,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 641.984375,
"epoch": 1.8159999999999998,
"grad_norm": 0.032649777829647064,
"kl": 0.011322021484375,
"learning_rate": 1.9985539857372303e-05,
"loss": -0.0173,
"reward": 6.867309093475342,
"reward_std": 0.8678697645664215,
"rewards/mrr_reward": 0.557161457836628,
"rewards/rank_analyze_format_reward": 0.736319363117218,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 644.921875,
"epoch": 1.8239999999999998,
"grad_norm": 0.038027409464120865,
"kl": 0.011888504028320312,
"learning_rate": 1.998540443306204e-05,
"loss": 0.0094,
"reward": 6.406673431396484,
"reward_std": 1.36880823969841,
"rewards/mrr_reward": 0.47701510787010193,
"rewards/rank_analyze_format_reward": 0.7093206197023392,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 691.5625,
"epoch": 1.8319999999999999,
"grad_norm": 0.03739263862371445,
"kl": 0.016162872314453125,
"learning_rate": 1.998526837801904e-05,
"loss": -0.0163,
"reward": 6.16663670539856,
"reward_std": 0.7895801216363907,
"rewards/mrr_reward": 0.36532738618552685,
"rewards/rank_analyze_format_reward": 0.8101400434970856,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.013089364394545555,
"rewards/rank_initial_format_reward": 0.9976895451545715,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9820645451545715,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 646.640625,
"epoch": 1.8399999999999999,
"grad_norm": 0.03776266425848007,
"kl": 0.010849952697753906,
"learning_rate": 1.9985131692251887e-05,
"loss": 0.0068,
"reward": 6.760786771774292,
"reward_std": 1.123057559132576,
"rewards/mrr_reward": 0.5368923768401146,
"rewards/rank_analyze_format_reward": 0.7293006330728531,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985989332199097,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9985989332199097,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 613.015625,
"epoch": 1.8479999999999999,
"grad_norm": 0.03792120888829231,
"kl": 0.01201629638671875,
"learning_rate": 1.9984994375769222e-05,
"loss": -0.0071,
"reward": 7.100589036941528,
"reward_std": 1.1812313869595528,
"rewards/mrr_reward": 0.6353298723697662,
"rewards/rank_analyze_format_reward": 0.6587639302015305,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 645.921875,
"epoch": 1.8559999999999999,
"grad_norm": 0.034471407532691956,
"kl": 0.012517929077148438,
"learning_rate": 1.9984856428579717e-05,
"loss": -0.0154,
"reward": 7.1253886222839355,
"reward_std": 0.9221947491168976,
"rewards/mrr_reward": 0.6032862067222595,
"rewards/rank_analyze_format_reward": 0.8059941083192825,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 621.453125,
"epoch": 1.8639999999999999,
"grad_norm": 0.03487012907862663,
"kl": 0.010175704956054688,
"learning_rate": 1.998471785069208e-05,
"loss": -0.0252,
"reward": 7.108256816864014,
"reward_std": 1.0599358081817627,
"rewards/mrr_reward": 0.6225880309939384,
"rewards/rank_analyze_format_reward": 0.7038420140743256,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 614.71875,
"epoch": 1.8719999999999999,
"grad_norm": 0.03678525239229202,
"kl": 0.011201858520507812,
"learning_rate": 1.9984578642115072e-05,
"loss": -0.0072,
"reward": 7.174077749252319,
"reward_std": 1.0892403870821,
"rewards/mrr_reward": 0.6339843720197678,
"rewards/rank_analyze_format_reward": 0.7592339366674423,
"rewards/rank_answer_foramt_reward": 0.92578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 626.484375,
"epoch": 1.88,
"grad_norm": 0.03547394275665283,
"kl": 0.012744903564453125,
"learning_rate": 1.998443880285748e-05,
"loss": -0.0371,
"reward": 7.188539266586304,
"reward_std": 1.558995470404625,
"rewards/mrr_reward": 0.6575520932674408,
"rewards/rank_analyze_format_reward": 0.7216200232505798,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9828085899353027,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9828085899353027,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 612.84375,
"epoch": 1.888,
"grad_norm": 0.03983930125832558,
"kl": 0.011608123779296875,
"learning_rate": 1.9984298332928142e-05,
"loss": -0.0087,
"reward": 7.840075254440308,
"reward_std": 1.4074196517467499,
"rewards/mrr_reward": 0.8069010525941849,
"rewards/rank_analyze_format_reward": 0.7511429786682129,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.96875,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 608.953125,
"epoch": 1.896,
"grad_norm": 0.03890369087457657,
"kl": 0.012737274169921875,
"learning_rate": 1.9984157232335926e-05,
"loss": -0.0036,
"reward": 6.91395902633667,
"reward_std": 1.4606387615203857,
"rewards/mrr_reward": 0.5853298753499985,
"rewards/rank_analyze_format_reward": 0.6925735026597977,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9966736733913422,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9966736733913422,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 626.671875,
"epoch": 1.904,
"grad_norm": 0.03019222430884838,
"kl": 0.010616302490234375,
"learning_rate": 1.998401550108975e-05,
"loss": -0.0175,
"reward": 7.32897675037384,
"reward_std": 0.9665245488286018,
"rewards/mrr_reward": 0.676432304084301,
"rewards/rank_analyze_format_reward": 0.7130914330482483,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 636.8125,
"epoch": 1.912,
"grad_norm": 0.03469119966030121,
"kl": 0.011953353881835938,
"learning_rate": 1.9983873139198565e-05,
"loss": 0.0037,
"reward": 6.612988352775574,
"reward_std": 1.0446814224123955,
"rewards/mrr_reward": 0.47405755519866943,
"rewards/rank_analyze_format_reward": 0.7889089584350586,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 621.203125,
"epoch": 1.92,
"grad_norm": 0.03232351318001747,
"kl": 0.0106048583984375,
"learning_rate": 1.9983730146671363e-05,
"loss": -0.0148,
"reward": 6.731534361839294,
"reward_std": 1.2606956362724304,
"rewards/mrr_reward": 0.5494357720017433,
"rewards/rank_analyze_format_reward": 0.6778992190957069,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 606.515625,
"epoch": 1.928,
"grad_norm": 0.03420478478074074,
"kl": 0.01219940185546875,
"learning_rate": 1.9983586523517175e-05,
"loss": -0.0438,
"reward": 7.590452075004578,
"reward_std": 1.6388859748840332,
"rewards/mrr_reward": 0.7669270932674408,
"rewards/rank_analyze_format_reward": 0.6672752201557159,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 636.84375,
"epoch": 1.936,
"grad_norm": 0.03276892751455307,
"kl": 0.01093292236328125,
"learning_rate": 1.9983442269745073e-05,
"loss": -0.0257,
"reward": 6.300868988037109,
"reward_std": 0.995959609746933,
"rewards/mrr_reward": 0.4575396776199341,
"rewards/rank_analyze_format_reward": 0.6771672368049622,
"rewards/rank_answer_foramt_reward": 0.818359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9954044073820114,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9954044073820114,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 617.359375,
"epoch": 1.944,
"grad_norm": 0.03749570995569229,
"kl": 0.010999679565429688,
"learning_rate": 1.9983297385364166e-05,
"loss": -0.0007,
"reward": 7.169430136680603,
"reward_std": 1.120530128479004,
"rewards/mrr_reward": 0.6705729141831398,
"rewards/rank_analyze_format_reward": 0.6551071107387543,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 613.453125,
"epoch": 1.952,
"grad_norm": 0.04473373666405678,
"kl": 0.01145172119140625,
"learning_rate": 1.9983151870383614e-05,
"loss": -0.0107,
"reward": 6.484450101852417,
"reward_std": 1.0988103747367859,
"rewards/mrr_reward": 0.46861979365348816,
"rewards/rank_analyze_format_reward": 0.764109417796135,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9932432472705841,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9776182472705841,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 621.328125,
"epoch": 1.96,
"grad_norm": 0.03367482125759125,
"kl": 0.013071060180664062,
"learning_rate": 1.99830057248126e-05,
"loss": -0.0296,
"reward": 6.685883641242981,
"reward_std": 0.9533030688762665,
"rewards/mrr_reward": 0.5285590291023254,
"rewards/rank_analyze_format_reward": 0.6315773874521255,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 655.34375,
"epoch": 1.968,
"grad_norm": 0.034726452082395554,
"kl": 0.010492324829101562,
"learning_rate": 1.9982858948660363e-05,
"loss": -0.0181,
"reward": 6.672136902809143,
"reward_std": 1.0319916605949402,
"rewards/mrr_reward": 0.48452381789684296,
"rewards/rank_analyze_format_reward": 0.7848227173089981,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 612.4375,
"epoch": 1.976,
"grad_norm": 0.03818318620324135,
"kl": 0.00988006591796875,
"learning_rate": 1.9982711541936167e-05,
"loss": -0.0117,
"reward": 7.333935976028442,
"reward_std": 1.081397719681263,
"rewards/mrr_reward": 0.6711309552192688,
"rewards/rank_analyze_format_reward": 0.7507446557283401,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 619.75,
"epoch": 1.984,
"grad_norm": 0.09436666965484619,
"kl": 0.037281036376953125,
"learning_rate": 1.9982563504649327e-05,
"loss": -0.0099,
"reward": 7.042810320854187,
"reward_std": 1.4771567583084106,
"rewards/mrr_reward": 0.6272321417927742,
"rewards/rank_analyze_format_reward": 0.7505638301372528,
"rewards/rank_answer_foramt_reward": 0.80859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 593.03125,
"epoch": 1.992,
"grad_norm": 0.040008366107940674,
"kl": 0.011211395263671875,
"learning_rate": 1.998241483680919e-05,
"loss": 0.0073,
"reward": 6.97391951084137,
"reward_std": 1.2818303257226944,
"rewards/mrr_reward": 0.5991319715976715,
"rewards/rank_analyze_format_reward": 0.7252494841814041,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9827118366956711,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 644.59375,
"epoch": 2.0,
"grad_norm": 0.03942989930510521,
"kl": 0.011959075927734375,
"learning_rate": 1.9982265538425157e-05,
"loss": 0.0371,
"reward": 6.234715461730957,
"reward_std": 1.436354637145996,
"rewards/mrr_reward": 0.47746776789426804,
"rewards/rank_analyze_format_reward": 0.5933748111128807,
"rewards/rank_answer_foramt_reward": 0.818359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9800696671009064,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9799154698848724,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 636.46875,
"epoch": 2.008,
"grad_norm": 0.03840762376785278,
"kl": 0.01082611083984375,
"learning_rate": 1.9982115609506648e-05,
"loss": -0.0149,
"reward": 7.465001344680786,
"reward_std": 1.3534227311611176,
"rewards/mrr_reward": 0.701078861951828,
"rewards/rank_analyze_format_reward": 0.73264279961586,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 623.4375,
"epoch": 2.016,
"grad_norm": 0.03643304482102394,
"kl": 0.0111846923828125,
"learning_rate": 1.9981965050063134e-05,
"loss": 0.0095,
"reward": 6.563894629478455,
"reward_std": 1.0918782949447632,
"rewards/mrr_reward": 0.49427083879709244,
"rewards/rank_analyze_format_reward": 0.6970945447683334,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975927919149399,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9975927919149399,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 623.140625,
"epoch": 2.024,
"grad_norm": 0.03906136751174927,
"kl": 0.011167526245117188,
"learning_rate": 1.998181386010413e-05,
"loss": 0.0076,
"reward": 7.883460879325867,
"reward_std": 0.9759941548109055,
"rewards/mrr_reward": 0.7747395783662796,
"rewards/rank_analyze_format_reward": 0.8154443502426147,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 589.5,
"epoch": 2.032,
"grad_norm": 0.035734061151742935,
"kl": 0.014560699462890625,
"learning_rate": 1.9981662039639182e-05,
"loss": -0.0189,
"reward": 7.1975014209747314,
"reward_std": 1.0746060460805893,
"rewards/mrr_reward": 0.6796006858348846,
"rewards/rank_analyze_format_reward": 0.5943331569433212,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 608.125,
"epoch": 2.04,
"grad_norm": 0.035253558307886124,
"kl": 0.011186599731445312,
"learning_rate": 1.9981509588677883e-05,
"loss": -0.0403,
"reward": 6.368244171142578,
"reward_std": 0.9275897480547428,
"rewards/mrr_reward": 0.43844248354434967,
"rewards/rank_analyze_format_reward": 0.7043182849884033,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 593.515625,
"epoch": 2.048,
"grad_norm": 0.03972550854086876,
"kl": 0.012750625610351562,
"learning_rate": 1.9981356507229862e-05,
"loss": -0.0269,
"reward": 6.800292491912842,
"reward_std": 1.1689245849847794,
"rewards/mrr_reward": 0.5714161694049835,
"rewards/rank_analyze_format_reward": 0.627255916595459,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 593.203125,
"epoch": 2.056,
"grad_norm": 0.03649269416928291,
"kl": 0.009584426879882812,
"learning_rate": 1.9981202795304787e-05,
"loss": -0.0051,
"reward": 7.230230689048767,
"reward_std": 1.2953073680400848,
"rewards/mrr_reward": 0.6908172070980072,
"rewards/rank_analyze_format_reward": 0.6150907501578331,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 640.5,
"epoch": 2.064,
"grad_norm": 0.03631464019417763,
"kl": 0.010379791259765625,
"learning_rate": 1.9981048452912364e-05,
"loss": 0.0223,
"reward": 6.423146486282349,
"reward_std": 1.1042785942554474,
"rewards/mrr_reward": 0.46945685893297195,
"rewards/rank_analyze_format_reward": 0.7502825409173965,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 601.90625,
"epoch": 2.072,
"grad_norm": 0.03482425957918167,
"kl": 0.011255264282226562,
"learning_rate": 1.998089348006235e-05,
"loss": -0.0123,
"reward": 6.214681625366211,
"reward_std": 1.3232944011688232,
"rewards/mrr_reward": 0.4206349328160286,
"rewards/rank_analyze_format_reward": 0.6760562360286713,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 607.171875,
"epoch": 2.08,
"grad_norm": 0.03515848517417908,
"kl": 0.008946418762207031,
"learning_rate": 1.998073787676453e-05,
"loss": -0.0182,
"reward": 6.849403977394104,
"reward_std": 1.1705361306667328,
"rewards/mrr_reward": 0.5722842365503311,
"rewards/rank_analyze_format_reward": 0.6880913898348808,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9976112246513367,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9976112246513367,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 625.765625,
"epoch": 2.088,
"grad_norm": 0.033137645572423935,
"kl": 0.010486602783203125,
"learning_rate": 1.9980581643028732e-05,
"loss": -0.0257,
"reward": 6.725158452987671,
"reward_std": 0.918092668056488,
"rewards/mrr_reward": 0.5186383947730064,
"rewards/rank_analyze_format_reward": 0.7032241895794868,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 596.21875,
"epoch": 2.096,
"grad_norm": 0.03581111505627632,
"kl": 0.011474609375,
"learning_rate": 1.9980424778864825e-05,
"loss": -0.028,
"reward": 6.540898442268372,
"reward_std": 1.0225854963064194,
"rewards/mrr_reward": 0.4951760917901993,
"rewards/rank_analyze_format_reward": 0.618479423224926,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 587.578125,
"epoch": 2.104,
"grad_norm": 0.033393606543540955,
"kl": 0.009927749633789062,
"learning_rate": 1.9980267284282718e-05,
"loss": -0.0212,
"reward": 7.45247495174408,
"reward_std": 0.4263784661889076,
"rewards/mrr_reward": 0.7192708477377892,
"rewards/rank_analyze_format_reward": 0.5848489105701447,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 607.234375,
"epoch": 2.112,
"grad_norm": 0.034759897738695145,
"kl": 0.008701324462890625,
"learning_rate": 1.998010915929236e-05,
"loss": -0.0146,
"reward": 7.091454982757568,
"reward_std": 0.9185773134231567,
"rewards/mrr_reward": 0.6087363660335541,
"rewards/rank_analyze_format_reward": 0.6858063042163849,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 601.015625,
"epoch": 2.12,
"grad_norm": 0.03577468544244766,
"kl": 0.01145172119140625,
"learning_rate": 1.9979950403903732e-05,
"loss": -0.0014,
"reward": 6.77937126159668,
"reward_std": 1.279131755232811,
"rewards/mrr_reward": 0.563430055975914,
"rewards/rank_analyze_format_reward": 0.6385087594389915,
"rewards/rank_answer_foramt_reward": 0.8984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 559.375,
"epoch": 2.128,
"grad_norm": 0.03831040486693382,
"kl": 0.010589599609375,
"learning_rate": 1.9979791018126874e-05,
"loss": -0.0106,
"reward": 6.678526520729065,
"reward_std": 1.4866646826267242,
"rewards/mrr_reward": 0.5484995096921921,
"rewards/rank_analyze_format_reward": 0.595856636762619,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 583.3125,
"epoch": 2.136,
"grad_norm": 0.0412379652261734,
"kl": 0.010957717895507812,
"learning_rate": 1.9979631001971848e-05,
"loss": -0.0116,
"reward": 7.416189789772034,
"reward_std": 1.0926668643951416,
"rewards/mrr_reward": 0.7192708253860474,
"rewards/rank_analyze_format_reward": 0.6211378127336502,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 609.828125,
"epoch": 2.144,
"grad_norm": 0.03350284695625305,
"kl": 0.008955001831054688,
"learning_rate": 1.9979470355448756e-05,
"loss": -0.0158,
"reward": 7.620032906532288,
"reward_std": 0.6238258853554726,
"rewards/mrr_reward": 0.7218749970197678,
"rewards/rank_analyze_format_reward": 0.7774548083543777,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 592.359375,
"epoch": 2.152,
"grad_norm": 0.03770367428660393,
"kl": 0.011816024780273438,
"learning_rate": 1.9979309078567756e-05,
"loss": -0.0043,
"reward": 6.694323897361755,
"reward_std": 1.3028307557106018,
"rewards/mrr_reward": 0.5659226104617119,
"rewards/rank_analyze_format_reward": 0.5534504503011703,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 606.828125,
"epoch": 2.16,
"grad_norm": 0.03764275088906288,
"kl": 0.009786605834960938,
"learning_rate": 1.9979147171339022e-05,
"loss": -0.019,
"reward": 6.99415135383606,
"reward_std": 1.3437075316905975,
"rewards/mrr_reward": 0.6053075417876244,
"rewards/rank_analyze_format_reward": 0.6764369979500771,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 595.4375,
"epoch": 2.168,
"grad_norm": 0.03837438300251961,
"kl": 0.011205673217773438,
"learning_rate": 1.9978984633772795e-05,
"loss": -0.0289,
"reward": 5.901566505432129,
"reward_std": 0.9236202016472816,
"rewards/mrr_reward": 0.35381324775516987,
"rewards/rank_analyze_format_reward": 0.59522345662117,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 575.375,
"epoch": 2.176,
"grad_norm": 0.04059956222772598,
"kl": 0.011159896850585938,
"learning_rate": 1.9978821465879332e-05,
"loss": -0.0362,
"reward": 6.7173460721969604,
"reward_std": 0.7962133586406708,
"rewards/mrr_reward": 0.5370783656835556,
"rewards/rank_analyze_format_reward": 0.6237201392650604,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 570.328125,
"epoch": 2.184,
"grad_norm": 0.03902539238333702,
"kl": 0.0102081298828125,
"learning_rate": 1.9978657667668945e-05,
"loss": -0.032,
"reward": 6.786892771720886,
"reward_std": 1.4763158559799194,
"rewards/mrr_reward": 0.6156250163912773,
"rewards/rank_analyze_format_reward": 0.49029337987303734,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 632.28125,
"epoch": 2.192,
"grad_norm": 0.035940494388341904,
"kl": 0.012010574340820312,
"learning_rate": 1.9978493239151976e-05,
"loss": -0.0052,
"reward": 7.241865515708923,
"reward_std": 1.5207486748695374,
"rewards/mrr_reward": 0.6480902805924416,
"rewards/rank_analyze_format_reward": 0.7944418787956238,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968671798706055,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9968671798706055,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 616.765625,
"epoch": 2.2,
"grad_norm": 0.03509166091680527,
"kl": 0.013257980346679688,
"learning_rate": 1.997832818033881e-05,
"loss": 0.0139,
"reward": 6.9878867864608765,
"reward_std": 1.2263060361146927,
"rewards/mrr_reward": 0.592051088809967,
"rewards/rank_analyze_format_reward": 0.7836297750473022,
"rewards/rank_answer_foramt_reward": 0.884765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 574.640625,
"epoch": 2.208,
"grad_norm": 0.03720112144947052,
"kl": 0.013032913208007812,
"learning_rate": 1.9978162491239882e-05,
"loss": -0.0178,
"reward": 7.190923571586609,
"reward_std": 1.171968013048172,
"rewards/mrr_reward": 0.640625,
"rewards/rank_analyze_format_reward": 0.6909236311912537,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 613.5625,
"epoch": 2.216,
"grad_norm": 0.04091575741767883,
"kl": 0.012386322021484375,
"learning_rate": 1.997799617186565e-05,
"loss": -0.003,
"reward": 6.570623397827148,
"reward_std": 1.0336104482412338,
"rewards/mrr_reward": 0.48072298616170883,
"rewards/rank_analyze_format_reward": 0.7270140051841736,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9994212985038757,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9994212985038757,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 622.53125,
"epoch": 2.224,
"grad_norm": 0.03800741583108902,
"kl": 0.01259613037109375,
"learning_rate": 1.9977829222226622e-05,
"loss": -0.0266,
"reward": 6.372930645942688,
"reward_std": 0.8913363832980394,
"rewards/mrr_reward": 0.46861979365348816,
"rewards/rank_analyze_format_reward": 0.731744721531868,
"rewards/rank_answer_foramt_reward": 0.802734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9976112246513367,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9819862246513367,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 608.8125,
"epoch": 2.232,
"grad_norm": 0.0358574353158474,
"kl": 0.012134552001953125,
"learning_rate": 1.9977661642333344e-05,
"loss": -0.0335,
"reward": 6.156337261199951,
"reward_std": 1.1192015409469604,
"rewards/mrr_reward": 0.4033792242407799,
"rewards/rank_analyze_format_reward": 0.7136551886796951,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9966137856245041,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9966137856245041,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 590.6875,
"epoch": 2.24,
"grad_norm": 0.03665808215737343,
"kl": 0.012432098388671875,
"learning_rate": 1.99774934321964e-05,
"loss": -0.0148,
"reward": 7.189491271972656,
"reward_std": 1.3065388202667236,
"rewards/mrr_reward": 0.682291679084301,
"rewards/rank_analyze_format_reward": 0.6308017671108246,
"rewards/rank_answer_foramt_reward": 0.8828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967927634716034,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9811677634716034,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 632.296875,
"epoch": 2.248,
"grad_norm": 0.039501260966062546,
"kl": 0.010639190673828125,
"learning_rate": 1.9977324591826415e-05,
"loss": -0.0105,
"reward": 6.4820040464401245,
"reward_std": 1.1038605086505413,
"rewards/mrr_reward": 0.45491691678762436,
"rewards/rank_analyze_format_reward": 0.7659522593021393,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.012876884080469608,
"rewards/rank_initial_format_reward": 0.9974177181720734,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9974177181720734,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 621.796875,
"epoch": 2.2560000000000002,
"grad_norm": 0.040637850761413574,
"kl": 0.012548446655273438,
"learning_rate": 1.9977155121234056e-05,
"loss": 0.008,
"reward": 6.498598098754883,
"reward_std": 1.4399305284023285,
"rewards/mrr_reward": 0.4924045279622078,
"rewards/rank_analyze_format_reward": 0.7026933282613754,
"rewards/rank_answer_foramt_reward": 0.8359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 622.609375,
"epoch": 2.2640000000000002,
"grad_norm": 0.039998337626457214,
"kl": 0.01102447509765625,
"learning_rate": 1.9976985020430022e-05,
"loss": 0.0019,
"reward": 6.484407901763916,
"reward_std": 0.9918918311595917,
"rewards/mrr_reward": 0.4627170190215111,
"rewards/rank_analyze_format_reward": 0.7155710011720657,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 634.28125,
"epoch": 2.2720000000000002,
"grad_norm": 0.034088097512722015,
"kl": 0.0094451904296875,
"learning_rate": 1.9976814289425066e-05,
"loss": 0.0066,
"reward": 6.765654683113098,
"reward_std": 1.0432685762643814,
"rewards/mrr_reward": 0.5332837402820587,
"rewards/rank_analyze_format_reward": 0.6904967427253723,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 620.1875,
"epoch": 2.2800000000000002,
"grad_norm": 0.03567035123705864,
"kl": 0.015941619873046875,
"learning_rate": 1.9976642928229965e-05,
"loss": -0.0143,
"reward": 7.0589940547943115,
"reward_std": 0.7747539728879929,
"rewards/mrr_reward": 0.5898003429174423,
"rewards/rank_analyze_format_reward": 0.7460509389638901,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 640.59375,
"epoch": 2.288,
"grad_norm": 0.033182136714458466,
"kl": 0.009305953979492188,
"learning_rate": 1.997647093685555e-05,
"loss": 0.0029,
"reward": 7.651683449745178,
"reward_std": 0.4577641859650612,
"rewards/mrr_reward": 0.7307725697755814,
"rewards/rank_analyze_format_reward": 0.7285931408405304,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 589.5,
"epoch": 2.296,
"grad_norm": 0.03666054829955101,
"kl": 0.010992050170898438,
"learning_rate": 1.9976298315312675e-05,
"loss": -0.0206,
"reward": 7.6038994789123535,
"reward_std": 1.4697020053863525,
"rewards/mrr_reward": 0.7263020724058151,
"rewards/rank_analyze_format_reward": 0.7533785998821259,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 636.65625,
"epoch": 2.304,
"grad_norm": 0.03196245804429054,
"kl": 0.009922027587890625,
"learning_rate": 1.9976125063612254e-05,
"loss": -0.0084,
"reward": 7.176369905471802,
"reward_std": 1.0738315135240555,
"rewards/mrr_reward": 0.6143229156732559,
"rewards/rank_analyze_format_reward": 0.7952503263950348,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 575.3125,
"epoch": 2.312,
"grad_norm": 0.036641672253608704,
"kl": 0.01151275634765625,
"learning_rate": 1.9975951181765226e-05,
"loss": -0.0135,
"reward": 6.732638239860535,
"reward_std": 1.1722622215747833,
"rewards/mrr_reward": 0.5541604608297348,
"rewards/rank_analyze_format_reward": 0.6565065011382103,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 620.125,
"epoch": 2.32,
"grad_norm": 0.03352293372154236,
"kl": 0.00873565673828125,
"learning_rate": 1.9975776669782572e-05,
"loss": -0.0098,
"reward": 7.056705951690674,
"reward_std": 0.74837876111269,
"rewards/mrr_reward": 0.5602182596921921,
"rewards/rank_analyze_format_reward": 0.8049077540636063,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.012763278558850288,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 594.203125,
"epoch": 2.328,
"grad_norm": 0.03776485472917557,
"kl": 0.011941909790039062,
"learning_rate": 1.997560152767532e-05,
"loss": -0.011,
"reward": 7.487109661102295,
"reward_std": 0.8209907524287701,
"rewards/mrr_reward": 0.7063492089509964,
"rewards/rank_analyze_format_reward": 0.7513267993927002,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 610.453125,
"epoch": 2.336,
"grad_norm": 0.037363357841968536,
"kl": 0.013795852661132812,
"learning_rate": 1.997542575545453e-05,
"loss": 0.0103,
"reward": 7.0443562269210815,
"reward_std": 1.2127674743533134,
"rewards/mrr_reward": 0.5873635932803154,
"rewards/rank_analyze_format_reward": 0.7670525759458542,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 590.578125,
"epoch": 2.344,
"grad_norm": 0.03627763316035271,
"kl": 0.01107025146484375,
"learning_rate": 1.9975249353131304e-05,
"loss": -0.0153,
"reward": 7.811681151390076,
"reward_std": 1.2126767039299011,
"rewards/mrr_reward": 0.8069444298744202,
"rewards/rank_analyze_format_reward": 0.6737470030784607,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 590.75,
"epoch": 2.352,
"grad_norm": 0.03678389638662338,
"kl": 0.009225845336914062,
"learning_rate": 1.9975072320716785e-05,
"loss": -0.0396,
"reward": 6.60707688331604,
"reward_std": 1.2315413057804108,
"rewards/mrr_reward": 0.5236669182777405,
"rewards/rank_analyze_format_reward": 0.584445059299469,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 576.390625,
"epoch": 2.36,
"grad_norm": 0.03965243697166443,
"kl": 0.013790130615234375,
"learning_rate": 1.997489465822216e-05,
"loss": -0.0106,
"reward": 7.775085091590881,
"reward_std": 1.3139366656541824,
"rewards/mrr_reward": 0.8050967454910278,
"rewards/rank_analyze_format_reward": 0.6439119428396225,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9944556355476379,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9944556355476379,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 606.703125,
"epoch": 2.368,
"grad_norm": 0.039732079952955246,
"kl": 0.011026382446289062,
"learning_rate": 1.9974716365658646e-05,
"loss": -0.0467,
"reward": 7.427183151245117,
"reward_std": 1.2437842339277267,
"rewards/mrr_reward": 0.7122395783662796,
"rewards/rank_analyze_format_reward": 0.7224476039409637,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 605.25,
"epoch": 2.376,
"grad_norm": 0.03692779690027237,
"kl": 0.010181427001953125,
"learning_rate": 1.9974537443037504e-05,
"loss": -0.0119,
"reward": 7.6130610704422,
"reward_std": 1.0890810042619705,
"rewards/mrr_reward": 0.7197916656732559,
"rewards/rank_analyze_format_reward": 0.8107158541679382,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977221935987473,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9977221935987473,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 611.5625,
"epoch": 2.384,
"grad_norm": 0.039538830518722534,
"kl": 0.013807296752929688,
"learning_rate": 1.9974357890370038e-05,
"loss": -0.008,
"reward": 6.635961890220642,
"reward_std": 0.7657184079289436,
"rewards/mrr_reward": 0.48133058845996857,
"rewards/rank_analyze_format_reward": 0.7749776542186737,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 606.328125,
"epoch": 2.392,
"grad_norm": 0.03752804920077324,
"kl": 0.013395309448242188,
"learning_rate": 1.9974177707667594e-05,
"loss": 0.0098,
"reward": 7.015731453895569,
"reward_std": 1.1001620888710022,
"rewards/mrr_reward": 0.6202257052063942,
"rewards/rank_analyze_format_reward": 0.6712347567081451,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9962500035762787,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9806250035762787,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 593.0,
"epoch": 2.4,
"grad_norm": 0.039174940437078476,
"kl": 0.011379241943359375,
"learning_rate": 1.9973996894941545e-05,
"loss": -0.0011,
"reward": 7.0397127866744995,
"reward_std": 1.0055639445781708,
"rewards/mrr_reward": 0.5911644473671913,
"rewards/rank_analyze_format_reward": 0.7411527559161186,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 590.3125,
"epoch": 2.408,
"grad_norm": 0.0378861129283905,
"kl": 0.011119842529296875,
"learning_rate": 1.9973815452203314e-05,
"loss": 0.0056,
"reward": 7.447056770324707,
"reward_std": 1.2125954329967499,
"rewards/mrr_reward": 0.7122395783662796,
"rewards/rank_analyze_format_reward": 0.669488713145256,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9994612038135529,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9994612038135529,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 565.15625,
"epoch": 2.416,
"grad_norm": 0.03746671974658966,
"kl": 0.011993408203125,
"learning_rate": 1.997363337946437e-05,
"loss": -0.0198,
"reward": 6.575040936470032,
"reward_std": 0.9133451133966446,
"rewards/mrr_reward": 0.5259300693869591,
"rewards/rank_analyze_format_reward": 0.5709301829338074,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 641.546875,
"epoch": 2.424,
"grad_norm": 0.03554888442158699,
"kl": 0.010702133178710938,
"learning_rate": 1.9973450676736205e-05,
"loss": -0.0074,
"reward": 7.236762523651123,
"reward_std": 0.604234242811799,
"rewards/mrr_reward": 0.6168154701590538,
"rewards/rank_analyze_format_reward": 0.8082548528909683,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 614.53125,
"epoch": 2.432,
"grad_norm": 0.03649809956550598,
"kl": 0.011503219604492188,
"learning_rate": 1.997326734403036e-05,
"loss": -0.0239,
"reward": 6.725122928619385,
"reward_std": 1.2124179899692535,
"rewards/mrr_reward": 0.5331907123327255,
"rewards/rank_analyze_format_reward": 0.7333841472864151,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 595.609375,
"epoch": 2.44,
"grad_norm": 0.03357018902897835,
"kl": 0.011198043823242188,
"learning_rate": 1.997308338135842e-05,
"loss": -0.0394,
"reward": 7.099708437919617,
"reward_std": 1.0707662254571915,
"rewards/mrr_reward": 0.617491327226162,
"rewards/rank_analyze_format_reward": 0.6802160441875458,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 639.46875,
"epoch": 2.448,
"grad_norm": 0.04073842614889145,
"kl": 0.01114654541015625,
"learning_rate": 1.9972898788732e-05,
"loss": -0.0205,
"reward": 6.205634713172913,
"reward_std": 1.0768165290355682,
"rewards/mrr_reward": 0.40212054550647736,
"rewards/rank_analyze_format_reward": 0.6713712811470032,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 592.875,
"epoch": 2.456,
"grad_norm": 0.038296766579151154,
"kl": 0.0117034912109375,
"learning_rate": 1.9972713566162763e-05,
"loss": -0.0115,
"reward": 6.65511429309845,
"reward_std": 0.8909335732460022,
"rewards/mrr_reward": 0.5184585936367512,
"rewards/rank_analyze_format_reward": 0.6747215688228607,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 571.96875,
"epoch": 2.464,
"grad_norm": 0.03361259400844574,
"kl": 0.010142326354980469,
"learning_rate": 1.997252771366241e-05,
"loss": -0.0059,
"reward": 7.825888633728027,
"reward_std": 0.7059714342467487,
"rewards/mrr_reward": 0.8350446447730064,
"rewards/rank_analyze_format_reward": 0.4857100807130337,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 606.9375,
"epoch": 2.472,
"grad_norm": 0.03526763245463371,
"kl": 0.0130767822265625,
"learning_rate": 1.9972341231242675e-05,
"loss": -0.0398,
"reward": 6.988335967063904,
"reward_std": 0.7815524078905582,
"rewards/mrr_reward": 0.5860863253474236,
"rewards/rank_analyze_format_reward": 0.722115769982338,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 599.140625,
"epoch": 2.48,
"grad_norm": 0.03733767569065094,
"kl": 0.012132644653320312,
"learning_rate": 1.9972154118915344e-05,
"loss": -0.0251,
"reward": 7.347846150398254,
"reward_std": 1.1197139769792557,
"rewards/mrr_reward": 0.6794270873069763,
"rewards/rank_analyze_format_reward": 0.7024035751819611,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 592.796875,
"epoch": 2.488,
"grad_norm": 0.037743665277957916,
"kl": 0.010995864868164062,
"learning_rate": 1.997196637669223e-05,
"loss": -0.0057,
"reward": 7.16385281085968,
"reward_std": 0.9465463161468506,
"rewards/mrr_reward": 0.614341527223587,
"rewards/rank_analyze_format_reward": 0.7628190815448761,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 628.25,
"epoch": 2.496,
"grad_norm": 0.03614401817321777,
"kl": 0.010850906372070312,
"learning_rate": 1.99717780045852e-05,
"loss": -0.0312,
"reward": 7.732061147689819,
"reward_std": 0.6288701333105564,
"rewards/mrr_reward": 0.7590463757514954,
"rewards/rank_analyze_format_reward": 0.7517846375703812,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 605.609375,
"epoch": 2.504,
"grad_norm": 0.035900671035051346,
"kl": 0.010019302368164062,
"learning_rate": 1.997158900260614e-05,
"loss": 0.001,
"reward": 7.1635472774505615,
"reward_std": 1.0679296404123306,
"rewards/mrr_reward": 0.6484374925494194,
"rewards/rank_analyze_format_reward": 0.6771043539047241,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 596.1875,
"epoch": 2.512,
"grad_norm": 0.041989874094724655,
"kl": 0.015058517456054688,
"learning_rate": 1.9971399370767e-05,
"loss": -0.0166,
"reward": 6.863955616950989,
"reward_std": 0.7592495381832123,
"rewards/mrr_reward": 0.565854400396347,
"rewards/rank_analyze_format_reward": 0.6161628141999245,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 599.0,
"epoch": 2.52,
"grad_norm": 0.041266754269599915,
"kl": 0.013032913208007812,
"learning_rate": 1.9971209109079752e-05,
"loss": -0.0229,
"reward": 7.460736155509949,
"reward_std": 1.0799484848976135,
"rewards/mrr_reward": 0.7114583253860474,
"rewards/rank_analyze_format_reward": 0.6832623034715652,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 646.28125,
"epoch": 2.528,
"grad_norm": 0.03333236649632454,
"kl": 0.009759902954101562,
"learning_rate": 1.9971018217556416e-05,
"loss": -0.0106,
"reward": 6.682798147201538,
"reward_std": 0.5989858657121658,
"rewards/mrr_reward": 0.4994109719991684,
"rewards/rank_analyze_format_reward": 0.7437479048967361,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 631.4375,
"epoch": 2.536,
"grad_norm": 0.0350511260330677,
"kl": 0.010562896728515625,
"learning_rate": 1.997082669620905e-05,
"loss": -0.0302,
"reward": 6.6315062046051025,
"reward_std": 1.0686845779418945,
"rewards/mrr_reward": 0.4913690462708473,
"rewards/rank_analyze_format_reward": 0.7528755962848663,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975927770137787,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9975927770137787,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 583.140625,
"epoch": 2.544,
"grad_norm": 0.03914601355791092,
"kl": 0.013078689575195312,
"learning_rate": 1.997063454504975e-05,
"loss": -0.0055,
"reward": 6.575037002563477,
"reward_std": 1.3988100588321686,
"rewards/mrr_reward": 0.514732152223587,
"rewards/rank_analyze_format_reward": 0.6743116676807404,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 618.71875,
"epoch": 2.552,
"grad_norm": 0.0380953773856163,
"kl": 0.013456344604492188,
"learning_rate": 1.9970441764090654e-05,
"loss": -0.0518,
"reward": 7.295857548713684,
"reward_std": 1.004029467701912,
"rewards/mrr_reward": 0.6721354275941849,
"rewards/rank_analyze_format_reward": 0.7370236366987228,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9878805130720139,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9878805130720139,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 613.4375,
"epoch": 2.56,
"grad_norm": 0.040197595953941345,
"kl": 0.013912200927734375,
"learning_rate": 1.9970248353343943e-05,
"loss": -0.0075,
"reward": 6.5366517305374146,
"reward_std": 1.0288221687078476,
"rewards/mrr_reward": 0.4604600891470909,
"rewards/rank_analyze_format_reward": 0.7944208830595016,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 649.296875,
"epoch": 2.568,
"grad_norm": 0.038607921451330185,
"kl": 0.013824462890625,
"learning_rate": 1.997005431282183e-05,
"loss": 0.0172,
"reward": 7.0922359228134155,
"reward_std": 1.0854482501745224,
"rewards/mrr_reward": 0.603298619389534,
"rewards/rank_analyze_format_reward": 0.7901396751403809,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 641.296875,
"epoch": 2.576,
"grad_norm": 0.03311832994222641,
"kl": 0.011646270751953125,
"learning_rate": 1.996985964253657e-05,
"loss": -0.0369,
"reward": 6.7459012269973755,
"reward_std": 0.9181017801165581,
"rewards/mrr_reward": 0.5021019279956818,
"rewards/rank_analyze_format_reward": 0.7544548064470291,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 627.34375,
"epoch": 2.584,
"grad_norm": 0.037428632378578186,
"kl": 0.012990951538085938,
"learning_rate": 1.996966434250046e-05,
"loss": -0.0228,
"reward": 7.209717512130737,
"reward_std": 1.1640962213277817,
"rewards/mrr_reward": 0.6627604216337204,
"rewards/rank_analyze_format_reward": 0.7012539207935333,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 1.0,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 586.28125,
"epoch": 2.592,
"grad_norm": 0.035629965364933014,
"kl": 0.011270523071289062,
"learning_rate": 1.996946841272584e-05,
"loss": -0.0126,
"reward": 6.940586090087891,
"reward_std": 1.4230458736419678,
"rewards/mrr_reward": 0.5958519503474236,
"rewards/rank_analyze_format_reward": 0.6704594492912292,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 625.984375,
"epoch": 2.6,
"grad_norm": 0.032520923763513565,
"kl": 0.011152267456054688,
"learning_rate": 1.9969271853225083e-05,
"loss": -0.0061,
"reward": 7.102632761001587,
"reward_std": 0.8966164737939835,
"rewards/mrr_reward": 0.6060329973697662,
"rewards/rank_analyze_format_reward": 0.717139944434166,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 620.765625,
"epoch": 2.608,
"grad_norm": 0.03493724763393402,
"kl": 0.011966705322265625,
"learning_rate": 1.9969074664010605e-05,
"loss": -0.0149,
"reward": 6.612971305847168,
"reward_std": 0.9198006242513657,
"rewards/mrr_reward": 0.479879729449749,
"rewards/rank_analyze_format_reward": 0.7714625149965286,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 610.78125,
"epoch": 2.616,
"grad_norm": 0.03426508232951164,
"kl": 0.010488510131835938,
"learning_rate": 1.9968876845094864e-05,
"loss": -0.0116,
"reward": 7.175417423248291,
"reward_std": 0.7358394265174866,
"rewards/mrr_reward": 0.6250000074505806,
"rewards/rank_analyze_format_reward": 0.764837920665741,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 612.21875,
"epoch": 2.624,
"grad_norm": 0.03677660971879959,
"kl": 0.01381683349609375,
"learning_rate": 1.996867839649035e-05,
"loss": -0.0066,
"reward": 7.328829765319824,
"reward_std": 0.97315713763237,
"rewards/mrr_reward": 0.6791852787137032,
"rewards/rank_analyze_format_reward": 0.7214639633893967,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 587.625,
"epoch": 2.632,
"grad_norm": 0.03748798742890358,
"kl": 0.01129150390625,
"learning_rate": 1.9968479318209603e-05,
"loss": 0.0107,
"reward": 7.366376042366028,
"reward_std": 0.7245956286787987,
"rewards/mrr_reward": 0.6915550529956818,
"rewards/rank_analyze_format_reward": 0.6485605537891388,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 603.265625,
"epoch": 2.64,
"grad_norm": 0.038382068276405334,
"kl": 0.014995574951171875,
"learning_rate": 1.9968279610265194e-05,
"loss": -0.0244,
"reward": 7.351204872131348,
"reward_std": 1.0787476003170013,
"rewards/mrr_reward": 0.6888020783662796,
"rewards/rank_analyze_format_reward": 0.7559229284524918,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 617.171875,
"epoch": 2.648,
"grad_norm": 0.036839522421360016,
"kl": 0.011720657348632812,
"learning_rate": 1.9968079272669744e-05,
"loss": 0.0057,
"reward": 6.830013751983643,
"reward_std": 1.1275426745414734,
"rewards/mrr_reward": 0.5580295100808144,
"rewards/rank_analyze_format_reward": 0.7516124844551086,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 620.265625,
"epoch": 2.656,
"grad_norm": 0.03564363345503807,
"kl": 0.012371063232421875,
"learning_rate": 1.9967878305435902e-05,
"loss": -0.0231,
"reward": 7.337198257446289,
"reward_std": 0.7928859405219555,
"rewards/mrr_reward": 0.6541852578520775,
"rewards/rank_analyze_format_reward": 0.8176662474870682,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9972937107086182,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9972937107086182,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 647.890625,
"epoch": 2.664,
"grad_norm": 0.038595810532569885,
"kl": 0.010858535766601562,
"learning_rate": 1.9967676708576362e-05,
"loss": -0.0045,
"reward": 6.599027991294861,
"reward_std": 0.9832871407270432,
"rewards/mrr_reward": 0.4508804567158222,
"rewards/rank_analyze_format_reward": 0.8443343043327332,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 644.796875,
"epoch": 2.672,
"grad_norm": 0.03759092092514038,
"kl": 0.0118408203125,
"learning_rate": 1.9967474482103863e-05,
"loss": -0.0121,
"reward": 6.94339394569397,
"reward_std": 0.9748950749635696,
"rewards/mrr_reward": 0.5725632309913635,
"rewards/rank_analyze_format_reward": 0.733219176530838,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 660.65625,
"epoch": 2.68,
"grad_norm": 0.03415609896183014,
"kl": 0.011899948120117188,
"learning_rate": 1.996727162603117e-05,
"loss": -0.0132,
"reward": 6.538380742073059,
"reward_std": 0.7016656026244164,
"rewards/mrr_reward": 0.44487228989601135,
"rewards/rank_analyze_format_reward": 0.8194384127855301,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 634.390625,
"epoch": 2.6879999999999997,
"grad_norm": 0.03727827966213226,
"kl": 0.011407852172851562,
"learning_rate": 1.9967068140371103e-05,
"loss": 0.0018,
"reward": 7.043541312217712,
"reward_std": 0.7633183086290956,
"rewards/mrr_reward": 0.5886718779802322,
"rewards/rank_analyze_format_reward": 0.7822953313589096,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 600.71875,
"epoch": 2.6959999999999997,
"grad_norm": 0.03895430639386177,
"kl": 0.013332366943359375,
"learning_rate": 1.9966864025136518e-05,
"loss": -0.0042,
"reward": 6.765047073364258,
"reward_std": 0.8223965764045715,
"rewards/mrr_reward": 0.5205729305744171,
"rewards/rank_analyze_format_reward": 0.7120521813631058,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 660.328125,
"epoch": 2.7039999999999997,
"grad_norm": 0.038571566343307495,
"kl": 0.012439727783203125,
"learning_rate": 1.99666592803403e-05,
"loss": -0.0154,
"reward": 6.975342035293579,
"reward_std": 0.840043693780899,
"rewards/mrr_reward": 0.5555741637945175,
"rewards/rank_analyze_format_reward": 0.811639130115509,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 576.015625,
"epoch": 2.7119999999999997,
"grad_norm": 0.03610292449593544,
"kl": 0.012050628662109375,
"learning_rate": 1.9966453905995386e-05,
"loss": -0.0219,
"reward": 6.419227600097656,
"reward_std": 1.1811564713716507,
"rewards/mrr_reward": 0.46623264998197556,
"rewards/rank_analyze_format_reward": 0.7026196420192719,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 611.953125,
"epoch": 2.7199999999999998,
"grad_norm": 0.040995605289936066,
"kl": 0.010702133178710938,
"learning_rate": 1.996624790211475e-05,
"loss": 0.0069,
"reward": 7.764137506484985,
"reward_std": 0.872068215161562,
"rewards/mrr_reward": 0.7421006858348846,
"rewards/rank_analyze_format_reward": 0.8094066381454468,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 601.390625,
"epoch": 2.7279999999999998,
"grad_norm": 0.036672018468379974,
"kl": 0.010931015014648438,
"learning_rate": 1.9966041268711404e-05,
"loss": -0.0282,
"reward": 7.355572700500488,
"reward_std": 0.8193893283605576,
"rewards/mrr_reward": 0.6698970645666122,
"rewards/rank_analyze_format_reward": 0.7189528197050095,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 595.25,
"epoch": 2.7359999999999998,
"grad_norm": 0.036827512085437775,
"kl": 0.011262893676757812,
"learning_rate": 1.9965834005798395e-05,
"loss": 0.0009,
"reward": 7.232412695884705,
"reward_std": 0.9624816030263901,
"rewards/mrr_reward": 0.6321304589509964,
"rewards/rank_analyze_format_reward": 0.776156485080719,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 583.015625,
"epoch": 2.7439999999999998,
"grad_norm": 0.041386183351278305,
"kl": 0.01598358154296875,
"learning_rate": 1.9965626113388823e-05,
"loss": -0.0151,
"reward": 7.414017677307129,
"reward_std": 1.145112544298172,
"rewards/mrr_reward": 0.7001488208770752,
"rewards/rank_analyze_format_reward": 0.7220657765865326,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 610.671875,
"epoch": 2.752,
"grad_norm": 0.03522395342588425,
"kl": 0.011472702026367188,
"learning_rate": 1.9965417591495813e-05,
"loss": -0.0021,
"reward": 6.261266589164734,
"reward_std": 0.648932583630085,
"rewards/mrr_reward": 0.4110739082098007,
"rewards/rank_analyze_format_reward": 0.6695905476808548,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 595.421875,
"epoch": 2.76,
"grad_norm": 0.036285560578107834,
"kl": 0.011045455932617188,
"learning_rate": 1.9965208440132538e-05,
"loss": -0.0084,
"reward": 7.684949636459351,
"reward_std": 0.6939431764185429,
"rewards/mrr_reward": 0.7300347238779068,
"rewards/rank_analyze_format_reward": 0.782451868057251,
"rewards/rank_answer_foramt_reward": 0.984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9989919364452362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9989919364452362,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 602.15625,
"epoch": 2.768,
"grad_norm": 0.03910991922020912,
"kl": 0.011774063110351562,
"learning_rate": 1.9964998659312212e-05,
"loss": -0.0189,
"reward": 6.8010218143463135,
"reward_std": 0.8553978726267815,
"rewards/mrr_reward": 0.5462859645485878,
"rewards/rank_analyze_format_reward": 0.7228547036647797,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9943632036447525,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9943632036447525,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 611.359375,
"epoch": 2.776,
"grad_norm": 0.038007643073797226,
"kl": 0.010667800903320312,
"learning_rate": 1.996478824904808e-05,
"loss": 0.003,
"reward": 7.355239748954773,
"reward_std": 0.9060100615024567,
"rewards/mrr_reward": 0.6795572899281979,
"rewards/rank_analyze_format_reward": 0.7034169733524323,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 597.8125,
"epoch": 2.784,
"grad_norm": 0.03934268653392792,
"kl": 0.01244354248046875,
"learning_rate": 1.9964577209353438e-05,
"loss": -0.0656,
"reward": 7.2533485889434814,
"reward_std": 1.1916275918483734,
"rewards/mrr_reward": 0.6880208402872086,
"rewards/rank_analyze_format_reward": 0.6423462107777596,
"rewards/rank_answer_foramt_reward": 0.92578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9821939468383789,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9821939468383789,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 616.140625,
"epoch": 2.792,
"grad_norm": 0.036522042006254196,
"kl": 0.013441085815429688,
"learning_rate": 1.9964365540241614e-05,
"loss": 0.0013,
"reward": 7.095219135284424,
"reward_std": 1.0741036236286163,
"rewards/mrr_reward": 0.6266059279441833,
"rewards/rank_analyze_format_reward": 0.6649671494960785,
"rewards/rank_answer_foramt_reward": 0.939453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 598.15625,
"epoch": 2.8,
"grad_norm": 0.03805486485362053,
"kl": 0.010654449462890625,
"learning_rate": 1.9964153241725984e-05,
"loss": -0.0168,
"reward": 7.228509426116943,
"reward_std": 0.9055161625146866,
"rewards/mrr_reward": 0.6221354231238365,
"rewards/rank_analyze_format_reward": 0.8102801889181137,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 614.125,
"epoch": 2.808,
"grad_norm": 0.036437440663576126,
"kl": 0.009984970092773438,
"learning_rate": 1.996394031381995e-05,
"loss": -0.0147,
"reward": 6.869751572608948,
"reward_std": 0.8186332434415817,
"rewards/mrr_reward": 0.5391058996319771,
"rewards/rank_analyze_format_reward": 0.77509605884552,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 588.75,
"epoch": 2.816,
"grad_norm": 0.04047045111656189,
"kl": 0.013484954833984375,
"learning_rate": 1.996372675653696e-05,
"loss": 0.0169,
"reward": 7.264615893363953,
"reward_std": 1.1256726384162903,
"rewards/mrr_reward": 0.6621279790997505,
"rewards/rank_analyze_format_reward": 0.7302107512950897,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9961873590946198,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 614.28125,
"epoch": 2.824,
"grad_norm": 0.03461950272321701,
"kl": 0.011775970458984375,
"learning_rate": 1.9963512569890512e-05,
"loss": -0.0006,
"reward": 6.854212045669556,
"reward_std": 0.9395613223314285,
"rewards/mrr_reward": 0.5478236600756645,
"rewards/rank_analyze_format_reward": 0.7212026119232178,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 601.59375,
"epoch": 2.832,
"grad_norm": 0.03836781159043312,
"kl": 0.01210784912109375,
"learning_rate": 1.9963297753894134e-05,
"loss": -0.0137,
"reward": 6.814990997314453,
"reward_std": 1.3405095338821411,
"rewards/mrr_reward": 0.5263392850756645,
"rewards/rank_analyze_format_reward": 0.7955712080001831,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 613.484375,
"epoch": 2.84,
"grad_norm": 0.03449360653758049,
"kl": 0.013011932373046875,
"learning_rate": 1.9963082308561386e-05,
"loss": -0.021,
"reward": 7.53871476650238,
"reward_std": 0.9666296392679214,
"rewards/mrr_reward": 0.7184895724058151,
"rewards/rank_analyze_format_reward": 0.7090613692998886,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 605.71875,
"epoch": 2.848,
"grad_norm": 0.04157762601971626,
"kl": 0.012449264526367188,
"learning_rate": 1.9962866233905887e-05,
"loss": -0.0148,
"reward": 7.414668679237366,
"reward_std": 0.9551695212721825,
"rewards/mrr_reward": 0.693489596247673,
"rewards/rank_analyze_format_reward": 0.6817260161042213,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 614.6875,
"epoch": 2.856,
"grad_norm": 0.034568723291158676,
"kl": 0.011356353759765625,
"learning_rate": 1.9962649529941283e-05,
"loss": -0.0159,
"reward": 7.724859952926636,
"reward_std": 0.819370448589325,
"rewards/mrr_reward": 0.7456287145614624,
"rewards/rank_analyze_format_reward": 0.7604811042547226,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977678656578064,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9977678656578064,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 618.8125,
"epoch": 2.864,
"grad_norm": 0.039391741156578064,
"kl": 0.012399673461914062,
"learning_rate": 1.996243219668126e-05,
"loss": -0.0153,
"reward": 5.852332949638367,
"reward_std": 1.0752842128276825,
"rewards/mrr_reward": 0.323691725730896,
"rewards/rank_analyze_format_reward": 0.7135076522827148,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 620.046875,
"epoch": 2.872,
"grad_norm": 0.040229834616184235,
"kl": 0.011371612548828125,
"learning_rate": 1.996221423413954e-05,
"loss": 0.0015,
"reward": 6.387848496437073,
"reward_std": 1.1234539598226547,
"rewards/mrr_reward": 0.4234747067093849,
"rewards/rank_analyze_format_reward": 0.7486371248960495,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 628.109375,
"epoch": 2.88,
"grad_norm": 0.0374617837369442,
"kl": 0.011615753173828125,
"learning_rate": 1.9961995642329905e-05,
"loss": 0.0084,
"reward": 7.307153582572937,
"reward_std": 1.3044872879981995,
"rewards/mrr_reward": 0.6740141361951828,
"rewards/rank_analyze_format_reward": 0.7395800352096558,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 591.71875,
"epoch": 2.888,
"grad_norm": 0.04277306795120239,
"kl": 0.014926910400390625,
"learning_rate": 1.996177642126615e-05,
"loss": -0.0085,
"reward": 7.5333287715911865,
"reward_std": 0.9014619141817093,
"rewards/mrr_reward": 0.6997581869363785,
"rewards/rank_analyze_format_reward": 0.7635927647352219,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 603.28125,
"epoch": 2.896,
"grad_norm": 0.041481465101242065,
"kl": 0.014026641845703125,
"learning_rate": 1.996155657096213e-05,
"loss": -0.0272,
"reward": 6.84517502784729,
"reward_std": 1.0781239420175552,
"rewards/mrr_reward": 0.5556175634264946,
"rewards/rank_analyze_format_reward": 0.7236873209476471,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 580.53125,
"epoch": 2.904,
"grad_norm": 0.04171738028526306,
"kl": 0.012874603271484375,
"learning_rate": 1.9961336091431728e-05,
"loss": -0.0004,
"reward": 7.211669564247131,
"reward_std": 0.8956931233406067,
"rewards/mrr_reward": 0.6445312649011612,
"rewards/rank_analyze_format_reward": 0.6827789545059204,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968671649694443,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9968671649694443,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 636.328125,
"epoch": 2.912,
"grad_norm": 0.03671007230877876,
"kl": 0.011234283447265625,
"learning_rate": 1.9961114982688868e-05,
"loss": -0.0257,
"reward": 7.139348030090332,
"reward_std": 0.8967479169368744,
"rewards/mrr_reward": 0.6116319298744202,
"rewards/rank_analyze_format_reward": 0.7958708107471466,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 648.640625,
"epoch": 2.92,
"grad_norm": 0.033731456845998764,
"kl": 0.010288238525390625,
"learning_rate": 1.9960893244747525e-05,
"loss": -0.0108,
"reward": 7.166544318199158,
"reward_std": 0.6106544919312,
"rewards/mrr_reward": 0.6010168790817261,
"rewards/rank_analyze_format_reward": 0.7859143763780594,
"rewards/rank_answer_foramt_reward": 0.984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 627.625,
"epoch": 2.928,
"grad_norm": 0.036414846777915955,
"kl": 0.012035369873046875,
"learning_rate": 1.9960670877621697e-05,
"loss": -0.0184,
"reward": 6.740770578384399,
"reward_std": 0.8052867725491524,
"rewards/mrr_reward": 0.5220424234867096,
"rewards/rank_analyze_format_reward": 0.7072883993387222,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 619.90625,
"epoch": 2.936,
"grad_norm": 0.03801536187529564,
"kl": 0.013200759887695312,
"learning_rate": 1.9960447881325433e-05,
"loss": -0.0308,
"reward": 6.5149757862091064,
"reward_std": 0.7093790546059608,
"rewards/mrr_reward": 0.44720981270074844,
"rewards/rank_analyze_format_reward": 0.7612926959991455,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 591.1875,
"epoch": 2.944,
"grad_norm": 0.040969040244817734,
"kl": 0.01461029052734375,
"learning_rate": 1.996022425587282e-05,
"loss": -0.0185,
"reward": 7.41820216178894,
"reward_std": 0.9369710832834244,
"rewards/mrr_reward": 0.6916666775941849,
"rewards/rank_analyze_format_reward": 0.7156801223754883,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 636.859375,
"epoch": 2.952,
"grad_norm": 0.03583036735653877,
"kl": 0.011312484741210938,
"learning_rate": 1.9960000001277985e-05,
"loss": -0.0276,
"reward": 7.153052568435669,
"reward_std": 0.6550789251923561,
"rewards/mrr_reward": 0.6043154746294022,
"rewards/rank_analyze_format_reward": 0.7746230661869049,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 607.65625,
"epoch": 2.96,
"grad_norm": 0.04044310748577118,
"kl": 0.012559890747070312,
"learning_rate": 1.9959775117555085e-05,
"loss": 0.0112,
"reward": 7.005048513412476,
"reward_std": 1.1625263132154942,
"rewards/mrr_reward": 0.5972346290946007,
"rewards/rank_analyze_format_reward": 0.7758757621049881,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992187470197678,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9679687470197678,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 597.0,
"epoch": 2.968,
"grad_norm": 0.03745017945766449,
"kl": 0.013088226318359375,
"learning_rate": 1.995954960471833e-05,
"loss": 0.0034,
"reward": 7.509567379951477,
"reward_std": 0.961163155734539,
"rewards/mrr_reward": 0.7000806033611298,
"rewards/rank_analyze_format_reward": 0.7853019386529922,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 592.859375,
"epoch": 2.976,
"grad_norm": 0.038514841347932816,
"kl": 0.012132644653320312,
"learning_rate": 1.995932346278197e-05,
"loss": -0.0071,
"reward": 7.772576689720154,
"reward_std": 0.6515968926250935,
"rewards/mrr_reward": 0.7575520798563957,
"rewards/rank_analyze_format_reward": 0.7793630510568619,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 586.09375,
"epoch": 2.984,
"grad_norm": 0.03822262957692146,
"kl": 0.013006210327148438,
"learning_rate": 1.9959096691760284e-05,
"loss": -0.0155,
"reward": 7.534856200218201,
"reward_std": 0.7668619826436043,
"rewards/mrr_reward": 0.7446614354848862,
"rewards/rank_analyze_format_reward": 0.685593493282795,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9773005694150925,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9773005694150925,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 621.109375,
"epoch": 2.992,
"grad_norm": 0.03631046786904335,
"kl": 0.011692047119140625,
"learning_rate": 1.995886929166759e-05,
"loss": 0.0136,
"reward": 7.2465866804122925,
"reward_std": 0.8428932726383209,
"rewards/mrr_reward": 0.6395833343267441,
"rewards/rank_analyze_format_reward": 0.7468471378087997,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 668.40625,
"epoch": 3.0,
"grad_norm": 0.03699138015508652,
"kl": 0.011791229248046875,
"learning_rate": 1.9958641262518263e-05,
"loss": 0.0192,
"reward": 7.813745975494385,
"reward_std": 0.7177924737334251,
"rewards/mrr_reward": 0.7415550798177719,
"rewards/rank_analyze_format_reward": 0.8690101951360703,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 615.921875,
"epoch": 3.008,
"grad_norm": 0.035117242485284805,
"kl": 0.013750076293945312,
"learning_rate": 3.4816627469912147e-06,
"loss": 0.0291,
"reward": 7.042345643043518,
"reward_std": 0.7293612845242023,
"rewards/mrr_reward": 0.5897755473852158,
"rewards/rank_analyze_format_reward": 0.7183997631072998,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 637.125,
"epoch": 3.016,
"grad_norm": 0.03519332408905029,
"kl": 0.01271820068359375,
"learning_rate": 3.4341424424704373e-06,
"loss": -0.0114,
"reward": 6.630066633224487,
"reward_std": 0.9696584269404411,
"rewards/mrr_reward": 0.4780319929122925,
"rewards/rank_analyze_format_reward": 0.7667666971683502,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 621.59375,
"epoch": 3.024,
"grad_norm": 0.036034248769283295,
"kl": 0.014505386352539062,
"learning_rate": 3.3868813467634833e-06,
"loss": -0.0026,
"reward": 7.198747515678406,
"reward_std": 1.1167692840099335,
"rewards/mrr_reward": 0.6099516302347183,
"rewards/rank_analyze_format_reward": 0.8406638205051422,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 637.28125,
"epoch": 3.032,
"grad_norm": 0.03498866409063339,
"kl": 0.01226043701171875,
"learning_rate": 3.3398813256574847e-06,
"loss": -0.0099,
"reward": 7.360252737998962,
"reward_std": 0.8017124682664871,
"rewards/mrr_reward": 0.6533172130584717,
"rewards/rank_analyze_format_reward": 0.8052692711353302,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 606.265625,
"epoch": 3.04,
"grad_norm": 0.0373137928545475,
"kl": 0.013418197631835938,
"learning_rate": 3.2931442346328e-06,
"loss": 0.0002,
"reward": 7.177944183349609,
"reward_std": 1.186311975121498,
"rewards/mrr_reward": 0.6419270783662796,
"rewards/rank_analyze_format_reward": 0.6922670155763626,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 655.390625,
"epoch": 3.048,
"grad_norm": 0.03675440698862076,
"kl": 0.0125885009765625,
"learning_rate": 3.2466719187897555e-06,
"loss": 0.0072,
"reward": 6.830274343490601,
"reward_std": 0.661302238702774,
"rewards/mrr_reward": 0.4951822906732559,
"rewards/rank_analyze_format_reward": 0.8651701956987381,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 610.34375,
"epoch": 3.056,
"grad_norm": 0.03555948659777641,
"kl": 0.01416015625,
"learning_rate": 3.200466212775808e-06,
"loss": -0.0196,
"reward": 7.550482988357544,
"reward_std": 1.0687852203845978,
"rewards/mrr_reward": 0.71484375,
"rewards/rank_analyze_format_reward": 0.7417741417884827,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 619.625,
"epoch": 3.064,
"grad_norm": 0.03907148540019989,
"kl": 0.013399124145507812,
"learning_rate": 3.1545289407131128e-06,
"loss": -0.0043,
"reward": 7.558589220046997,
"reward_std": 1.2266802489757538,
"rewards/mrr_reward": 0.7244791686534882,
"rewards/rank_analyze_format_reward": 0.7446569502353668,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 598.09375,
"epoch": 3.072,
"grad_norm": 0.038272228091955185,
"kl": 0.011919021606445312,
"learning_rate": 3.108861916126518e-06,
"loss": 0.002,
"reward": 8.19713008403778,
"reward_std": 0.7785622999072075,
"rewards/mrr_reward": 0.85546875,
"rewards/rank_analyze_format_reward": 0.8240830302238464,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 627.640625,
"epoch": 3.08,
"grad_norm": 0.041524823755025864,
"kl": 0.01570892333984375,
"learning_rate": 3.063466941871952e-06,
"loss": 0.0153,
"reward": 7.146573901176453,
"reward_std": 1.074029102921486,
"rewards/mrr_reward": 0.6067274287343025,
"rewards/rank_analyze_format_reward": 0.834898442029953,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 630.671875,
"epoch": 3.088,
"grad_norm": 0.03623941168189049,
"kl": 0.013017654418945312,
"learning_rate": 3.0183458100652752e-06,
"loss": -0.0022,
"reward": 7.186826229095459,
"reward_std": 0.6731258956715465,
"rewards/mrr_reward": 0.5905319899320602,
"rewards/rank_analyze_format_reward": 0.863645926117897,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 624.265625,
"epoch": 3.096,
"grad_norm": 0.03690262883901596,
"kl": 0.01531219482421875,
"learning_rate": 2.9735003020115095e-06,
"loss": 0.0131,
"reward": 7.618446707725525,
"reward_std": 0.5352663211524487,
"rewards/mrr_reward": 0.7252604365348816,
"rewards/rank_analyze_format_reward": 0.7408426254987717,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 633.4375,
"epoch": 3.104,
"grad_norm": 0.03921409696340561,
"kl": 0.014377593994140625,
"learning_rate": 2.9289321881345257e-06,
"loss": -0.0006,
"reward": 7.131399869918823,
"reward_std": 1.3833198249340057,
"rewards/mrr_reward": 0.6131696403026581,
"rewards/rank_analyze_format_reward": 0.8094245195388794,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981250017881393,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825000017881393,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 660.5625,
"epoch": 3.112,
"grad_norm": 0.03858316317200661,
"kl": 0.01251220703125,
"learning_rate": 2.884643227907147e-06,
"loss": 0.0078,
"reward": 6.986513733863831,
"reward_std": 1.1185480952262878,
"rewards/mrr_reward": 0.5550533309578896,
"rewards/rank_analyze_format_reward": 0.8381428718566895,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 586.0625,
"epoch": 3.12,
"grad_norm": 0.03950336202979088,
"kl": 0.01474761962890625,
"learning_rate": 2.840635169781688e-06,
"loss": -0.0229,
"reward": 6.151705384254456,
"reward_std": 1.3553853258490562,
"rewards/mrr_reward": 0.416666679084301,
"rewards/rank_analyze_format_reward": 0.666062742471695,
"rewards/rank_answer_foramt_reward": 0.822265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 610.796875,
"epoch": 3.128,
"grad_norm": 0.03854774311184883,
"kl": 0.013458251953125,
"learning_rate": 2.796909751120931e-06,
"loss": -0.007,
"reward": 7.251393556594849,
"reward_std": 1.445090800523758,
"rewards/mrr_reward": 0.6562500074505806,
"rewards/rank_analyze_format_reward": 0.7045186460018158,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 633.71875,
"epoch": 3.136,
"grad_norm": 0.03738197311758995,
"kl": 0.013525009155273438,
"learning_rate": 2.7534686981295335e-06,
"loss": -0.0034,
"reward": 6.909914255142212,
"reward_std": 1.123517245054245,
"rewards/mrr_reward": 0.5471974164247513,
"rewards/rank_analyze_format_reward": 0.779603436589241,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 648.96875,
"epoch": 3.144,
"grad_norm": 0.0402173288166523,
"kl": 0.010915756225585938,
"learning_rate": 2.7103137257858867e-06,
"loss": 0.0094,
"reward": 6.921466946601868,
"reward_std": 0.774784117937088,
"rewards/mrr_reward": 0.5484312921762466,
"rewards/rank_analyze_format_reward": 0.8028685003519058,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975927919149399,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9975927919149399,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 619.296875,
"epoch": 3.152,
"grad_norm": 0.04159383475780487,
"kl": 0.014692306518554688,
"learning_rate": 2.667446537774402e-06,
"loss": -0.0153,
"reward": 6.572040319442749,
"reward_std": 1.726172387599945,
"rewards/mrr_reward": 0.4993923604488373,
"rewards/rank_analyze_format_reward": 0.7517964094877243,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.012423780746757984,
"rewards/rank_initial_format_reward": 0.994969055056572,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.994969055056572,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 642.765625,
"epoch": 3.16,
"grad_norm": 0.042603906244039536,
"kl": 0.010896682739257812,
"learning_rate": 2.624868826418262e-06,
"loss": 0.0296,
"reward": 7.20228123664856,
"reward_std": 0.9538848847150803,
"rewards/mrr_reward": 0.6085069477558136,
"rewards/rank_analyze_format_reward": 0.8265387862920761,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 678.578125,
"epoch": 3.168,
"grad_norm": 0.03769225999712944,
"kl": 0.011178970336914062,
"learning_rate": 2.5825822726126095e-06,
"loss": 0.0099,
"reward": 7.425193428993225,
"reward_std": 1.2249933630228043,
"rewards/mrr_reward": 0.6672184988856316,
"rewards/rank_analyze_format_reward": 0.8207724988460541,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 612.90625,
"epoch": 3.176,
"grad_norm": 0.03580842167139053,
"kl": 0.01148223876953125,
"learning_rate": 2.5405885457581793e-06,
"loss": 0.0051,
"reward": 7.102351069450378,
"reward_std": 1.0760410577058792,
"rewards/mrr_reward": 0.5938120186328888,
"rewards/rank_analyze_format_reward": 0.7681185156106949,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 622.234375,
"epoch": 3.184,
"grad_norm": 0.03826119750738144,
"kl": 0.014284133911132812,
"learning_rate": 2.4988893036954045e-06,
"loss": 0.0084,
"reward": 7.598180770874023,
"reward_std": 1.0497512221336365,
"rewards/mrr_reward": 0.7157738208770752,
"rewards/rank_analyze_format_reward": 0.8209080398082733,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 629.640625,
"epoch": 3.192,
"grad_norm": 0.039424311369657516,
"kl": 0.012294769287109375,
"learning_rate": 2.4574861926389615e-06,
"loss": 0.0079,
"reward": 7.362653613090515,
"reward_std": 0.824449434876442,
"rewards/mrr_reward": 0.6566406339406967,
"rewards/rank_analyze_format_reward": 0.8023822903633118,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 642.015625,
"epoch": 3.2,
"grad_norm": 0.03927793353796005,
"kl": 0.01302337646484375,
"learning_rate": 2.4163808471127815e-06,
"loss": -0.0046,
"reward": 7.52125608921051,
"reward_std": 1.2097734808921814,
"rewards/mrr_reward": 0.7062934041023254,
"rewards/rank_analyze_format_reward": 0.7816215455532074,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.014070273377001286,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 611.1875,
"epoch": 3.208,
"grad_norm": 0.03897445276379585,
"kl": 0.013471603393554688,
"learning_rate": 2.37557488988552e-06,
"loss": -0.0031,
"reward": 6.832857847213745,
"reward_std": 1.2639935612678528,
"rewards/mrr_reward": 0.5474764406681061,
"rewards/rank_analyze_format_reward": 0.7873684614896774,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 627.421875,
"epoch": 3.216,
"grad_norm": 0.03820272535085678,
"kl": 0.012819290161132812,
"learning_rate": 2.335069931906503e-06,
"loss": -0.0068,
"reward": 7.258768320083618,
"reward_std": 1.3467806428670883,
"rewards/mrr_reward": 0.6381944566965103,
"rewards/rank_analyze_format_reward": 0.7626311928033829,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 623.78125,
"epoch": 3.224,
"grad_norm": 0.037081021815538406,
"kl": 0.013256072998046875,
"learning_rate": 2.2948675722421086e-06,
"loss": -0.0032,
"reward": 7.068072199821472,
"reward_std": 1.0497987121343613,
"rewards/mrr_reward": 0.5989149361848831,
"rewards/rank_analyze_format_reward": 0.7660476416349411,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 605.0625,
"epoch": 3.232,
"grad_norm": 0.0360063761472702,
"kl": 0.015211105346679688,
"learning_rate": 2.254969398012663e-06,
"loss": -0.0158,
"reward": 7.160408020019531,
"reward_std": 1.019886076450348,
"rewards/mrr_reward": 0.646112360060215,
"rewards/rank_analyze_format_reward": 0.7062013298273087,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 625.453125,
"epoch": 3.24,
"grad_norm": 0.03534236177802086,
"kl": 0.014194488525390625,
"learning_rate": 2.215376984329767e-06,
"loss": -0.0216,
"reward": 7.393091082572937,
"reward_std": 0.8330601751804352,
"rewards/mrr_reward": 0.6536458283662796,
"rewards/rank_analyze_format_reward": 0.7999920099973679,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 604.921875,
"epoch": 3.248,
"grad_norm": 0.038391463458538055,
"kl": 0.013559341430664062,
"learning_rate": 2.1760918942341193e-06,
"loss": -0.0178,
"reward": 7.6344475746154785,
"reward_std": 0.9299002774059772,
"rewards/mrr_reward": 0.7469618022441864,
"rewards/rank_analyze_format_reward": 0.7185573130846024,
"rewards/rank_answer_foramt_reward": 0.96875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 630.015625,
"epoch": 3.2560000000000002,
"grad_norm": 0.03509964421391487,
"kl": 0.01255035400390625,
"learning_rate": 2.1371156786338108e-06,
"loss": -0.0117,
"reward": 6.898256897926331,
"reward_std": 0.9209974706172943,
"rewards/mrr_reward": 0.5367559418082237,
"rewards/rank_analyze_format_reward": 0.8304647654294968,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965170323848724,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9808920323848724,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 608.5625,
"epoch": 3.2640000000000002,
"grad_norm": 0.03695489838719368,
"kl": 0.0167388916015625,
"learning_rate": 2.098449876243096e-06,
"loss": -0.0313,
"reward": 6.917726039886475,
"reward_std": 0.8459838628768921,
"rewards/mrr_reward": 0.5599144399166107,
"rewards/rank_analyze_format_reward": 0.7054120153188705,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 585.078125,
"epoch": 3.2720000000000002,
"grad_norm": 0.039599835872650146,
"kl": 0.014032363891601562,
"learning_rate": 2.0600960135216463e-06,
"loss": -0.0047,
"reward": 7.127155780792236,
"reward_std": 1.1092039048671722,
"rewards/mrr_reward": 0.6348276287317276,
"rewards/rank_analyze_format_reward": 0.7182396054267883,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 606.296875,
"epoch": 3.2800000000000002,
"grad_norm": 0.039675965905189514,
"kl": 0.013446807861328125,
"learning_rate": 2.022055604614289e-06,
"loss": -0.0103,
"reward": 6.730955481529236,
"reward_std": 0.9299670159816742,
"rewards/mrr_reward": 0.5159474387764931,
"rewards/rank_analyze_format_reward": 0.7542870342731476,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9945252537727356,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9945252537727356,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 636.3125,
"epoch": 3.288,
"grad_norm": 0.03656487911939621,
"kl": 0.012666702270507812,
"learning_rate": 1.984330151291233e-06,
"loss": -0.016,
"reward": 6.7821091413497925,
"reward_std": 0.9743772521615028,
"rewards/mrr_reward": 0.5557911694049835,
"rewards/rank_analyze_format_reward": 0.7064568400382996,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9819079041481018,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9662829041481018,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 631.171875,
"epoch": 3.296,
"grad_norm": 0.037856802344322205,
"kl": 0.01450347900390625,
"learning_rate": 1.9469211428887813e-06,
"loss": -0.0176,
"reward": 6.753392338752747,
"reward_std": 0.9574991762638092,
"rewards/mrr_reward": 0.5006696432828903,
"rewards/rank_analyze_format_reward": 0.8190732151269913,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 618.59375,
"epoch": 3.304,
"grad_norm": 0.040868956595659256,
"kl": 0.011432647705078125,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.0035,
"reward": 6.817109823226929,
"reward_std": 0.7666200622916222,
"rewards/mrr_reward": 0.5467447973787785,
"rewards/rank_analyze_format_reward": 0.7559238225221634,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9966736733913422,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9966736733913422,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 616.671875,
"epoch": 3.312,
"grad_norm": 0.04021133482456207,
"kl": 0.013158798217773438,
"learning_rate": 1.8730583556690607e-06,
"loss": 0.0066,
"reward": 6.991716146469116,
"reward_std": 0.8390699215233326,
"rewards/mrr_reward": 0.5647073462605476,
"rewards/rank_analyze_format_reward": 0.8067970871925354,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 644.8125,
"epoch": 3.32,
"grad_norm": 0.037540923804044724,
"kl": 0.01172637939453125,
"learning_rate": 1.8366074928281608e-06,
"loss": 0.0074,
"reward": 7.544142961502075,
"reward_std": 1.008560985326767,
"rewards/mrr_reward": 0.6979166716337204,
"rewards/rank_analyze_format_reward": 0.8081740438938141,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.994612067937851,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.994612067937851,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 584.15625,
"epoch": 3.328,
"grad_norm": 0.04208629950881004,
"kl": 0.012399673461914062,
"learning_rate": 1.8004789067454763e-06,
"loss": -0.0386,
"reward": 7.588791251182556,
"reward_std": 1.3209501877427101,
"rewards/mrr_reward": 0.7485863268375397,
"rewards/rank_analyze_format_reward": 0.6915150880813599,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9924812018871307,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9924812018871307,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 635.203125,
"epoch": 3.336,
"grad_norm": 0.043379127979278564,
"kl": 0.012195587158203125,
"learning_rate": 1.7646740237157256e-06,
"loss": 0.0323,
"reward": 7.012084484100342,
"reward_std": 1.0143009573221207,
"rewards/mrr_reward": 0.5759734660387039,
"rewards/rank_analyze_format_reward": 0.8153042197227478,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 598.859375,
"epoch": 3.344,
"grad_norm": 0.04017612338066101,
"kl": 0.014421463012695312,
"learning_rate": 1.7291942572543806e-06,
"loss": -0.006,
"reward": 6.709989428520203,
"reward_std": 1.0495906621217728,
"rewards/mrr_reward": 0.529706098139286,
"rewards/rank_analyze_format_reward": 0.7245771586894989,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967704266309738,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9967704266309738,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 580.203125,
"epoch": 3.352,
"grad_norm": 0.040608614683151245,
"kl": 0.012868881225585938,
"learning_rate": 1.6940410080418723e-06,
"loss": -0.0019,
"reward": 7.201984643936157,
"reward_std": 0.7756945788860321,
"rewards/mrr_reward": 0.6367187723517418,
"rewards/rank_analyze_format_reward": 0.7234692126512527,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 634.3125,
"epoch": 3.36,
"grad_norm": 0.03586459904909134,
"kl": 0.011873245239257812,
"learning_rate": 1.6592156638682887e-06,
"loss": -0.0093,
"reward": 7.138599634170532,
"reward_std": 0.7431515604257584,
"rewards/mrr_reward": 0.6127170100808144,
"rewards/rank_analyze_format_reward": 0.7191510647535324,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9979619532823563,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9979619532823563,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 644.703125,
"epoch": 3.368,
"grad_norm": 0.03901754319667816,
"kl": 0.01136016845703125,
"learning_rate": 1.6247195995785836e-06,
"loss": 0.003,
"reward": 6.6844483613967896,
"reward_std": 0.7994736880064011,
"rewards/mrr_reward": 0.4797743149101734,
"rewards/rank_analyze_format_reward": 0.8219916969537735,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 605.625,
"epoch": 3.376,
"grad_norm": 0.03650727495551109,
"kl": 0.01389312744140625,
"learning_rate": 1.5905541770183096e-06,
"loss": -0.0195,
"reward": 6.6595494747161865,
"reward_std": 0.500478945672512,
"rewards/mrr_reward": 0.4967882111668587,
"rewards/rank_analyze_format_reward": 0.7016936540603638,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 622.78125,
"epoch": 3.384,
"grad_norm": 0.04294556751847267,
"kl": 0.012430191040039062,
"learning_rate": 1.5567207449798517e-06,
"loss": 0.0142,
"reward": 7.437200546264648,
"reward_std": 0.8200259059667587,
"rewards/mrr_reward": 0.687189981341362,
"rewards/rank_analyze_format_reward": 0.7644545584917068,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.013829787261784077,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 630.3125,
"epoch": 3.392,
"grad_norm": 0.03355753794312477,
"kl": 0.011930465698242188,
"learning_rate": 1.52322063914917e-06,
"loss": -0.0143,
"reward": 7.236422419548035,
"reward_std": 1.3275894522666931,
"rewards/mrr_reward": 0.6320932507514954,
"rewards/rank_analyze_format_reward": 0.7606691271066666,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 616.0,
"epoch": 3.4,
"grad_norm": 0.03680611029267311,
"kl": 0.0124969482421875,
"learning_rate": 1.490055182053083e-06,
"loss": -0.0241,
"reward": 7.067311525344849,
"reward_std": 0.7071668058633804,
"rewards/mrr_reward": 0.5894097089767456,
"rewards/rank_analyze_format_reward": 0.7389693707227707,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 618.25,
"epoch": 3.408,
"grad_norm": 0.03699398413300514,
"kl": 0.012582778930664062,
"learning_rate": 1.4572256830070497e-06,
"loss": 0.0013,
"reward": 7.260975360870361,
"reward_std": 0.6704662144184113,
"rewards/mrr_reward": 0.6398809552192688,
"rewards/rank_analyze_format_reward": 0.8204772174358368,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 598.4375,
"epoch": 3.416,
"grad_norm": 0.04159224405884743,
"kl": 0.014894485473632812,
"learning_rate": 1.4247334380634792e-06,
"loss": -0.0191,
"reward": 7.206877589225769,
"reward_std": 0.8614709973335266,
"rewards/mrr_reward": 0.6289062574505806,
"rewards/rank_analyze_format_reward": 0.765241265296936,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 644.125,
"epoch": 3.424,
"grad_norm": 0.036023661494255066,
"kl": 0.011859893798828125,
"learning_rate": 1.3925797299605649e-06,
"loss": -0.0067,
"reward": 6.5737926959991455,
"reward_std": 0.9088378921151161,
"rewards/mrr_reward": 0.4585689455270767,
"rewards/rank_analyze_format_reward": 0.7960425764322281,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 624.1875,
"epoch": 3.432,
"grad_norm": 0.047732554376125336,
"kl": 0.011541366577148438,
"learning_rate": 1.3607658280716474e-06,
"loss": -0.028,
"reward": 7.2520798444747925,
"reward_std": 1.1335118561983109,
"rewards/mrr_reward": 0.6308097690343857,
"rewards/rank_analyze_format_reward": 0.7853662818670273,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 596.40625,
"epoch": 3.44,
"grad_norm": 0.03683609887957573,
"kl": 0.013605117797851562,
"learning_rate": 1.3292929883550998e-06,
"loss": -0.0073,
"reward": 8.030953884124756,
"reward_std": 0.5688543245196342,
"rewards/mrr_reward": 0.8156249970197678,
"rewards/rank_analyze_format_reward": 0.7977506220340729,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 605.25,
"epoch": 3.448,
"grad_norm": 0.03646247833967209,
"kl": 0.013217926025390625,
"learning_rate": 1.2981624533047432e-06,
"loss": 0.0074,
"reward": 6.805210113525391,
"reward_std": 0.8861361294984818,
"rewards/mrr_reward": 0.5289062447845936,
"rewards/rank_analyze_format_reward": 0.7668732404708862,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9994419664144516,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9994419664144516,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 609.875,
"epoch": 3.456,
"grad_norm": 0.03660254180431366,
"kl": 0.011661529541015625,
"learning_rate": 1.2673754519008008e-06,
"loss": -0.0102,
"reward": 6.942854642868042,
"reward_std": 0.8882918208837509,
"rewards/mrr_reward": 0.5530754029750824,
"rewards/rank_analyze_format_reward": 0.7735218703746796,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 620.671875,
"epoch": 3.464,
"grad_norm": 0.037669289857149124,
"kl": 0.013912200927734375,
"learning_rate": 1.2369331995613664e-06,
"loss": -0.0091,
"reward": 7.411279797554016,
"reward_std": 1.1482711285352707,
"rewards/mrr_reward": 0.673480898141861,
"rewards/rank_analyze_format_reward": 0.7954811006784439,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 642.984375,
"epoch": 3.472,
"grad_norm": 0.037749458104372025,
"kl": 0.01103973388671875,
"learning_rate": 1.206836898094439e-06,
"loss": -0.0045,
"reward": 7.326077461242676,
"reward_std": 0.8330521434545517,
"rewards/mrr_reward": 0.6379278004169464,
"rewards/rank_analyze_format_reward": 0.7898766249418259,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 642.78125,
"epoch": 3.48,
"grad_norm": 0.035765353590250015,
"kl": 0.012155532836914062,
"learning_rate": 1.1770877356504684e-06,
"loss": -0.02,
"reward": 7.883293986320496,
"reward_std": 0.8871591687202454,
"rewards/mrr_reward": 0.7798177152872086,
"rewards/rank_analyze_format_reward": 0.7656676918268204,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 611.78125,
"epoch": 3.488,
"grad_norm": 0.03705098107457161,
"kl": 0.012918472290039062,
"learning_rate": 1.1476868866754488e-06,
"loss": -0.0083,
"reward": 6.660637736320496,
"reward_std": 0.9346826821565628,
"rewards/mrr_reward": 0.500713050365448,
"rewards/rank_analyze_format_reward": 0.7608778774738312,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 628.71875,
"epoch": 3.496,
"grad_norm": 0.03969808667898178,
"kl": 0.015727996826171875,
"learning_rate": 1.1186355118645552e-06,
"loss": -0.0132,
"reward": 7.052313804626465,
"reward_std": 1.0558638274669647,
"rewards/mrr_reward": 0.5923363342881203,
"rewards/rank_analyze_format_reward": 0.7863690704107285,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 606.859375,
"epoch": 3.504,
"grad_norm": 0.037416357547044754,
"kl": 0.012334823608398438,
"learning_rate": 1.0899347581163222e-06,
"loss": -0.0176,
"reward": 7.955611228942871,
"reward_std": 0.7251264750957489,
"rewards/mrr_reward": 0.796651765704155,
"rewards/rank_analyze_format_reward": 0.8157641887664795,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 637.5,
"epoch": 3.512,
"grad_norm": 0.03700832277536392,
"kl": 0.011600494384765625,
"learning_rate": 1.0615857584873624e-06,
"loss": 0.0115,
"reward": 7.665433883666992,
"reward_std": 0.595935083925724,
"rewards/mrr_reward": 0.7211123704910278,
"rewards/rank_analyze_format_reward": 0.8138794153928757,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 615.5,
"epoch": 3.52,
"grad_norm": 0.038077887147665024,
"kl": 0.01373291015625,
"learning_rate": 1.0335896321476413e-06,
"loss": -0.0342,
"reward": 7.015413165092468,
"reward_std": 1.31123448908329,
"rewards/mrr_reward": 0.5999503880739212,
"rewards/rank_analyze_format_reward": 0.7072935104370117,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 658.0625,
"epoch": 3.528,
"grad_norm": 0.03242430090904236,
"kl": 0.013689041137695312,
"learning_rate": 1.0059474843362893e-06,
"loss": -0.0198,
"reward": 6.429423809051514,
"reward_std": 0.8046993911266327,
"rewards/mrr_reward": 0.43027032166719437,
"rewards/rank_analyze_format_reward": 0.7727955877780914,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 671.890625,
"epoch": 3.536,
"grad_norm": 0.034074753522872925,
"kl": 0.0110321044921875,
"learning_rate": 9.786604063179728e-07,
"loss": -0.0013,
"reward": 7.255928158760071,
"reward_std": 0.7968212515115738,
"rewards/mrr_reward": 0.6222656294703484,
"rewards/rank_analyze_format_reward": 0.8351100534200668,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 612.109375,
"epoch": 3.544,
"grad_norm": 0.040858954191207886,
"kl": 0.0142974853515625,
"learning_rate": 9.517294753398066e-07,
"loss": 0.0039,
"reward": 7.048562169075012,
"reward_std": 0.9897879660129547,
"rewards/mrr_reward": 0.5950024798512459,
"rewards/rank_analyze_format_reward": 0.7603489309549332,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 630.3125,
"epoch": 3.552,
"grad_norm": 0.033798523247241974,
"kl": 0.01203155517578125,
"learning_rate": 9.251557545888312e-07,
"loss": -0.0221,
"reward": 7.666335582733154,
"reward_std": 0.6299453526735306,
"rewards/mrr_reward": 0.7138020843267441,
"rewards/rank_analyze_format_reward": 0.8006909340620041,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.01411290280520916,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 587.46875,
"epoch": 3.56,
"grad_norm": 0.039384886622428894,
"kl": 0.012357711791992188,
"learning_rate": 8.989402931500434e-07,
"loss": -0.0138,
"reward": 8.181223034858704,
"reward_std": 0.7046910002827644,
"rewards/mrr_reward": 0.875,
"rewards/rank_analyze_format_reward": 0.6948948577046394,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 606.15625,
"epoch": 3.568,
"grad_norm": 0.03874967247247696,
"kl": 0.015522003173828125,
"learning_rate": 8.730841259649725e-07,
"loss": -0.0254,
"reward": 7.0217931270599365,
"reward_std": 0.8901334404945374,
"rewards/mrr_reward": 0.6117807626724243,
"rewards/rank_analyze_format_reward": 0.633263885974884,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 623.296875,
"epoch": 3.576,
"grad_norm": 0.03479482978582382,
"kl": 0.011865615844726562,
"learning_rate": 8.475882737908248e-07,
"loss": 0.0008,
"reward": 7.554774522781372,
"reward_std": 0.9437515586614609,
"rewards/mrr_reward": 0.7163194417953491,
"rewards/rank_analyze_format_reward": 0.7170282900333405,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0127108134329319,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 632.0625,
"epoch": 3.584,
"grad_norm": 0.03798670321702957,
"kl": 0.012453079223632812,
"learning_rate": 8.224537431601886e-07,
"loss": 0.0001,
"reward": 6.237062215805054,
"reward_std": 0.5429144222289324,
"rewards/mrr_reward": 0.3848772421479225,
"rewards/rank_analyze_format_reward": 0.7522407919168472,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 619.234375,
"epoch": 3.592,
"grad_norm": 0.036835283041000366,
"kl": 0.012708663940429688,
"learning_rate": 7.976815263412963e-07,
"loss": -0.0548,
"reward": 6.972110390663147,
"reward_std": 1.0418353527784348,
"rewards/mrr_reward": 0.6005208343267441,
"rewards/rank_analyze_format_reward": 0.6950270235538483,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 606.84375,
"epoch": 3.6,
"grad_norm": 0.03548385202884674,
"kl": 0.01288604736328125,
"learning_rate": 7.732726012988512e-07,
"loss": -0.0231,
"reward": 7.123287677764893,
"reward_std": 0.9100038930773735,
"rewards/mrr_reward": 0.6060701757669449,
"rewards/rank_analyze_format_reward": 0.749788224697113,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 617.828125,
"epoch": 3.608,
"grad_norm": 0.03627340868115425,
"kl": 0.011266708374023438,
"learning_rate": 7.492279316554207e-07,
"loss": -0.0177,
"reward": 7.946985602378845,
"reward_std": 0.7550379931926727,
"rewards/mrr_reward": 0.80078125,
"rewards/rank_analyze_format_reward": 0.7886674106121063,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 612.265625,
"epoch": 3.616,
"grad_norm": 0.04377419501543045,
"kl": 0.0131683349609375,
"learning_rate": 7.255484666533874e-07,
"loss": -0.0026,
"reward": 6.993055105209351,
"reward_std": 1.2321006208658218,
"rewards/mrr_reward": 0.6157738119363785,
"rewards/rank_analyze_format_reward": 0.705741174519062,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 572.984375,
"epoch": 3.624,
"grad_norm": 0.04109544679522514,
"kl": 0.01351165771484375,
"learning_rate": 7.022351411174866e-07,
"loss": 0.005,
"reward": 7.255419611930847,
"reward_std": 1.0537290424108505,
"rewards/mrr_reward": 0.6588541716337204,
"rewards/rank_analyze_format_reward": 0.7091151028871536,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 606.953125,
"epoch": 3.632,
"grad_norm": 0.03640174865722656,
"kl": 0.012472152709960938,
"learning_rate": 6.792888754178906e-07,
"loss": -0.0046,
"reward": 7.461974620819092,
"reward_std": 0.8970663994550705,
"rewards/mrr_reward": 0.688616082072258,
"rewards/rank_analyze_format_reward": 0.7289945930242538,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 609.234375,
"epoch": 3.64,
"grad_norm": 0.03923455998301506,
"kl": 0.0132598876953125,
"learning_rate": 6.567105754338798e-07,
"loss": -0.0055,
"reward": 7.089033126831055,
"reward_std": 1.0927991718053818,
"rewards/mrr_reward": 0.6148189604282379,
"rewards/rank_analyze_format_reward": 0.7315036952495575,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.013373362831771374,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 670.09375,
"epoch": 3.648,
"grad_norm": 0.03714577481150627,
"kl": 0.012269973754882812,
"learning_rate": 6.345011325180772e-07,
"loss": -0.006,
"reward": 6.888863801956177,
"reward_std": 0.7847508117556572,
"rewards/mrr_reward": 0.5322854816913605,
"rewards/rank_analyze_format_reward": 0.8160540610551834,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 576.171875,
"epoch": 3.656,
"grad_norm": 0.03968933969736099,
"kl": 0.014535903930664062,
"learning_rate": 6.126614234612593e-07,
"loss": -0.0031,
"reward": 6.883362054824829,
"reward_std": 1.1338584274053574,
"rewards/mrr_reward": 0.5649925693869591,
"rewards/rank_analyze_format_reward": 0.7210482209920883,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 609.625,
"epoch": 3.664,
"grad_norm": 0.0411507710814476,
"kl": 0.014348983764648438,
"learning_rate": 5.911923104577455e-07,
"loss": -0.017,
"reward": 6.8970195055007935,
"reward_std": 0.7303311824798584,
"rewards/mrr_reward": 0.544177807867527,
"rewards/rank_analyze_format_reward": 0.7589471489191055,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 600.109375,
"epoch": 3.672,
"grad_norm": 0.03855804726481438,
"kl": 0.013666152954101562,
"learning_rate": 5.700946410713548e-07,
"loss": 0.0051,
"reward": 7.619644403457642,
"reward_std": 0.8956380970776081,
"rewards/mrr_reward": 0.7256944477558136,
"rewards/rank_analyze_format_reward": 0.7556206434965134,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 622.765625,
"epoch": 3.68,
"grad_norm": 0.04349582642316818,
"kl": 0.011423110961914062,
"learning_rate": 5.49369248201953e-07,
"loss": 0.0175,
"reward": 6.194160223007202,
"reward_std": 0.9866833090782166,
"rewards/mrr_reward": 0.3742373511195183,
"rewards/rank_analyze_format_reward": 0.8241638392210007,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 605.609375,
"epoch": 3.6879999999999997,
"grad_norm": 0.035147525370121,
"kl": 0.011508941650390625,
"learning_rate": 5.290169500525577e-07,
"loss": -0.0124,
"reward": 7.419980049133301,
"reward_std": 0.46903695818036795,
"rewards/mrr_reward": 0.7058593779802322,
"rewards/rank_analyze_format_reward": 0.5965423956513405,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 607.328125,
"epoch": 3.6959999999999997,
"grad_norm": 0.038031551986932755,
"kl": 0.0143585205078125,
"learning_rate": 5.090385500970551e-07,
"loss": -0.0282,
"reward": 7.524974226951599,
"reward_std": 0.7495295517146587,
"rewards/mrr_reward": 0.7476562410593033,
"rewards/rank_analyze_format_reward": 0.616380512714386,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 624.5625,
"epoch": 3.7039999999999997,
"grad_norm": 0.03687411919236183,
"kl": 0.01457977294921875,
"learning_rate": 4.894348370484648e-07,
"loss": -0.0306,
"reward": 6.9998191595077515,
"reward_std": 1.1533474028110504,
"rewards/mrr_reward": 0.5960689634084702,
"rewards/rank_analyze_format_reward": 0.72789466381073,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9828869104385376,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 662.578125,
"epoch": 3.7119999999999997,
"grad_norm": 0.035118553787469864,
"kl": 0.010381698608398438,
"learning_rate": 4.702065848278126e-07,
"loss": 0.001,
"reward": 7.487505078315735,
"reward_std": 1.193233162164688,
"rewards/mrr_reward": 0.7095052152872086,
"rewards/rank_analyze_format_reward": 0.7606973052024841,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 613.640625,
"epoch": 3.7199999999999998,
"grad_norm": 0.039265792816877365,
"kl": 0.012542724609375,
"learning_rate": 4.5135455253357053e-07,
"loss": -0.02,
"reward": 6.985211730003357,
"reward_std": 0.8345009088516235,
"rewards/mrr_reward": 0.5719804167747498,
"rewards/rank_analyze_format_reward": 0.7383057624101639,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 598.15625,
"epoch": 3.7279999999999998,
"grad_norm": 0.03676972910761833,
"kl": 0.013540267944335938,
"learning_rate": 4.3287948441169457e-07,
"loss": -0.0256,
"reward": 7.533303260803223,
"reward_std": 0.6752141863107681,
"rewards/mrr_reward": 0.7332217246294022,
"rewards/rank_analyze_format_reward": 0.6644462794065475,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 623.875,
"epoch": 3.7359999999999998,
"grad_norm": 0.03843839094042778,
"kl": 0.012653350830078125,
"learning_rate": 4.1478210982624055e-07,
"loss": -0.0137,
"reward": 7.101514220237732,
"reward_std": 0.712131037376821,
"rewards/mrr_reward": 0.6059895902872086,
"rewards/rank_analyze_format_reward": 0.7317783385515213,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 627.5,
"epoch": 3.7439999999999998,
"grad_norm": 0.038445886224508286,
"kl": 0.0115203857421875,
"learning_rate": 3.9706314323056936e-07,
"loss": -0.001,
"reward": 7.2100324630737305,
"reward_std": 0.8441106081008911,
"rewards/mrr_reward": 0.6200706958770752,
"rewards/rank_analyze_format_reward": 0.8110490888357162,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 567.046875,
"epoch": 3.752,
"grad_norm": 0.0397077351808548,
"kl": 0.012897491455078125,
"learning_rate": 3.7972328413914074e-07,
"loss": -0.0162,
"reward": 7.725077509880066,
"reward_std": 1.1218221932649612,
"rewards/mrr_reward": 0.7771391421556473,
"rewards/rank_analyze_format_reward": 0.6673022508621216,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 634.859375,
"epoch": 3.76,
"grad_norm": 0.035067442804574966,
"kl": 0.01100921630859375,
"learning_rate": 3.627632170999029e-07,
"loss": -0.0112,
"reward": 7.772274732589722,
"reward_std": 0.4194560647010803,
"rewards/mrr_reward": 0.7544270902872086,
"rewards/rank_analyze_format_reward": 0.75864277780056,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9979619532823563,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9979619532823563,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 629.296875,
"epoch": 3.768,
"grad_norm": 0.03988838940858841,
"kl": 0.013881683349609375,
"learning_rate": 3.4618361166726123e-07,
"loss": 0.0089,
"reward": 6.951045513153076,
"reward_std": 0.9670315980911255,
"rewards/mrr_reward": 0.5557911768555641,
"rewards/rank_analyze_format_reward": 0.811864972114563,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 610.359375,
"epoch": 3.776,
"grad_norm": 0.04017311707139015,
"kl": 0.012922286987304688,
"learning_rate": 3.2998512237565005e-07,
"loss": 0.0021,
"reward": 6.655686378479004,
"reward_std": 0.8838780298829079,
"rewards/mrr_reward": 0.49754463881254196,
"rewards/rank_analyze_format_reward": 0.7454710304737091,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 598.546875,
"epoch": 3.784,
"grad_norm": 0.04061814397573471,
"kl": 0.015371322631835938,
"learning_rate": 3.1416838871368925e-07,
"loss": 0.0023,
"reward": 6.667187333106995,
"reward_std": 1.6450905501842499,
"rewards/mrr_reward": 0.546354167163372,
"rewards/rank_analyze_format_reward": 0.6706438362598419,
"rewards/rank_answer_foramt_reward": 0.822265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 634.578125,
"epoch": 3.792,
"grad_norm": 0.03757631406188011,
"kl": 0.011842727661132812,
"learning_rate": 2.987340350989421e-07,
"loss": -0.0277,
"reward": 7.062578439712524,
"reward_std": 0.7171650826931,
"rewards/mrr_reward": 0.5821800529956818,
"rewards/rank_analyze_format_reward": 0.8331592828035355,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 630.390625,
"epoch": 3.8,
"grad_norm": 0.0385047122836113,
"kl": 0.013031005859375,
"learning_rate": 2.836826708532603e-07,
"loss": -0.0071,
"reward": 6.774095058441162,
"reward_std": 1.0064187571406364,
"rewards/mrr_reward": 0.5209139287471771,
"rewards/rank_analyze_format_reward": 0.80762679874897,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 621.0,
"epoch": 3.808,
"grad_norm": 0.03585473448038101,
"kl": 0.013561248779296875,
"learning_rate": 2.6901489017873375e-07,
"loss": 0.0157,
"reward": 7.464797139167786,
"reward_std": 0.975187674164772,
"rewards/mrr_reward": 0.7072048783302307,
"rewards/rank_analyze_format_reward": 0.7355870008468628,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 597.921875,
"epoch": 3.816,
"grad_norm": 0.039436955004930496,
"kl": 0.013492584228515625,
"learning_rate": 2.547312721342277e-07,
"loss": -0.0445,
"reward": 6.666959643363953,
"reward_std": 0.9370896592736244,
"rewards/mrr_reward": 0.5154203921556473,
"rewards/rank_analyze_format_reward": 0.6805618405342102,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.997514471411705,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.997514471411705,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 642.046875,
"epoch": 3.824,
"grad_norm": 0.03860106319189072,
"kl": 0.013734817504882812,
"learning_rate": 2.4083238061252565e-07,
"loss": 0.0084,
"reward": 6.7049055099487305,
"reward_std": 0.43066950887441635,
"rewards/mrr_reward": 0.4952381029725075,
"rewards/rank_analyze_format_reward": 0.7649687975645065,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 616.34375,
"epoch": 3.832,
"grad_norm": 0.041024912148714066,
"kl": 0.012483596801757812,
"learning_rate": 2.273187643180652e-07,
"loss": -0.023,
"reward": 6.57793653011322,
"reward_std": 1.0732970535755157,
"rewards/mrr_reward": 0.4735739082098007,
"rewards/rank_analyze_format_reward": 0.7500470578670502,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 606.578125,
"epoch": 3.84,
"grad_norm": 0.034815359860658646,
"kl": 0.011325836181640625,
"learning_rate": 2.1419095674527934e-07,
"loss": 0.0153,
"reward": 6.932149052619934,
"reward_std": 1.0536329746246338,
"rewards/mrr_reward": 0.5726066380739212,
"rewards/rank_analyze_format_reward": 0.731004387140274,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9953981041908264,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9953981041908264,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 630.40625,
"epoch": 3.848,
"grad_norm": 0.036056023091077805,
"kl": 0.01219940185546875,
"learning_rate": 2.014494761575314e-07,
"loss": -0.0102,
"reward": 7.459859132766724,
"reward_std": 0.681392565369606,
"rewards/mrr_reward": 0.6882998645305634,
"rewards/rank_analyze_format_reward": 0.7418159544467926,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 632.6875,
"epoch": 3.856,
"grad_norm": 0.038142506033182144,
"kl": 0.011157989501953125,
"learning_rate": 1.8909482556666026e-07,
"loss": -0.0052,
"reward": 6.9656277894973755,
"reward_std": 1.1684068441390991,
"rewards/mrr_reward": 0.5538752377033234,
"rewards/rank_analyze_format_reward": 0.8277124911546707,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9953869134187698,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9953869134187698,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 588.96875,
"epoch": 3.864,
"grad_norm": 0.03848228603601456,
"kl": 0.01366424560546875,
"learning_rate": 1.7712749271311392e-07,
"loss": -0.0115,
"reward": 7.90727972984314,
"reward_std": 0.9489937871694565,
"rewards/mrr_reward": 0.8121279776096344,
"rewards/rank_analyze_format_reward": 0.7349397465586662,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 592.140625,
"epoch": 3.872,
"grad_norm": 0.03674139454960823,
"kl": 0.01291656494140625,
"learning_rate": 1.6554795004670389e-07,
"loss": -0.0121,
"reward": 7.151305317878723,
"reward_std": 1.3574425652623177,
"rewards/mrr_reward": 0.6450272798538208,
"rewards/rank_analyze_format_reward": 0.7020555436611176,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 593.875,
"epoch": 3.88,
"grad_norm": 0.03965899720788002,
"kl": 0.012401580810546875,
"learning_rate": 1.543566547079467e-07,
"loss": -0.0167,
"reward": 6.482243657112122,
"reward_std": 0.920079916715622,
"rewards/mrr_reward": 0.4476686418056488,
"rewards/rank_analyze_format_reward": 0.7362185418605804,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.013944223523139954,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 614.71875,
"epoch": 3.888,
"grad_norm": 0.038730841130018234,
"kl": 0.015628814697265625,
"learning_rate": 1.4355404851001953e-07,
"loss": -0.0013,
"reward": 7.664029955863953,
"reward_std": 0.8843671232461929,
"rewards/mrr_reward": 0.7207217365503311,
"rewards/rank_analyze_format_reward": 0.8122782856225967,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 584.703125,
"epoch": 3.896,
"grad_norm": 0.03808212652802467,
"kl": 0.013837814331054688,
"learning_rate": 1.3314055792131964e-07,
"loss": 0.0003,
"reward": 7.29535174369812,
"reward_std": 0.6776691898703575,
"rewards/mrr_reward": 0.6445312649011612,
"rewards/rank_analyze_format_reward": 0.7601954787969589,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 632.71875,
"epoch": 3.904,
"grad_norm": 0.04019862040877342,
"kl": 0.012102127075195312,
"learning_rate": 1.231165940486234e-07,
"loss": 0.0098,
"reward": 7.2379196882247925,
"reward_std": 1.0804044008255005,
"rewards/mrr_reward": 0.6402343809604645,
"rewards/rank_analyze_format_reward": 0.7469863891601562,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 611.21875,
"epoch": 3.912,
"grad_norm": 0.04155530408024788,
"kl": 0.014692306518554688,
"learning_rate": 1.134825526208605e-07,
"loss": -0.0172,
"reward": 6.462707042694092,
"reward_std": 1.3991620540618896,
"rewards/mrr_reward": 0.4669705033302307,
"rewards/rank_analyze_format_reward": 0.7217782437801361,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 616.140625,
"epoch": 3.92,
"grad_norm": 0.038728028535842896,
"kl": 0.011913299560546875,
"learning_rate": 1.0423881397349067e-07,
"loss": -0.0274,
"reward": 6.584546685218811,
"reward_std": 0.987204298377037,
"rewards/mrr_reward": 0.47931547462940216,
"rewards/rank_analyze_format_reward": 0.7864252328872681,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 643.890625,
"epoch": 3.928,
"grad_norm": 0.037255771458148956,
"kl": 0.012048721313476562,
"learning_rate": 9.538574303348813e-08,
"loss": -0.0003,
"reward": 7.248134255409241,
"reward_std": 0.609087161719799,
"rewards/mrr_reward": 0.6106956899166107,
"rewards/rank_analyze_format_reward": 0.8346483111381531,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 607.09375,
"epoch": 3.936,
"grad_norm": 0.03653496131300926,
"kl": 0.014324188232421875,
"learning_rate": 8.692368930493522e-08,
"loss": -0.0078,
"reward": 7.079232692718506,
"reward_std": 0.7762657403945923,
"rewards/mrr_reward": 0.5865451470017433,
"rewards/rank_analyze_format_reward": 0.7700468897819519,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 630.9375,
"epoch": 3.944,
"grad_norm": 0.03887813538312912,
"kl": 0.01277923583984375,
"learning_rate": 7.885298685522235e-08,
"loss": 0.0019,
"reward": 6.853779196739197,
"reward_std": 0.5883737578988075,
"rewards/mrr_reward": 0.5194568485021591,
"rewards/rank_analyze_format_reward": 0.8206439018249512,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 628.890625,
"epoch": 3.952,
"grad_norm": 0.03760422766208649,
"kl": 0.012228012084960938,
"learning_rate": 7.117395430186414e-08,
"loss": -0.0105,
"reward": 7.296409845352173,
"reward_std": 1.1353522688150406,
"rewards/mrr_reward": 0.6470052301883698,
"rewards/rank_analyze_format_reward": 0.7630764245986938,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 632.453125,
"epoch": 3.96,
"grad_norm": 0.04088395833969116,
"kl": 0.011974334716796875,
"learning_rate": 6.388689479991606e-08,
"loss": -0.014,
"reward": 6.792389988899231,
"reward_std": 0.9225097447633743,
"rewards/mrr_reward": 0.5218750275671482,
"rewards/rank_analyze_format_reward": 0.7651283890008926,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 586.046875,
"epoch": 3.968,
"grad_norm": 0.038039859384298325,
"kl": 0.012334823608398438,
"learning_rate": 5.699209603001077e-08,
"loss": -0.0243,
"reward": 7.0821181535720825,
"reward_std": 1.2305900156497955,
"rewards/mrr_reward": 0.6263020932674408,
"rewards/rank_analyze_format_reward": 0.676519088447094,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 619.65625,
"epoch": 3.976,
"grad_norm": 0.042080074548721313,
"kl": 0.012048721313476562,
"learning_rate": 5.048983018699827e-08,
"loss": 0.0112,
"reward": 7.104416489601135,
"reward_std": 1.2093525528907776,
"rewards/mrr_reward": 0.5863157212734222,
"rewards/rank_analyze_format_reward": 0.7825910001993179,
"rewards/rank_answer_foramt_reward": 0.984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 602.734375,
"epoch": 3.984,
"grad_norm": 0.038315799087285995,
"kl": 0.010751724243164062,
"learning_rate": 4.438035396920004e-08,
"loss": -0.0141,
"reward": 6.625136733055115,
"reward_std": 0.9687140211462975,
"rewards/mrr_reward": 0.4734809100627899,
"rewards/rank_analyze_format_reward": 0.7722287178039551,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 618.96875,
"epoch": 3.992,
"grad_norm": 0.03733893856406212,
"kl": 0.01275634765625,
"learning_rate": 3.866390856827495e-08,
"loss": -0.0313,
"reward": 7.275609493255615,
"reward_std": 0.918476015329361,
"rewards/mrr_reward": 0.6309213787317276,
"rewards/rank_analyze_format_reward": 0.8395065367221832,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 582.34375,
"epoch": 4.0,
"grad_norm": 0.03825841844081879,
"kl": 0.015573501586914062,
"learning_rate": 3.3340719659701315e-08,
"loss": -0.0178,
"reward": 7.2012619972229,
"reward_std": 0.7794432565569878,
"rewards/mrr_reward": 0.6161458268761635,
"rewards/rank_analyze_format_reward": 0.795157715678215,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 500
},
{
"epoch": 4.0,
"step": 500,
"total_flos": 0.0,
"train_loss": -0.0018289514125790446,
"train_runtime": 36870.3642,
"train_samples_per_second": 0.868,
"train_steps_per_second": 0.014
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}