jnian's picture
Model save
302688f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 528.203125,
"epoch": 0.008,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0249,
"reward": 4.16574627161026,
"reward_std": 1.0997275561094284,
"rewards/mrr_reward": 0.13278149627149105,
"rewards/rank_analyze_format_reward": 0.20883905701339245,
"rewards/rank_answer_foramt_reward": 0.59765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 482.859375,
"epoch": 0.016,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0459,
"reward": 4.315212845802307,
"reward_std": 1.325284257531166,
"rewards/mrr_reward": 0.18193825148046017,
"rewards/rank_analyze_format_reward": 0.16742298379540443,
"rewards/rank_answer_foramt_reward": 0.59375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 504.859375,
"epoch": 0.024,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0326,
"reward": 4.513009071350098,
"reward_std": 1.3176036477088928,
"rewards/mrr_reward": 0.23714657500386238,
"rewards/rank_analyze_format_reward": 0.1366883972659707,
"rewards/rank_answer_foramt_reward": 0.662109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.828125,
"rewards/rank_verify_format_reward": 0.953125,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 501.5625,
"epoch": 0.032,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0358,
"reward": 4.18767774105072,
"reward_std": 0.9816120713949203,
"rewards/mrr_reward": 0.1309461873024702,
"rewards/rank_analyze_format_reward": 0.190928403288126,
"rewards/rank_answer_foramt_reward": 0.646484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.828125,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 517.453125,
"epoch": 0.04,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0459,
"reward": 4.4382476806640625,
"reward_std": 1.167427971959114,
"rewards/mrr_reward": 0.18655134364962578,
"rewards/rank_analyze_format_reward": 0.23099003173410892,
"rewards/rank_answer_foramt_reward": 0.634765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 523.921875,
"epoch": 0.048,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0513,
"reward": 4.052561104297638,
"reward_std": 1.0998588353395462,
"rewards/mrr_reward": 0.09990699402987957,
"rewards/rank_analyze_format_reward": 0.3598833493888378,
"rewards/rank_answer_foramt_reward": 0.490234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9912513643503189,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.9756263643503189,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 511.203125,
"epoch": 0.056,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0118,
"reward": 3.817609965801239,
"reward_std": 1.3295144587755203,
"rewards/mrr_reward": 0.10291418805718422,
"rewards/rank_analyze_format_reward": 0.22595737129449844,
"rewards/rank_answer_foramt_reward": 0.509765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9523026347160339,
"rewards/rank_overall_format_reward_more": 0.78125,
"rewards/rank_verify_format_reward": 0.9366776347160339,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 501.234375,
"epoch": 0.064,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0317,
"reward": 3.9616005420684814,
"reward_std": 1.4605186134576797,
"rewards/mrr_reward": 0.12518600933253765,
"rewards/rank_analyze_format_reward": 0.24394467286765575,
"rewards/rank_answer_foramt_reward": 0.4765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.7890625,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 485.953125,
"epoch": 0.072,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": -0.0058,
"reward": 3.873459815979004,
"reward_std": 1.0441433489322662,
"rewards/mrr_reward": 0.10626240447163582,
"rewards/rank_analyze_format_reward": 0.14176952932029963,
"rewards/rank_answer_foramt_reward": 0.525390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.828125,
"rewards/rank_verify_format_reward": 0.96875,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 515.296875,
"epoch": 0.08,
"grad_norm": 0.01915793865919113,
"kl": 0.0,
"learning_rate": 1.9999999684172664e-05,
"loss": -0.0341,
"reward": 4.031236290931702,
"reward_std": 1.048377439379692,
"rewards/mrr_reward": 0.09990079700946808,
"rewards/rank_analyze_format_reward": 0.19022684637457132,
"rewards/rank_answer_foramt_reward": 0.55078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 1.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 487.359375,
"epoch": 0.088,
"grad_norm": 0.02235390432178974,
"kl": -6.154179573059082e-06,
"learning_rate": 1.9999998736690666e-05,
"loss": -0.0483,
"reward": 4.058919072151184,
"reward_std": 1.0137622952461243,
"rewards/mrr_reward": 0.12898686341941357,
"rewards/rank_analyze_format_reward": 0.1387843620032072,
"rewards/rank_answer_foramt_reward": 0.603515625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9940857887268066,
"rewards/rank_overall_format_reward_more": 0.8125,
"rewards/rank_verify_format_reward": 0.9940857887268066,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 500.703125,
"epoch": 0.096,
"grad_norm": 0.019736869260668755,
"kl": -5.140900611877441e-06,
"learning_rate": 1.999999715755407e-05,
"loss": -0.0413,
"reward": 4.11133998632431,
"reward_std": 1.2341832220554352,
"rewards/mrr_reward": 0.12740575149655342,
"rewards/rank_analyze_format_reward": 0.3057297058403492,
"rewards/rank_answer_foramt_reward": 0.556640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9829545468091965,
"rewards/rank_overall_format_reward_more": 0.7890625,
"rewards/rank_verify_format_reward": 0.9673295468091965,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 499.734375,
"epoch": 0.104,
"grad_norm": 0.019736869260668755,
"kl": -3.725290298461914e-06,
"learning_rate": 1.999999715755407e-05,
"loss": -0.0211,
"reward": 4.32198166847229,
"reward_std": 0.9904958009719849,
"rewards/mrr_reward": 0.13051215931773186,
"rewards/rank_analyze_format_reward": 0.2530582267791033,
"rewards/rank_answer_foramt_reward": 0.65625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 1.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 474.59375,
"epoch": 0.112,
"grad_norm": 0.020436184480786324,
"kl": -3.859400749206543e-06,
"learning_rate": 1.9999994946762974e-05,
"loss": -0.0097,
"reward": 4.348296344280243,
"reward_std": 1.4071729183197021,
"rewards/mrr_reward": 0.19649058394134045,
"rewards/rank_analyze_format_reward": 0.12483388930559158,
"rewards/rank_answer_foramt_reward": 0.5546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 1.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 517.84375,
"epoch": 0.12,
"grad_norm": 0.019303128123283386,
"kl": -2.339482307434082e-06,
"learning_rate": 1.999999210431752e-05,
"loss": -0.0125,
"reward": 4.1298569440841675,
"reward_std": 1.0909616947174072,
"rewards/mrr_reward": 0.11587301827967167,
"rewards/rank_analyze_format_reward": 0.19317593052983284,
"rewards/rank_answer_foramt_reward": 0.607421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992897808551788,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.9992897808551788,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 489.984375,
"epoch": 0.128,
"grad_norm": 0.02006162703037262,
"kl": -5.513429641723633e-07,
"learning_rate": 1.9999988630217885e-05,
"loss": 0.004,
"reward": 4.19925457239151,
"reward_std": 1.0583490580320358,
"rewards/mrr_reward": 0.15414806827902794,
"rewards/rank_analyze_format_reward": 0.2584436684846878,
"rewards/rank_answer_foramt_reward": 0.55859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.953125,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 519.984375,
"epoch": 0.136,
"grad_norm": 0.02006162703037262,
"kl": -3.8743019104003906e-07,
"learning_rate": 1.9999988630217885e-05,
"loss": -0.0332,
"reward": 3.918194353580475,
"reward_std": 1.2305240333080292,
"rewards/mrr_reward": 0.12146577425301075,
"rewards/rank_analyze_format_reward": 0.1569407321512699,
"rewards/rank_answer_foramt_reward": 0.556640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.953125,
"rewards/rank_overall_format_reward_more": 0.8125,
"rewards/rank_verify_format_reward": 0.953125,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 520.96875,
"epoch": 0.144,
"grad_norm": 0.01881454698741436,
"kl": 1.4901161193847656e-08,
"learning_rate": 1.999998452446429e-05,
"loss": -0.0496,
"reward": 4.462850987911224,
"reward_std": 1.4440096318721771,
"rewards/mrr_reward": 0.19556052424013615,
"rewards/rank_analyze_format_reward": 0.2645931877195835,
"rewards/rank_answer_foramt_reward": 0.689453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.8046875,
"rewards/rank_verify_format_reward": 0.953125,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 520.109375,
"epoch": 0.152,
"grad_norm": 0.01960228569805622,
"kl": 2.5480985641479492e-06,
"learning_rate": 1.9999979787056998e-05,
"loss": -0.0073,
"reward": 4.165937960147858,
"reward_std": 1.3161405473947525,
"rewards/mrr_reward": 0.11880580708384514,
"rewards/rank_analyze_format_reward": 0.3187452331185341,
"rewards/rank_answer_foramt_reward": 0.568359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9682112038135529,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.9682112038135529,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 510.09375,
"epoch": 0.16,
"grad_norm": 0.02011404000222683,
"kl": 5.4389238357543945e-06,
"learning_rate": 1.9999974417996303e-05,
"loss": -0.0173,
"reward": 4.031284391880035,
"reward_std": 1.1271260976791382,
"rewards/mrr_reward": 0.09977678954601288,
"rewards/rank_analyze_format_reward": 0.31458218209445477,
"rewards/rank_answer_foramt_reward": 0.4765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998641312122345,
"rewards/rank_overall_format_reward_more": 0.859375,
"rewards/rank_verify_format_reward": 0.983016312122345,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 491.15625,
"epoch": 0.168,
"grad_norm": 0.020767828449606895,
"kl": 1.1399388313293457e-05,
"learning_rate": 1.9999968417282542e-05,
"loss": 0.0079,
"reward": 4.090369284152985,
"reward_std": 0.8935733437538147,
"rewards/mrr_reward": 0.10092386044561863,
"rewards/rank_analyze_format_reward": 0.18777898885309696,
"rewards/rank_answer_foramt_reward": 0.6328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.995541125535965,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.995541125535965,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 497.125,
"epoch": 0.176,
"grad_norm": 0.020388908684253693,
"kl": 1.4647841453552246e-05,
"learning_rate": 1.99999617849161e-05,
"loss": -0.0264,
"reward": 4.866297721862793,
"reward_std": 1.3940207660198212,
"rewards/mrr_reward": 0.27614088356494904,
"rewards/rank_analyze_format_reward": 0.23634351417422295,
"rewards/rank_answer_foramt_reward": 0.705078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 493.0,
"epoch": 0.184,
"grad_norm": 0.021081620827317238,
"kl": 2.060830593109131e-05,
"learning_rate": 1.9999954520897394e-05,
"loss": -0.0198,
"reward": 4.1482550501823425,
"reward_std": 0.7457377761602402,
"rewards/mrr_reward": 0.09423363115638494,
"rewards/rank_analyze_format_reward": 0.24007043987512589,
"rewards/rank_answer_foramt_reward": 0.640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 1.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 507.25,
"epoch": 0.192,
"grad_norm": 0.021528450772166252,
"kl": 2.8684735298156738e-05,
"learning_rate": 1.999994662522688e-05,
"loss": -0.0459,
"reward": 4.055303335189819,
"reward_std": 1.1271640360355377,
"rewards/mrr_reward": 0.12201760895550251,
"rewards/rank_analyze_format_reward": 0.23910778760910034,
"rewards/rank_answer_foramt_reward": 0.5546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8046875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 514.78125,
"epoch": 0.2,
"grad_norm": 0.020565951243042946,
"kl": 3.625452518463135e-05,
"learning_rate": 1.9999938097905064e-05,
"loss": 0.022,
"reward": 4.250920534133911,
"reward_std": 1.3971717804670334,
"rewards/mrr_reward": 0.1861669160425663,
"rewards/rank_analyze_format_reward": 0.20351847913116217,
"rewards/rank_answer_foramt_reward": 0.638671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.7578125,
"rewards/rank_verify_format_reward": 0.9375,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 525.25,
"epoch": 0.208,
"grad_norm": 0.01966329663991928,
"kl": 3.674626350402832e-05,
"learning_rate": 1.9999928938932473e-05,
"loss": -0.0257,
"reward": 4.075399398803711,
"reward_std": 1.2551968395709991,
"rewards/mrr_reward": 0.10590897873044014,
"rewards/rank_analyze_format_reward": 0.28262292593717575,
"rewards/rank_answer_foramt_reward": 0.564453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.96875,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 477.421875,
"epoch": 0.216,
"grad_norm": 0.02318265102803707,
"kl": 5.188584327697754e-05,
"learning_rate": 1.99999191483097e-05,
"loss": -0.0383,
"reward": 4.07093209028244,
"reward_std": 1.1806218922138214,
"rewards/mrr_reward": 0.13093997910618782,
"rewards/rank_analyze_format_reward": 0.20123931858688593,
"rewards/rank_answer_foramt_reward": 0.498046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.8515625,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 500.625,
"epoch": 0.224,
"grad_norm": 0.02106453664600849,
"kl": 5.2034854888916016e-05,
"learning_rate": 1.999990872603735e-05,
"loss": -0.0142,
"reward": 4.171126127243042,
"reward_std": 0.9384299516677856,
"rewards/mrr_reward": 0.10311880148947239,
"rewards/rank_analyze_format_reward": 0.24095792695879936,
"rewards/rank_answer_foramt_reward": 0.66015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 500.96875,
"epoch": 0.232,
"grad_norm": 0.021827075630426407,
"kl": 6.267428398132324e-05,
"learning_rate": 1.999989767211609e-05,
"loss": -0.0406,
"reward": 4.739075601100922,
"reward_std": 1.122992992401123,
"rewards/mrr_reward": 0.2242001499980688,
"rewards/rank_analyze_format_reward": 0.1782125374302268,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 518.5625,
"epoch": 0.24,
"grad_norm": 0.019971711561083794,
"kl": 6.35683536529541e-05,
"learning_rate": 1.9999885986546613e-05,
"loss": -0.0358,
"reward": 4.3094329833984375,
"reward_std": 0.8498065173625946,
"rewards/mrr_reward": 0.09677579626441002,
"rewards/rank_analyze_format_reward": 0.29673536494374275,
"rewards/rank_answer_foramt_reward": 0.705078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9993206560611725,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9993206560611725,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 520.0,
"epoch": 0.248,
"grad_norm": 0.02139338292181492,
"kl": 0.00010059773921966553,
"learning_rate": 1.999987366932966e-05,
"loss": -0.0341,
"reward": 4.280443549156189,
"reward_std": 1.3128504306077957,
"rewards/mrr_reward": 0.1447172649204731,
"rewards/rank_analyze_format_reward": 0.3246212564408779,
"rewards/rank_answer_foramt_reward": 0.572265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.8359375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 511.296875,
"epoch": 0.256,
"grad_norm": 0.02021283283829689,
"kl": 9.936094284057617e-05,
"learning_rate": 1.9999860720466007e-05,
"loss": -0.0208,
"reward": 4.884114027023315,
"reward_std": 1.141958087682724,
"rewards/mrr_reward": 0.22592385485768318,
"rewards/rank_analyze_format_reward": 0.31244974583387375,
"rewards/rank_answer_foramt_reward": 0.80078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 1.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 500.859375,
"epoch": 0.264,
"grad_norm": 0.02373325079679489,
"kl": 0.00011742115020751953,
"learning_rate": 1.9999847139956477e-05,
"loss": -0.0074,
"reward": 4.154786288738251,
"reward_std": 1.1190759539604187,
"rewards/mrr_reward": 0.15658481419086456,
"rewards/rank_analyze_format_reward": 0.17102508060634136,
"rewards/rank_answer_foramt_reward": 0.552734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 461.6875,
"epoch": 0.272,
"grad_norm": 0.024350695312023163,
"kl": 0.00012856721878051758,
"learning_rate": 1.9999832927801922e-05,
"loss": -0.0463,
"reward": 4.32455313205719,
"reward_std": 1.5562799572944641,
"rewards/mrr_reward": 0.20357143133878708,
"rewards/rank_analyze_format_reward": 0.10987668856978416,
"rewards/rank_answer_foramt_reward": 0.525390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 534.375,
"epoch": 0.28,
"grad_norm": 0.019665928557515144,
"kl": 0.00014293193817138672,
"learning_rate": 1.9999818084003243e-05,
"loss": -0.0164,
"reward": 4.593672394752502,
"reward_std": 1.1871068179607391,
"rewards/mrr_reward": 0.1902901791036129,
"rewards/rank_analyze_format_reward": 0.28130697179585695,
"rewards/rank_answer_foramt_reward": 0.6875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.8828125,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 512.890625,
"epoch": 0.288,
"grad_norm": 0.02193881757557392,
"kl": 0.00016960501670837402,
"learning_rate": 1.999980260856137e-05,
"loss": -0.0276,
"reward": 4.122673153877258,
"reward_std": 0.916993722319603,
"rewards/mrr_reward": 0.11755332630127668,
"rewards/rank_analyze_format_reward": 0.20117305219173431,
"rewards/rank_answer_foramt_reward": 0.59375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 501.9375,
"epoch": 0.296,
"grad_norm": 0.020937107503414154,
"kl": 0.00017371773719787598,
"learning_rate": 1.9999786501477298e-05,
"loss": 0.002,
"reward": 4.0404258370399475,
"reward_std": 1.075580656528473,
"rewards/mrr_reward": 0.10114087350666523,
"rewards/rank_analyze_format_reward": 0.22352311667054892,
"rewards/rank_answer_foramt_reward": 0.619140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9825367629528046,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.9669117629528046,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 518.5625,
"epoch": 0.304,
"grad_norm": 0.022719040513038635,
"kl": 0.00017246603965759277,
"learning_rate": 1.9999769762752024e-05,
"loss": -0.0087,
"reward": 4.427178978919983,
"reward_std": 1.1200510263442993,
"rewards/mrr_reward": 0.14256572909653187,
"rewards/rank_analyze_format_reward": 0.3164584683254361,
"rewards/rank_answer_foramt_reward": 0.646484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977678656578064,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.9821428656578064,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 486.53125,
"epoch": 0.312,
"grad_norm": 0.023954233154654503,
"kl": 0.00021010637283325195,
"learning_rate": 1.999975239238662e-05,
"loss": 0.0227,
"reward": 4.685232400894165,
"reward_std": 1.559970200061798,
"rewards/mrr_reward": 0.2501426115632057,
"rewards/rank_analyze_format_reward": 0.19994227308779955,
"rewards/rank_answer_foramt_reward": 0.599609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9972426444292068,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9816176444292068,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 503.546875,
"epoch": 0.32,
"grad_norm": 0.021838972344994545,
"kl": 0.00023385882377624512,
"learning_rate": 1.999973439038218e-05,
"loss": -0.0279,
"reward": 4.507627367973328,
"reward_std": 1.3758054077625275,
"rewards/mrr_reward": 0.1917472742497921,
"rewards/rank_analyze_format_reward": 0.2892364487051964,
"rewards/rank_answer_foramt_reward": 0.564453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9825367629528046,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 510.25,
"epoch": 0.328,
"grad_norm": 0.02145545743405819,
"kl": 0.00022032856941223145,
"learning_rate": 1.9999715756739833e-05,
"loss": -0.0426,
"reward": 4.766237854957581,
"reward_std": 1.3724510371685028,
"rewards/mrr_reward": 0.24400422349572182,
"rewards/rank_analyze_format_reward": 0.22967395186424255,
"rewards/rank_answer_foramt_reward": 0.677734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 535.625,
"epoch": 0.336,
"grad_norm": 0.02246415615081787,
"kl": 0.0002751350402832031,
"learning_rate": 1.9999696491460764e-05,
"loss": -0.0425,
"reward": 4.64515745639801,
"reward_std": 0.9094655960798264,
"rewards/mrr_reward": 0.16312624514102936,
"rewards/rank_analyze_format_reward": 0.3751567006111145,
"rewards/rank_answer_foramt_reward": 0.744140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.890625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 502.515625,
"epoch": 0.344,
"grad_norm": 0.024739902466535568,
"kl": 0.0003191530704498291,
"learning_rate": 1.9999676594546187e-05,
"loss": -0.038,
"reward": 4.759453654289246,
"reward_std": 1.3180456161499023,
"rewards/mrr_reward": 0.2570870481431484,
"rewards/rank_analyze_format_reward": 0.16787387989461422,
"rewards/rank_answer_foramt_reward": 0.646484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 529.53125,
"epoch": 0.352,
"grad_norm": 0.022288991138339043,
"kl": 0.0002784132957458496,
"learning_rate": 1.999965606599736e-05,
"loss": -0.0421,
"reward": 4.620839357376099,
"reward_std": 1.0206461399793625,
"rewards/mrr_reward": 0.16845857724547386,
"rewards/rank_analyze_format_reward": 0.30247366055846214,
"rewards/rank_answer_foramt_reward": 0.74609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 520.03125,
"epoch": 0.36,
"grad_norm": 0.022286290302872658,
"kl": 0.00043213367462158203,
"learning_rate": 1.999963490581558e-05,
"loss": -0.0056,
"reward": 4.52034318447113,
"reward_std": 0.9805040061473846,
"rewards/mrr_reward": 0.16817336902022362,
"rewards/rank_analyze_format_reward": 0.2824648283421993,
"rewards/rank_answer_foramt_reward": 0.6484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 526.46875,
"epoch": 0.368,
"grad_norm": 0.023201555013656616,
"kl": 0.00047147274017333984,
"learning_rate": 1.9999613114002184e-05,
"loss": -0.0461,
"reward": 4.407416224479675,
"reward_std": 1.2047448754310608,
"rewards/mrr_reward": 0.1641245037317276,
"rewards/rank_analyze_format_reward": 0.3212462067604065,
"rewards/rank_answer_foramt_reward": 0.64453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9667892158031464,
"rewards/rank_overall_format_reward_more": 0.8671875,
"rewards/rank_verify_format_reward": 0.9511642158031464,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 524.46875,
"epoch": 0.376,
"grad_norm": 0.023201555013656616,
"kl": 0.00044339895248413086,
"learning_rate": 1.9999613114002184e-05,
"loss": -0.0536,
"reward": 4.553882956504822,
"reward_std": 1.4308572709560394,
"rewards/mrr_reward": 0.17046131566166878,
"rewards/rank_analyze_format_reward": 0.4204007051885128,
"rewards/rank_answer_foramt_reward": 0.609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.84375,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 510.984375,
"epoch": 0.384,
"grad_norm": 0.02642284519970417,
"kl": 0.00043976306915283203,
"learning_rate": 1.9999590690558545e-05,
"loss": 0.0066,
"reward": 4.590193271636963,
"reward_std": 1.2087296098470688,
"rewards/mrr_reward": 0.20808532275259495,
"rewards/rank_analyze_format_reward": 0.13480500131845474,
"rewards/rank_answer_foramt_reward": 0.755859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.96875,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 539.734375,
"epoch": 0.392,
"grad_norm": 0.0221265759319067,
"kl": 0.00046002864837646484,
"learning_rate": 1.9999567635486086e-05,
"loss": -0.0091,
"reward": 4.26533442735672,
"reward_std": 1.1292133778333664,
"rewards/mrr_reward": 0.1356088798493147,
"rewards/rank_analyze_format_reward": 0.27040086686611176,
"rewards/rank_answer_foramt_reward": 0.568359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967568069696426,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9811318069696426,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 507.84375,
"epoch": 0.4,
"grad_norm": 0.02278602309525013,
"kl": 0.0005121231079101562,
"learning_rate": 1.9999543948786258e-05,
"loss": -0.0093,
"reward": 4.813909411430359,
"reward_std": 1.5017302483320236,
"rewards/mrr_reward": 0.25014261342585087,
"rewards/rank_analyze_format_reward": 0.2544366829097271,
"rewards/rank_answer_foramt_reward": 0.693359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.8984375,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 526.59375,
"epoch": 0.408,
"grad_norm": 0.022586733102798462,
"kl": 0.0003803372383117676,
"learning_rate": 1.9999519630460554e-05,
"loss": -0.0174,
"reward": 4.623661637306213,
"reward_std": 0.6876689344644547,
"rewards/mrr_reward": 0.14153646305203438,
"rewards/rank_analyze_format_reward": 0.31521353125572205,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 535.28125,
"epoch": 0.416,
"grad_norm": 0.022564509883522987,
"kl": 0.0006885528564453125,
"learning_rate": 1.999949468051052e-05,
"loss": -0.0146,
"reward": 4.366745591163635,
"reward_std": 1.1478500664234161,
"rewards/mrr_reward": 0.1415860652923584,
"rewards/rank_analyze_format_reward": 0.39610453229397535,
"rewards/rank_answer_foramt_reward": 0.529296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.875,
"rewards/rank_verify_format_reward": 1.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 523.671875,
"epoch": 0.424,
"grad_norm": 0.02404443360865116,
"kl": 0.0006675124168395996,
"learning_rate": 1.9999469098937726e-05,
"loss": -0.0412,
"reward": 4.3376225233078,
"reward_std": 0.9486726224422455,
"rewards/mrr_reward": 0.10815352387726307,
"rewards/rank_analyze_format_reward": 0.36356932669878006,
"rewards/rank_answer_foramt_reward": 0.630859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 565.125,
"epoch": 0.432,
"grad_norm": 0.024322351440787315,
"kl": 0.000644683837890625,
"learning_rate": 1.9999442885743785e-05,
"loss": -0.0459,
"reward": 4.411644458770752,
"reward_std": 1.2008217573165894,
"rewards/mrr_reward": 0.12810019869357347,
"rewards/rank_analyze_format_reward": 0.3992435559630394,
"rewards/rank_answer_foramt_reward": 0.6484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.96875,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 525.125,
"epoch": 0.44,
"grad_norm": 0.02503729611635208,
"kl": 0.0007516145706176758,
"learning_rate": 1.9999416040930354e-05,
"loss": -0.0562,
"reward": 4.842287182807922,
"reward_std": 1.241357833147049,
"rewards/mrr_reward": 0.20032241940498352,
"rewards/rank_analyze_format_reward": 0.4095212556421757,
"rewards/rank_answer_foramt_reward": 0.736328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9827302694320679,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 539.4375,
"epoch": 0.448,
"grad_norm": 0.02243031933903694,
"kl": 0.0006896257400512695,
"learning_rate": 1.9999388564499135e-05,
"loss": -0.0226,
"reward": 4.513183832168579,
"reward_std": 1.038706436753273,
"rewards/mrr_reward": 0.13224826380610466,
"rewards/rank_analyze_format_reward": 0.35126328840851784,
"rewards/rank_answer_foramt_reward": 0.705078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 534.796875,
"epoch": 0.456,
"grad_norm": 0.026454076170921326,
"kl": 0.0011850595474243164,
"learning_rate": 1.999936045645186e-05,
"loss": 0.0105,
"reward": 4.213799595832825,
"reward_std": 0.925405740737915,
"rewards/mrr_reward": 0.09674479439854622,
"rewards/rank_analyze_format_reward": 0.32846502028405666,
"rewards/rank_answer_foramt_reward": 0.59375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 521.25,
"epoch": 0.464,
"grad_norm": 0.0244609247893095,
"kl": 0.0009926557540893555,
"learning_rate": 1.9999331716790303e-05,
"loss": -0.0587,
"reward": 4.537912011146545,
"reward_std": 1.1836341470479965,
"rewards/mrr_reward": 0.16721230559051037,
"rewards/rank_analyze_format_reward": 0.22257816419005394,
"rewards/rank_answer_foramt_reward": 0.748046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 505.328125,
"epoch": 0.472,
"grad_norm": 0.02554202266037464,
"kl": 0.0009477138519287109,
"learning_rate": 1.9999302345516278e-05,
"loss": -0.04,
"reward": 4.834690093994141,
"reward_std": 1.2422936260700226,
"rewards/mrr_reward": 0.23472222685813904,
"rewards/rank_analyze_format_reward": 0.2765456959605217,
"rewards/rank_answer_foramt_reward": 0.67578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 525.5,
"epoch": 0.48,
"grad_norm": 0.02595827914774418,
"kl": 0.0011942386627197266,
"learning_rate": 1.9999272342631644e-05,
"loss": -0.0433,
"reward": 4.970677137374878,
"reward_std": 1.340297669172287,
"rewards/mrr_reward": 0.24613716453313828,
"rewards/rank_analyze_format_reward": 0.4353472888469696,
"rewards/rank_answer_foramt_reward": 0.65234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 557.796875,
"epoch": 0.488,
"grad_norm": 0.023065784946084023,
"kl": 0.0008342266082763672,
"learning_rate": 1.9999241708138296e-05,
"loss": -0.0182,
"reward": 5.042990684509277,
"reward_std": 1.06068916618824,
"rewards/mrr_reward": 0.21142732724547386,
"rewards/rank_analyze_format_reward": 0.5171288028359413,
"rewards/rank_answer_foramt_reward": 0.720703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 544.265625,
"epoch": 0.496,
"grad_norm": 0.02461962401866913,
"kl": 0.0009695291519165039,
"learning_rate": 1.9999210442038164e-05,
"loss": -0.0215,
"reward": 4.853347659111023,
"reward_std": 0.9557467103004456,
"rewards/mrr_reward": 0.19696180522441864,
"rewards/rank_analyze_format_reward": 0.3876512125134468,
"rewards/rank_answer_foramt_reward": 0.7734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 537.484375,
"epoch": 0.504,
"grad_norm": 0.024909336119890213,
"kl": 0.0018236637115478516,
"learning_rate": 1.9999178544333228e-05,
"loss": -0.0161,
"reward": 4.388993203639984,
"reward_std": 1.0270372480154037,
"rewards/mrr_reward": 0.09973958693444729,
"rewards/rank_analyze_format_reward": 0.47624821215867996,
"rewards/rank_answer_foramt_reward": 0.6640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9140625,
"rewards/rank_verify_format_reward": 0.9522058814764023,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 542.828125,
"epoch": 0.512,
"grad_norm": 0.024513162672519684,
"kl": 0.0010799169540405273,
"learning_rate": 1.9999146015025503e-05,
"loss": -0.0241,
"reward": 4.562963366508484,
"reward_std": 0.8578247427940369,
"rewards/mrr_reward": 0.12247024103999138,
"rewards/rank_analyze_format_reward": 0.3348013088107109,
"rewards/rank_answer_foramt_reward": 0.80078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 1.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 551.046875,
"epoch": 0.52,
"grad_norm": 0.025237975642085075,
"kl": 0.0012438297271728516,
"learning_rate": 1.999911285411704e-05,
"loss": -0.032,
"reward": 4.392790853977203,
"reward_std": 0.9753015786409378,
"rewards/mrr_reward": 0.11674107611179352,
"rewards/rank_analyze_format_reward": 0.36466294899582863,
"rewards/rank_answer_foramt_reward": 0.705078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9827302694320679,
"rewards/rank_overall_format_reward_more": 0.90625,
"rewards/rank_verify_format_reward": 0.9671052694320679,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 541.96875,
"epoch": 0.528,
"grad_norm": 0.025285648182034492,
"kl": 0.0014480352401733398,
"learning_rate": 1.9999079061609933e-05,
"loss": -0.0342,
"reward": 4.65506386756897,
"reward_std": 1.2143934965133667,
"rewards/mrr_reward": 0.16987847164273262,
"rewards/rank_analyze_format_reward": 0.4049289934337139,
"rewards/rank_answer_foramt_reward": 0.634765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 520.1875,
"epoch": 0.536,
"grad_norm": 0.02740044705569744,
"kl": 0.001478433609008789,
"learning_rate": 1.999904463750632e-05,
"loss": -0.0097,
"reward": 4.9612908363342285,
"reward_std": 1.1755748093128204,
"rewards/mrr_reward": 0.24047619476914406,
"rewards/rank_analyze_format_reward": 0.309462770819664,
"rewards/rank_answer_foramt_reward": 0.734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.993399053812027,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.993399053812027,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 561.734375,
"epoch": 0.544,
"grad_norm": 0.026321450248360634,
"kl": 0.001252889633178711,
"learning_rate": 1.999900958180838e-05,
"loss": -0.0504,
"reward": 5.649916648864746,
"reward_std": 1.1421409100294113,
"rewards/mrr_reward": 0.3350074402987957,
"rewards/rank_analyze_format_reward": 0.4505118057131767,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 559.59375,
"epoch": 0.552,
"grad_norm": 0.02405831590294838,
"kl": 0.0014369487762451172,
"learning_rate": 1.9998973894518318e-05,
"loss": 0.0015,
"reward": 4.788713574409485,
"reward_std": 0.9084204286336899,
"rewards/mrr_reward": 0.1370349731296301,
"rewards/rank_analyze_format_reward": 0.5101048266515136,
"rewards/rank_answer_foramt_reward": 0.81640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.984375,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 526.03125,
"epoch": 0.56,
"grad_norm": 0.0266768429428339,
"kl": 0.001832723617553711,
"learning_rate": 1.999893757563839e-05,
"loss": -0.0401,
"reward": 4.823967456817627,
"reward_std": 1.1708943247795105,
"rewards/mrr_reward": 0.20266617462038994,
"rewards/rank_analyze_format_reward": 0.36864544451236725,
"rewards/rank_answer_foramt_reward": 0.705078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9971333742141724,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9815083742141724,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 583.609375,
"epoch": 0.568,
"grad_norm": 0.025261225178837776,
"kl": 0.001687765121459961,
"learning_rate": 1.9998900625170897e-05,
"loss": -0.0282,
"reward": 5.068133354187012,
"reward_std": 1.3164568394422531,
"rewards/mrr_reward": 0.24970858544111252,
"rewards/rank_analyze_format_reward": 0.37027864158153534,
"rewards/rank_answer_foramt_reward": 0.74609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959945678710938,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9959945678710938,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 579.515625,
"epoch": 0.576,
"grad_norm": 0.024856774136424065,
"kl": 0.0020842552185058594,
"learning_rate": 1.9998863043118163e-05,
"loss": -0.0296,
"reward": 4.706835389137268,
"reward_std": 0.9307773113250732,
"rewards/mrr_reward": 0.12280506081879139,
"rewards/rank_analyze_format_reward": 0.5460995584726334,
"rewards/rank_answer_foramt_reward": 0.70703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968671798706055,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9968671798706055,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 534.359375,
"epoch": 0.584,
"grad_norm": 0.025794433429837227,
"kl": 0.002407550811767578,
"learning_rate": 1.999882482948257e-05,
"loss": -0.0073,
"reward": 4.6565152406692505,
"reward_std": 0.9558501094579697,
"rewards/mrr_reward": 0.15843874588608742,
"rewards/rank_analyze_format_reward": 0.38018228113651276,
"rewards/rank_answer_foramt_reward": 0.759765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 545.03125,
"epoch": 0.592,
"grad_norm": 0.025461561977863312,
"kl": 0.0017633438110351562,
"learning_rate": 1.999878598426653e-05,
"loss": -0.0305,
"reward": 5.3728920221328735,
"reward_std": 0.9629544615745544,
"rewards/mrr_reward": 0.28620412945747375,
"rewards/rank_analyze_format_reward": 0.46884432435035706,
"rewards/rank_answer_foramt_reward": 0.78125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9968030601739883,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9968030601739883,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 508.765625,
"epoch": 0.6,
"grad_norm": 0.03176024928689003,
"kl": 0.002418041229248047,
"learning_rate": 1.9998746507472493e-05,
"loss": -0.046,
"reward": 4.1894320249557495,
"reward_std": 0.9330323338508606,
"rewards/mrr_reward": 0.08410218358039856,
"rewards/rank_analyze_format_reward": 0.287723608314991,
"rewards/rank_answer_foramt_reward": 0.626953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9808920323848724,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9808920323848724,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 543.21875,
"epoch": 0.608,
"grad_norm": 0.0266107227653265,
"kl": 0.0020155906677246094,
"learning_rate": 1.999870639910296e-05,
"loss": -0.0428,
"reward": 4.562186181545258,
"reward_std": 0.9480538219213486,
"rewards/mrr_reward": 0.13458581641316414,
"rewards/rank_analyze_format_reward": 0.4180658236145973,
"rewards/rank_answer_foramt_reward": 0.669921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 557.546875,
"epoch": 0.616,
"grad_norm": 0.027047261595726013,
"kl": 0.002165555953979492,
"learning_rate": 1.9998665659160453e-05,
"loss": 0.003,
"reward": 5.2997212409973145,
"reward_std": 1.2724156975746155,
"rewards/mrr_reward": 0.2895585522055626,
"rewards/rank_analyze_format_reward": 0.3944082595407963,
"rewards/rank_answer_foramt_reward": 0.771484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9917034357786179,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9917034357786179,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 539.265625,
"epoch": 0.624,
"grad_norm": 0.026604430750012398,
"kl": 0.0022330284118652344,
"learning_rate": 1.999862428764756e-05,
"loss": -0.051,
"reward": 4.927618980407715,
"reward_std": 1.0934423208236694,
"rewards/mrr_reward": 0.18596850894391537,
"rewards/rank_analyze_format_reward": 0.4122604951262474,
"rewards/rank_answer_foramt_reward": 0.794921875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 553.671875,
"epoch": 0.632,
"grad_norm": 0.025129646062850952,
"kl": 0.001984834671020508,
"learning_rate": 1.9998582284566878e-05,
"loss": -0.0399,
"reward": 4.638971567153931,
"reward_std": 1.0905082076787949,
"rewards/mrr_reward": 0.13253968209028244,
"rewards/rank_analyze_format_reward": 0.4443269595503807,
"rewards/rank_answer_foramt_reward": 0.68359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 596.5625,
"epoch": 0.64,
"grad_norm": 0.025474058464169502,
"kl": 0.0023970603942871094,
"learning_rate": 1.999853964992107e-05,
"loss": -0.0208,
"reward": 5.141489744186401,
"reward_std": 0.9898062199354172,
"rewards/mrr_reward": 0.19254092685878277,
"rewards/rank_analyze_format_reward": 0.6555207520723343,
"rewards/rank_answer_foramt_reward": 0.787109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9955979138612747,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9799729138612747,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 557.5625,
"epoch": 0.648,
"grad_norm": 0.029823826625943184,
"kl": 0.002605438232421875,
"learning_rate": 1.9998496383712828e-05,
"loss": -0.0459,
"reward": 4.708982348442078,
"reward_std": 0.832055389881134,
"rewards/mrr_reward": 0.11339906044304371,
"rewards/rank_analyze_format_reward": 0.5364454686641693,
"rewards/rank_answer_foramt_reward": 0.75390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9903296828269958,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9903296828269958,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 567.6875,
"epoch": 0.656,
"grad_norm": 0.02752668969333172,
"kl": 0.002631664276123047,
"learning_rate": 1.999845248594489e-05,
"loss": -0.0398,
"reward": 4.8155412673950195,
"reward_std": 0.9520252794027328,
"rewards/mrr_reward": 0.14484747499227524,
"rewards/rank_analyze_format_reward": 0.5442800670862198,
"rewards/rank_answer_foramt_reward": 0.724609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 573.71875,
"epoch": 0.664,
"grad_norm": 0.028707344084978104,
"kl": 0.002571582794189453,
"learning_rate": 1.9998407956620017e-05,
"loss": -0.0306,
"reward": 5.175315976142883,
"reward_std": 1.2197599858045578,
"rewards/mrr_reward": 0.24100322648882866,
"rewards/rank_analyze_format_reward": 0.45521388202905655,
"rewards/rank_answer_foramt_reward": 0.775390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 557.078125,
"epoch": 0.672,
"grad_norm": 0.02770584635436535,
"kl": 0.003272533416748047,
"learning_rate": 1.9998362795741027e-05,
"loss": 0.0127,
"reward": 4.81751024723053,
"reward_std": 1.0414631068706512,
"rewards/mrr_reward": 0.18727059103548527,
"rewards/rank_analyze_format_reward": 0.4546085884794593,
"rewards/rank_answer_foramt_reward": 0.681640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9817143976688385,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9817143976688385,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 562.15625,
"epoch": 0.68,
"grad_norm": 0.029665743932127953,
"kl": 0.002528667449951172,
"learning_rate": 1.9998317003310775e-05,
"loss": -0.0482,
"reward": 4.359201908111572,
"reward_std": 0.8430032134056091,
"rewards/mrr_reward": 0.08872148208320141,
"rewards/rank_analyze_format_reward": 0.400491826236248,
"rewards/rank_answer_foramt_reward": 0.66796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9679276347160339,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 574.3125,
"epoch": 0.688,
"grad_norm": 0.029378948733210564,
"kl": 0.0034728050231933594,
"learning_rate": 1.9998270579332154e-05,
"loss": 0.0053,
"reward": 5.317029237747192,
"reward_std": 1.1373002529144287,
"rewards/mrr_reward": 0.2627728134393692,
"rewards/rank_analyze_format_reward": 0.5021510571241379,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 579.984375,
"epoch": 0.696,
"grad_norm": 0.027385709807276726,
"kl": 0.0028448104858398438,
"learning_rate": 1.9998223523808092e-05,
"loss": -0.0373,
"reward": 5.256059646606445,
"reward_std": 0.8636089265346527,
"rewards/mrr_reward": 0.21436012163758278,
"rewards/rank_analyze_format_reward": 0.5581663772463799,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.013700738549232483,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 569.390625,
"epoch": 0.704,
"grad_norm": 0.02710539661347866,
"kl": 0.0027756690979003906,
"learning_rate": 1.9998175836741564e-05,
"loss": -0.0384,
"reward": 4.694380164146423,
"reward_std": 0.9186579138040543,
"rewards/mrr_reward": 0.10895957797765732,
"rewards/rank_analyze_format_reward": 0.5749479159712791,
"rewards/rank_answer_foramt_reward": 0.74609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 579.078125,
"epoch": 0.712,
"grad_norm": 0.027813483029603958,
"kl": 0.0035161972045898438,
"learning_rate": 1.999812751813558e-05,
"loss": -0.0233,
"reward": 4.924348711967468,
"reward_std": 1.096758782863617,
"rewards/mrr_reward": 0.17403894662857056,
"rewards/rank_analyze_format_reward": 0.5381231904029846,
"rewards/rank_answer_foramt_reward": 0.779296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 597.78125,
"epoch": 0.72,
"grad_norm": 0.026623884215950966,
"kl": 0.003002643585205078,
"learning_rate": 1.9998078567993197e-05,
"loss": -0.0256,
"reward": 5.420621871948242,
"reward_std": 1.2407971769571304,
"rewards/mrr_reward": 0.2805493548512459,
"rewards/rank_analyze_format_reward": 0.5728173106908798,
"rewards/rank_answer_foramt_reward": 0.76171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9936629235744476,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9936629235744476,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 585.40625,
"epoch": 0.728,
"grad_norm": 0.027379710227251053,
"kl": 0.0037784576416015625,
"learning_rate": 1.9998028986317504e-05,
"loss": -0.0222,
"reward": 5.071754574775696,
"reward_std": 1.0615117102861404,
"rewards/mrr_reward": 0.21678448282182217,
"rewards/rank_analyze_format_reward": 0.39732154086232185,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9954443722963333,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9798193722963333,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 596.796875,
"epoch": 0.736,
"grad_norm": 0.028991688042879105,
"kl": 0.0032110214233398438,
"learning_rate": 1.999797877311163e-05,
"loss": -0.0177,
"reward": 5.152738690376282,
"reward_std": 1.0150837451219559,
"rewards/mrr_reward": 0.1862909272313118,
"rewards/rank_analyze_format_reward": 0.6470415517687798,
"rewards/rank_answer_foramt_reward": 0.78515625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9955011606216431,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9955011606216431,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 554.484375,
"epoch": 0.744,
"grad_norm": 0.028571411967277527,
"kl": 0.004134178161621094,
"learning_rate": 1.9997927928378753e-05,
"loss": -0.0372,
"reward": 4.580929517745972,
"reward_std": 0.7480403929948807,
"rewards/mrr_reward": 0.11786334402859211,
"rewards/rank_analyze_format_reward": 0.36851008981466293,
"rewards/rank_answer_foramt_reward": 0.76171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 591.625,
"epoch": 0.752,
"grad_norm": 0.02991093136370182,
"kl": 0.0034475326538085938,
"learning_rate": 1.999787645212208e-05,
"loss": -0.0337,
"reward": 4.8702027797698975,
"reward_std": 0.8430802449584007,
"rewards/mrr_reward": 0.13307911716401577,
"rewards/rank_analyze_format_reward": 0.6129686385393143,
"rewards/rank_answer_foramt_reward": 0.734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 590.765625,
"epoch": 0.76,
"grad_norm": 0.03038324974477291,
"kl": 0.00400543212890625,
"learning_rate": 1.999782434434486e-05,
"loss": -0.0202,
"reward": 5.1996424198150635,
"reward_std": 0.8570089638233185,
"rewards/mrr_reward": 0.20330480858683586,
"rewards/rank_analyze_format_reward": 0.5464644953608513,
"rewards/rank_answer_foramt_reward": 0.857421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 609.40625,
"epoch": 0.768,
"grad_norm": 0.028416253626346588,
"kl": 0.004006385803222656,
"learning_rate": 1.999777160505039e-05,
"loss": -0.0094,
"reward": 5.031667947769165,
"reward_std": 0.6344560533761978,
"rewards/mrr_reward": 0.14536210522055626,
"rewards/rank_analyze_format_reward": 0.6181881725788116,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.96875,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 585.09375,
"epoch": 0.776,
"grad_norm": 0.0321107916533947,
"kl": 0.0050525665283203125,
"learning_rate": 1.9997718234242e-05,
"loss": -0.0221,
"reward": 5.048359394073486,
"reward_std": 1.0561828166246414,
"rewards/mrr_reward": 0.19980159029364586,
"rewards/rank_analyze_format_reward": 0.47314807027578354,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9915180057287216,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9758930057287216,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 596.359375,
"epoch": 0.784,
"grad_norm": 0.03290560096502304,
"kl": 0.004299163818359375,
"learning_rate": 1.999766423192306e-05,
"loss": -0.0549,
"reward": 5.590569615364075,
"reward_std": 0.9068724364042282,
"rewards/mrr_reward": 0.2803075537085533,
"rewards/rank_analyze_format_reward": 0.6597586870193481,
"rewards/rank_answer_foramt_reward": 0.798828125,
"rewards/rank_contrast_format_reward": 0.012397300451993942,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 631.375,
"epoch": 0.792,
"grad_norm": 0.028949948027729988,
"kl": 0.0046710968017578125,
"learning_rate": 1.9997609598096982e-05,
"loss": -0.017,
"reward": 4.953081727027893,
"reward_std": 0.8293813019990921,
"rewards/mrr_reward": 0.14769965037703514,
"rewards/rank_analyze_format_reward": 0.6502636969089508,
"rewards/rank_answer_foramt_reward": 0.763671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9976112246513367,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9976112246513367,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 605.71875,
"epoch": 0.8,
"grad_norm": 0.028032353147864342,
"kl": 0.004805564880371094,
"learning_rate": 1.9997554332767214e-05,
"loss": -0.0352,
"reward": 5.1254483461380005,
"reward_std": 0.728003740310669,
"rewards/mrr_reward": 0.14519469253718853,
"rewards/rank_analyze_format_reward": 0.7232527583837509,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985989332199097,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9985989332199097,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 616.296875,
"epoch": 0.808,
"grad_norm": 0.028648706153035164,
"kl": 0.004345893859863281,
"learning_rate": 1.9997498435937254e-05,
"loss": 0.0106,
"reward": 5.0007301568984985,
"reward_std": 0.9343436509370804,
"rewards/mrr_reward": 0.17731894738972187,
"rewards/rank_analyze_format_reward": 0.5347120687365532,
"rewards/rank_answer_foramt_reward": 0.791015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9945820420980453,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9945820420980453,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 606.828125,
"epoch": 0.816,
"grad_norm": 0.02675345167517662,
"kl": 0.004836559295654297,
"learning_rate": 1.9997441907610624e-05,
"loss": -0.0189,
"reward": 5.016782879829407,
"reward_std": 0.9857280552387238,
"rewards/mrr_reward": 0.17276166006922722,
"rewards/rank_analyze_format_reward": 0.5417740494012833,
"rewards/rank_answer_foramt_reward": 0.791015625,
"rewards/rank_contrast_format_reward": 0.010216346010565758,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 630.0625,
"epoch": 0.824,
"grad_norm": 0.027744626626372337,
"kl": 0.004809379577636719,
"learning_rate": 1.9997384747790903e-05,
"loss": -0.0061,
"reward": 5.06945013999939,
"reward_std": 1.0500756949186325,
"rewards/mrr_reward": 0.19982018508017063,
"rewards/rank_analyze_format_reward": 0.5732885971665382,
"rewards/rank_answer_foramt_reward": 0.76171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9792998284101486,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9792998284101486,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 567.90625,
"epoch": 0.832,
"grad_norm": 0.03045082278549671,
"kl": 0.00579833984375,
"learning_rate": 1.9997326956481693e-05,
"loss": -0.0106,
"reward": 4.810132622718811,
"reward_std": 0.8808675408363342,
"rewards/mrr_reward": 0.14889632910490036,
"rewards/rank_analyze_format_reward": 0.48346175998449326,
"rewards/rank_answer_foramt_reward": 0.7578125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 635.234375,
"epoch": 0.84,
"grad_norm": 0.030195703729987144,
"kl": 0.00594329833984375,
"learning_rate": 1.999726853368665e-05,
"loss": -0.0475,
"reward": 5.089649319648743,
"reward_std": 0.8489270955324173,
"rewards/mrr_reward": 0.1736421212553978,
"rewards/rank_analyze_format_reward": 0.5801311060786247,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9963420033454895,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9963420033454895,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 638.328125,
"epoch": 0.848,
"grad_norm": 0.03000379353761673,
"kl": 0.005078315734863281,
"learning_rate": 1.9997209479409464e-05,
"loss": 0.0322,
"reward": 5.540584683418274,
"reward_std": 0.9829646348953247,
"rewards/mrr_reward": 0.2296379003673792,
"rewards/rank_analyze_format_reward": 0.794389545917511,
"rewards/rank_answer_foramt_reward": 0.828125,
"rewards/rank_contrast_format_reward": 0.015083539299666882,
"rewards/rank_initial_format_reward": 0.9961237162351608,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9961237162351608,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 634.65625,
"epoch": 0.856,
"grad_norm": 0.028458530083298683,
"kl": 0.005078315734863281,
"learning_rate": 1.9997149793653862e-05,
"loss": -0.0095,
"reward": 4.97307014465332,
"reward_std": 0.578310415148735,
"rewards/mrr_reward": 0.1025235615670681,
"rewards/rank_analyze_format_reward": 0.713058277964592,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 647.03125,
"epoch": 0.864,
"grad_norm": 0.02653643861413002,
"kl": 0.006175994873046875,
"learning_rate": 1.9997089476423617e-05,
"loss": -0.0059,
"reward": 4.949914813041687,
"reward_std": 0.7462972551584244,
"rewards/mrr_reward": 0.11983507312834263,
"rewards/rank_analyze_format_reward": 0.7087408900260925,
"rewards/rank_answer_foramt_reward": 0.818359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 616.09375,
"epoch": 0.872,
"grad_norm": 0.028843272477388382,
"kl": 0.0054416656494140625,
"learning_rate": 1.999702852772254e-05,
"loss": -0.0267,
"reward": 5.561126232147217,
"reward_std": 0.9516362547874451,
"rewards/mrr_reward": 0.2666604742407799,
"rewards/rank_analyze_format_reward": 0.7099277526140213,
"rewards/rank_answer_foramt_reward": 0.806640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967704266309738,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9967704266309738,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 640.828125,
"epoch": 0.88,
"grad_norm": 0.02726060152053833,
"kl": 0.005953788757324219,
"learning_rate": 1.9996966947554476e-05,
"loss": -0.0389,
"reward": 5.334239721298218,
"reward_std": 1.061211720108986,
"rewards/mrr_reward": 0.22806919366121292,
"rewards/rank_analyze_format_reward": 0.7086126804351807,
"rewards/rank_answer_foramt_reward": 0.798828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9806985259056091,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9806985259056091,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 648.421875,
"epoch": 0.888,
"grad_norm": 0.02812885120511055,
"kl": 0.005530357360839844,
"learning_rate": 1.9996904735923325e-05,
"loss": -0.0271,
"reward": 5.212171792984009,
"reward_std": 1.2261153161525726,
"rewards/mrr_reward": 0.19998760521411896,
"rewards/rank_analyze_format_reward": 0.7305806577205658,
"rewards/rank_answer_foramt_reward": 0.798828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.953125,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 621.640625,
"epoch": 0.896,
"grad_norm": 0.030628954991698265,
"kl": 0.005995750427246094,
"learning_rate": 1.9996841892833e-05,
"loss": -0.0294,
"reward": 4.731189131736755,
"reward_std": 0.8310668021440506,
"rewards/mrr_reward": 0.10221974551677704,
"rewards/rank_analyze_format_reward": 0.6598925739526749,
"rewards/rank_answer_foramt_reward": 0.75,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9296875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 625.71875,
"epoch": 0.904,
"grad_norm": 0.02799079939723015,
"kl": 0.006671905517578125,
"learning_rate": 1.9996778418287486e-05,
"loss": -0.0042,
"reward": 4.8320887088775635,
"reward_std": 0.781333327293396,
"rewards/mrr_reward": 0.12659970112144947,
"rewards/rank_analyze_format_reward": 0.6272406578063965,
"rewards/rank_answer_foramt_reward": 0.771484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9791073650121689,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9791073650121689,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 656.5625,
"epoch": 0.912,
"grad_norm": 0.028244731947779655,
"kl": 0.0065898895263671875,
"learning_rate": 1.9996714312290784e-05,
"loss": -0.0069,
"reward": 5.762642502784729,
"reward_std": 1.0326203405857086,
"rewards/mrr_reward": 0.292788939550519,
"rewards/rank_analyze_format_reward": 0.7934152334928513,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.996692106127739,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.996692106127739,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 638.53125,
"epoch": 0.92,
"grad_norm": 0.03135911375284195,
"kl": 0.006131172180175781,
"learning_rate": 1.9996649574846948e-05,
"loss": 0.0157,
"reward": 5.772169351577759,
"reward_std": 0.9267353266477585,
"rewards/mrr_reward": 0.30804190039634705,
"rewards/rank_analyze_format_reward": 0.7891548573970795,
"rewards/rank_answer_foramt_reward": 0.84375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965170323848724,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9652670323848724,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 613.453125,
"epoch": 0.928,
"grad_norm": 0.028077326714992523,
"kl": 0.005988121032714844,
"learning_rate": 1.9996584205960063e-05,
"loss": -0.0113,
"reward": 5.589708924293518,
"reward_std": 1.1055090427398682,
"rewards/mrr_reward": 0.2775483652949333,
"rewards/rank_analyze_format_reward": 0.6549882963299751,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 639.6875,
"epoch": 0.936,
"grad_norm": 0.028432337567210197,
"kl": 0.006014823913574219,
"learning_rate": 1.999651820563426e-05,
"loss": -0.0167,
"reward": 5.5568296909332275,
"reward_std": 1.1293076276779175,
"rewards/mrr_reward": 0.25022321194410324,
"rewards/rank_analyze_format_reward": 0.7510844320058823,
"rewards/rank_answer_foramt_reward": 0.84375,
"rewards/rank_contrast_format_reward": 0.009815705008804798,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 617.359375,
"epoch": 0.944,
"grad_norm": 0.02918749675154686,
"kl": 0.007138252258300781,
"learning_rate": 1.999645157387371e-05,
"loss": -0.0352,
"reward": 5.3101993799209595,
"reward_std": 0.9095352292060852,
"rewards/mrr_reward": 0.18025793880224228,
"rewards/rank_analyze_format_reward": 0.7623788416385651,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 664.078125,
"epoch": 0.952,
"grad_norm": 0.029070457443594933,
"kl": 0.00724029541015625,
"learning_rate": 1.9996384310682615e-05,
"loss": -0.0279,
"reward": 5.085901737213135,
"reward_std": 0.9678252041339874,
"rewards/mrr_reward": 0.18282490503042936,
"rewards/rank_analyze_format_reward": 0.6765593886375427,
"rewards/rank_answer_foramt_reward": 0.75,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 654.546875,
"epoch": 0.96,
"grad_norm": 0.030889704823493958,
"kl": 0.0070590972900390625,
"learning_rate": 1.999631641606523e-05,
"loss": 0.0058,
"reward": 5.475605249404907,
"reward_std": 1.2632475644350052,
"rewards/mrr_reward": 0.2467137910425663,
"rewards/rank_analyze_format_reward": 0.7685735672712326,
"rewards/rank_answer_foramt_reward": 0.77734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9948538690805435,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9792288690805435,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 682.0625,
"epoch": 0.968,
"grad_norm": 0.0283079631626606,
"kl": 0.0067138671875,
"learning_rate": 1.9996247890025845e-05,
"loss": 0.0112,
"reward": 5.4859858751297,
"reward_std": 0.9399373084306717,
"rewards/mrr_reward": 0.22347470000386238,
"rewards/rank_analyze_format_reward": 0.7617006599903107,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 631.34375,
"epoch": 0.976,
"grad_norm": 0.028440656140446663,
"kl": 0.006336212158203125,
"learning_rate": 1.9996178732568784e-05,
"loss": -0.0263,
"reward": 5.595025658607483,
"reward_std": 1.2384063154459,
"rewards/mrr_reward": 0.2738715261220932,
"rewards/rank_analyze_format_reward": 0.7087783962488174,
"rewards/rank_answer_foramt_reward": 0.814453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959664940834045,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9959664940834045,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 610.578125,
"epoch": 0.984,
"grad_norm": 0.031218891963362694,
"kl": 0.006499290466308594,
"learning_rate": 1.9996108943698412e-05,
"loss": -0.0205,
"reward": 5.309447526931763,
"reward_std": 0.8753966242074966,
"rewards/mrr_reward": 0.22593005746603012,
"rewards/rank_analyze_format_reward": 0.5845683068037033,
"rewards/rank_answer_foramt_reward": 0.8359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965170323848724,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9965170323848724,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 679.9375,
"epoch": 0.992,
"grad_norm": 0.025655120611190796,
"kl": 0.006500244140625,
"learning_rate": 1.9996038523419148e-05,
"loss": 0.0137,
"reward": 5.34429144859314,
"reward_std": 0.6560061201453209,
"rewards/mrr_reward": 0.16470114514231682,
"rewards/rank_analyze_format_reward": 0.8025594502687454,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 621.8125,
"epoch": 1.0,
"grad_norm": 0.02733149379491806,
"kl": 0.006840705871582031,
"learning_rate": 1.9995967471735433e-05,
"loss": -0.0051,
"reward": 5.065644264221191,
"reward_std": 0.8055929243564606,
"rewards/mrr_reward": 0.14942336827516556,
"rewards/rank_analyze_format_reward": 0.6398258581757545,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 643.5,
"epoch": 1.008,
"grad_norm": 0.028533369302749634,
"kl": 0.0065708160400390625,
"learning_rate": 1.9995895788651753e-05,
"loss": -0.0093,
"reward": 5.7667927742004395,
"reward_std": 1.0514316856861115,
"rewards/mrr_reward": 0.3164868615567684,
"rewards/rank_analyze_format_reward": 0.6641343683004379,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 648.609375,
"epoch": 1.016,
"grad_norm": 0.029412733390927315,
"kl": 0.0068683624267578125,
"learning_rate": 1.9995823474172644e-05,
"loss": -0.0102,
"reward": 5.423908352851868,
"reward_std": 0.9281527996063232,
"rewards/mrr_reward": 0.20751488581299782,
"rewards/rank_analyze_format_reward": 0.7598643451929092,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 640.015625,
"epoch": 1.024,
"grad_norm": 0.030668683350086212,
"kl": 0.008924484252929688,
"learning_rate": 1.9995750528302668e-05,
"loss": -0.0259,
"reward": 5.635333180427551,
"reward_std": 0.6666858419775963,
"rewards/mrr_reward": 0.2542472630739212,
"rewards/rank_analyze_format_reward": 0.7405444979667664,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965170323848724,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9965170323848724,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 650.09375,
"epoch": 1.032,
"grad_norm": 0.029225200414657593,
"kl": 0.0071849822998046875,
"learning_rate": 1.999567695104643e-05,
"loss": 0.002,
"reward": 5.3161762952804565,
"reward_std": 0.9796330630779266,
"rewards/mrr_reward": 0.21669147536158562,
"rewards/rank_analyze_format_reward": 0.7243717163801193,
"rewards/rank_answer_foramt_reward": 0.7890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9948723018169403,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9801664054393768,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 637.6875,
"epoch": 1.04,
"grad_norm": 0.03244978189468384,
"kl": 0.006844520568847656,
"learning_rate": 1.9995602742408584e-05,
"loss": -0.0215,
"reward": 5.631897449493408,
"reward_std": 1.2725486308336258,
"rewards/mrr_reward": 0.2755270190536976,
"rewards/rank_analyze_format_reward": 0.7383408695459366,
"rewards/rank_answer_foramt_reward": 0.80078125,
"rewards/rank_contrast_format_reward": 0.01306460052728653,
"rewards/rank_initial_format_reward": 0.9966137707233429,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9966137707233429,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 669.21875,
"epoch": 1.048,
"grad_norm": 0.02654576487839222,
"kl": 0.007208824157714844,
"learning_rate": 1.9995527902393814e-05,
"loss": -0.0166,
"reward": 5.244002819061279,
"reward_std": 0.7309348955750465,
"rewards/mrr_reward": 0.16037946939468384,
"rewards/rank_analyze_format_reward": 0.7431098967790604,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 653.578125,
"epoch": 1.056,
"grad_norm": 0.029960671439766884,
"kl": 0.0071048736572265625,
"learning_rate": 1.9995452431006844e-05,
"loss": -0.0318,
"reward": 5.312576532363892,
"reward_std": 1.0043585747480392,
"rewards/mrr_reward": 0.1904141791164875,
"rewards/rank_analyze_format_reward": 0.7370295971632004,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967888593673706,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9967888593673706,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 602.296875,
"epoch": 1.064,
"grad_norm": 0.034355491399765015,
"kl": 0.008157730102539062,
"learning_rate": 1.999537632825245e-05,
"loss": -0.0341,
"reward": 5.146085500717163,
"reward_std": 0.7237976565957069,
"rewards/mrr_reward": 0.15810392051935196,
"rewards/rank_analyze_format_reward": 0.6832832396030426,
"rewards/rank_answer_foramt_reward": 0.84765625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 674.625,
"epoch": 1.072,
"grad_norm": 0.02834927663207054,
"kl": 0.006766319274902344,
"learning_rate": 1.9995299594135434e-05,
"loss": -0.0002,
"reward": 5.701120138168335,
"reward_std": 0.5981364026665688,
"rewards/mrr_reward": 0.24760044924914837,
"rewards/rank_analyze_format_reward": 0.7354921996593475,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 659.953125,
"epoch": 1.08,
"grad_norm": 0.030475998297333717,
"kl": 0.009418487548828125,
"learning_rate": 1.999522222866064e-05,
"loss": -0.0179,
"reward": 5.579203009605408,
"reward_std": 1.1411446928977966,
"rewards/mrr_reward": 0.24535591155290604,
"rewards/rank_analyze_format_reward": 0.8200124651193619,
"rewards/rank_answer_foramt_reward": 0.8046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 664.5625,
"epoch": 1.088,
"grad_norm": 0.02837698720395565,
"kl": 0.007771492004394531,
"learning_rate": 1.999514423183296e-05,
"loss": -0.022,
"reward": 5.316519737243652,
"reward_std": 0.48504022508859634,
"rewards/mrr_reward": 0.15651662088930607,
"rewards/rank_analyze_format_reward": 0.7586470544338226,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.006024893838912249,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 640.40625,
"epoch": 1.096,
"grad_norm": 0.03035576269030571,
"kl": 0.009632110595703125,
"learning_rate": 1.9995065603657317e-05,
"loss": -0.0195,
"reward": 5.066559195518494,
"reward_std": 0.7069189697504044,
"rewards/mrr_reward": 0.1323784776031971,
"rewards/rank_analyze_format_reward": 0.7358406782150269,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 672.09375,
"epoch": 1.104,
"grad_norm": 0.02903159335255623,
"kl": 0.007931709289550781,
"learning_rate": 1.999498634413868e-05,
"loss": -0.0238,
"reward": 5.2656556367874146,
"reward_std": 0.7046982049942017,
"rewards/mrr_reward": 0.17012028582394123,
"rewards/rank_analyze_format_reward": 0.7452157586812973,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 645.5625,
"epoch": 1.112,
"grad_norm": 0.029243705794215202,
"kl": 0.007679939270019531,
"learning_rate": 1.9994906453282055e-05,
"loss": -0.0086,
"reward": 5.43647313117981,
"reward_std": 0.6669348478317261,
"rewards/mrr_reward": 0.19714161939918995,
"rewards/rank_analyze_format_reward": 0.7475159168243408,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 652.046875,
"epoch": 1.12,
"grad_norm": 0.030478352680802345,
"kl": 0.008630752563476562,
"learning_rate": 1.9994825931092486e-05,
"loss": -0.034,
"reward": 5.387316823005676,
"reward_std": 0.8800464794039726,
"rewards/mrr_reward": 0.20342883095145226,
"rewards/rank_analyze_format_reward": 0.6943867355585098,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 654.4375,
"epoch": 1.1280000000000001,
"grad_norm": 0.029138056561350822,
"kl": 0.00916290283203125,
"learning_rate": 1.9994744777575064e-05,
"loss": 0.0121,
"reward": 5.340026617050171,
"reward_std": 1.1701116859912872,
"rewards/mrr_reward": 0.21713170036673546,
"rewards/rank_analyze_format_reward": 0.7348632365465164,
"rewards/rank_answer_foramt_reward": 0.81640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 649.765625,
"epoch": 1.1360000000000001,
"grad_norm": 0.027982011437416077,
"kl": 0.007659912109375,
"learning_rate": 1.999466299273491e-05,
"loss": -0.0359,
"reward": 5.487318634986877,
"reward_std": 0.7155840322375298,
"rewards/mrr_reward": 0.19176588580012321,
"rewards/rank_analyze_format_reward": 0.780378520488739,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 666.28125,
"epoch": 1.144,
"grad_norm": 0.030734730884432793,
"kl": 0.007786750793457031,
"learning_rate": 1.9994580576577193e-05,
"loss": 0.0205,
"reward": 5.202921390533447,
"reward_std": 1.0715601295232773,
"rewards/mrr_reward": 0.17847222834825516,
"rewards/rank_analyze_format_reward": 0.7116889655590057,
"rewards/rank_answer_foramt_reward": 0.87109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.96875,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 650.546875,
"epoch": 1.152,
"grad_norm": 0.031996339559555054,
"kl": 0.008441925048828125,
"learning_rate": 1.9994497529107118e-05,
"loss": 0.0216,
"reward": 5.737574934959412,
"reward_std": 1.1086364686489105,
"rewards/mrr_reward": 0.30818453058600426,
"rewards/rank_analyze_format_reward": 0.7176125943660736,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 673.328125,
"epoch": 1.16,
"grad_norm": 0.03075815737247467,
"kl": 0.008372306823730469,
"learning_rate": 1.999441385032993e-05,
"loss": 0.0058,
"reward": 5.309332966804504,
"reward_std": 1.2192674428224564,
"rewards/mrr_reward": 0.19598214887082577,
"rewards/rank_analyze_format_reward": 0.820211187005043,
"rewards/rank_answer_foramt_reward": 0.81640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9522058814764023,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 647.265625,
"epoch": 1.168,
"grad_norm": 0.029912114143371582,
"kl": 0.008427619934082031,
"learning_rate": 1.9994329540250918e-05,
"loss": -0.0094,
"reward": 5.250480055809021,
"reward_std": 0.6816554740071297,
"rewards/mrr_reward": 0.17541543021798134,
"rewards/rank_analyze_format_reward": 0.6894431859254837,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 678.734375,
"epoch": 1.176,
"grad_norm": 0.03161380812525749,
"kl": 0.008414268493652344,
"learning_rate": 1.99942445988754e-05,
"loss": -0.0024,
"reward": 6.2228370904922485,
"reward_std": 0.809022843837738,
"rewards/mrr_reward": 0.3891865164041519,
"rewards/rank_analyze_format_reward": 0.7911781966686249,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998003289103508,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.998003289103508,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 649.125,
"epoch": 1.184,
"grad_norm": 0.029842013493180275,
"kl": 0.009199142456054688,
"learning_rate": 1.999415902620875e-05,
"loss": -0.0172,
"reward": 5.441818833351135,
"reward_std": 0.9356655329465866,
"rewards/mrr_reward": 0.21156993880867958,
"rewards/rank_analyze_format_reward": 0.7728083282709122,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 643.71875,
"epoch": 1.192,
"grad_norm": 0.029202759265899658,
"kl": 0.007966995239257812,
"learning_rate": 1.999407282225637e-05,
"loss": 0.0142,
"reward": 6.0522788763046265,
"reward_std": 0.693210706114769,
"rewards/mrr_reward": 0.3665984570980072,
"rewards/rank_analyze_format_reward": 0.7421348392963409,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.96875,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 679.75,
"epoch": 1.2,
"grad_norm": 0.030646566301584244,
"kl": 0.00763702392578125,
"learning_rate": 1.9993985987023703e-05,
"loss": 0.025,
"reward": 5.18368136882782,
"reward_std": 0.5240126103162766,
"rewards/mrr_reward": 0.12678571417927742,
"rewards/rank_analyze_format_reward": 0.7496970891952515,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9995535761117935,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9995535761117935,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 665.375,
"epoch": 1.208,
"grad_norm": 0.029021795839071274,
"kl": 0.0076656341552734375,
"learning_rate": 1.9993898520516233e-05,
"loss": -0.0027,
"reward": 5.496846318244934,
"reward_std": 0.7196042984724045,
"rewards/mrr_reward": 0.20720486715435982,
"rewards/rank_analyze_format_reward": 0.7718491405248642,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.00924379751086235,
"rewards/rank_initial_format_reward": 0.9962014406919479,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9962014406919479,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 654.1875,
"epoch": 1.216,
"grad_norm": 0.030017321929335594,
"kl": 0.008272171020507812,
"learning_rate": 1.9993810422739496e-05,
"loss": 0.0039,
"reward": 5.460033655166626,
"reward_std": 1.0394816249608994,
"rewards/mrr_reward": 0.24322297610342503,
"rewards/rank_analyze_format_reward": 0.7152340114116669,
"rewards/rank_answer_foramt_reward": 0.830078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 675.9375,
"epoch": 1.224,
"grad_norm": 0.03180088475346565,
"kl": 0.00795745849609375,
"learning_rate": 1.999372169369904e-05,
"loss": -0.0025,
"reward": 5.007642865180969,
"reward_std": 0.6734954938292503,
"rewards/mrr_reward": 0.11055307649075985,
"rewards/rank_analyze_format_reward": 0.7876633703708649,
"rewards/rank_answer_foramt_reward": 0.828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 667.8125,
"epoch": 1.232,
"grad_norm": 0.030025500804185867,
"kl": 0.00997161865234375,
"learning_rate": 1.999363233340048e-05,
"loss": -0.0225,
"reward": 5.430343270301819,
"reward_std": 0.8861743956804276,
"rewards/mrr_reward": 0.21569321304559708,
"rewards/rank_analyze_format_reward": 0.7130149006843567,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9966137856245041,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9966137856245041,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 676.65625,
"epoch": 1.24,
"grad_norm": 0.028847267851233482,
"kl": 0.008131027221679688,
"learning_rate": 1.9993542341849462e-05,
"loss": 0.0216,
"reward": 5.840638756752014,
"reward_std": 0.8792509809136391,
"rewards/mrr_reward": 0.3192274421453476,
"rewards/rank_analyze_format_reward": 0.809514582157135,
"rewards/rank_answer_foramt_reward": 0.802734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 675.4375,
"epoch": 1.248,
"grad_norm": 0.02900339849293232,
"kl": 0.0074920654296875,
"learning_rate": 1.9993451719051663e-05,
"loss": 0.0185,
"reward": 5.44918155670166,
"reward_std": 1.0219481438398361,
"rewards/mrr_reward": 0.2371651791036129,
"rewards/rank_analyze_format_reward": 0.8032551407814026,
"rewards/rank_answer_foramt_reward": 0.751953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 673.5625,
"epoch": 1.256,
"grad_norm": 0.030704988166689873,
"kl": 0.008420944213867188,
"learning_rate": 1.999336046501281e-05,
"loss": 0.0103,
"reward": 5.615517616271973,
"reward_std": 0.6072335783392191,
"rewards/mrr_reward": 0.21866939775645733,
"rewards/rank_analyze_format_reward": 0.8225629776716232,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 621.53125,
"epoch": 1.264,
"grad_norm": 0.03495262190699577,
"kl": 0.010213851928710938,
"learning_rate": 1.999326857973867e-05,
"loss": 0.0143,
"reward": 5.4609445333480835,
"reward_std": 1.0773909091949463,
"rewards/mrr_reward": 0.24725942313671112,
"rewards/rank_analyze_format_reward": 0.7313768267631531,
"rewards/rank_answer_foramt_reward": 0.80078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9972181469202042,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9815931469202042,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 659.21875,
"epoch": 1.272,
"grad_norm": 0.030684208497405052,
"kl": 0.008328437805175781,
"learning_rate": 1.9993176063235046e-05,
"loss": -0.0004,
"reward": 5.58943784236908,
"reward_std": 0.7682318538427353,
"rewards/mrr_reward": 0.23807664960622787,
"rewards/rank_analyze_format_reward": 0.7750714123249054,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 664.078125,
"epoch": 1.28,
"grad_norm": 0.03072858415544033,
"kl": 0.008135795593261719,
"learning_rate": 1.9993082915507776e-05,
"loss": -0.0049,
"reward": 5.318588376045227,
"reward_std": 0.7114385366439819,
"rewards/mrr_reward": 0.1840277872979641,
"rewards/rank_analyze_format_reward": 0.7484926581382751,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.96875,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 688.203125,
"epoch": 1.288,
"grad_norm": 0.029995588585734367,
"kl": 0.0068206787109375,
"learning_rate": 1.999298913656275e-05,
"loss": 0.0295,
"reward": 5.286089658737183,
"reward_std": 0.985167570412159,
"rewards/mrr_reward": 0.20013641566038132,
"rewards/rank_analyze_format_reward": 0.7478553950786591,
"rewards/rank_answer_foramt_reward": 0.828125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9899380803108215,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9743130803108215,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 645.953125,
"epoch": 1.296,
"grad_norm": 0.030260177329182625,
"kl": 0.0070362091064453125,
"learning_rate": 1.9992894726405894e-05,
"loss": -0.0186,
"reward": 5.705159902572632,
"reward_std": 0.8278112560510635,
"rewards/mrr_reward": 0.26141493394970894,
"rewards/rank_analyze_format_reward": 0.7882915586233139,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 663.359375,
"epoch": 1.304,
"grad_norm": 0.029093291610479355,
"kl": 0.007663726806640625,
"learning_rate": 1.9992799685043165e-05,
"loss": -0.0021,
"reward": 5.266281008720398,
"reward_std": 0.7597630694508553,
"rewards/mrr_reward": 0.15634300746023655,
"rewards/rank_analyze_format_reward": 0.8102140724658966,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 649.34375,
"epoch": 1.312,
"grad_norm": 0.030477292835712433,
"kl": 0.009012222290039062,
"learning_rate": 1.999270401248057e-05,
"loss": 0.005,
"reward": 5.561887741088867,
"reward_std": 0.9077768623828888,
"rewards/mrr_reward": 0.24484127387404442,
"rewards/rank_analyze_format_reward": 0.7151724994182587,
"rewards/rank_answer_foramt_reward": 0.859375,
"rewards/rank_contrast_format_reward": 0.013020833022892475,
"rewards/rank_initial_format_reward": 0.997477263212204,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.997477263212204,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 631.546875,
"epoch": 1.32,
"grad_norm": 0.030477292835712433,
"kl": 0.007292747497558594,
"learning_rate": 1.999270401248057e-05,
"loss": -0.0001,
"reward": 5.293234348297119,
"reward_std": 0.6412914916872978,
"rewards/mrr_reward": 0.16529638320207596,
"rewards/rank_analyze_format_reward": 0.7485833615064621,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 638.6875,
"epoch": 1.328,
"grad_norm": 0.031677015125751495,
"kl": 0.008253097534179688,
"learning_rate": 1.999260770872415e-05,
"loss": -0.0139,
"reward": 5.761396527290344,
"reward_std": 0.675188884139061,
"rewards/mrr_reward": 0.29050719179213047,
"rewards/rank_analyze_format_reward": 0.7996452897787094,
"rewards/rank_answer_foramt_reward": 0.80859375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9955643564462662,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9955643564462662,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 647.5625,
"epoch": 1.336,
"grad_norm": 0.02820313349366188,
"kl": 0.00614166259765625,
"learning_rate": 1.999251077377999e-05,
"loss": 0.0049,
"reward": 5.31193470954895,
"reward_std": 0.7302871681749821,
"rewards/mrr_reward": 0.1731150783598423,
"rewards/rank_analyze_format_reward": 0.7047950625419617,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 667.875,
"epoch": 1.3439999999999999,
"grad_norm": 0.02908385545015335,
"kl": 0.007312774658203125,
"learning_rate": 1.999241320765421e-05,
"loss": 0.005,
"reward": 5.744455337524414,
"reward_std": 0.6932341083884239,
"rewards/mrr_reward": 0.25336061976850033,
"rewards/rank_analyze_format_reward": 0.8142653256654739,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 620.890625,
"epoch": 1.3519999999999999,
"grad_norm": 0.03778872266411781,
"kl": 0.01438140869140625,
"learning_rate": 1.9992315010352978e-05,
"loss": -0.05,
"reward": 5.296718597412109,
"reward_std": 1.1021133363246918,
"rewards/mrr_reward": 0.20252975821495056,
"rewards/rank_analyze_format_reward": 0.7184047400951385,
"rewards/rank_answer_foramt_reward": 0.779296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 647.734375,
"epoch": 1.3599999999999999,
"grad_norm": 0.032183870673179626,
"kl": 0.0066356658935546875,
"learning_rate": 1.9992216181882492e-05,
"loss": 0.014,
"reward": 5.106685400009155,
"reward_std": 0.636107549071312,
"rewards/mrr_reward": 0.12468378245830536,
"rewards/rank_analyze_format_reward": 0.7776422798633575,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 652.328125,
"epoch": 1.3679999999999999,
"grad_norm": 0.02835104614496231,
"kl": 0.007082939147949219,
"learning_rate": 1.9992116722248997e-05,
"loss": -0.0227,
"reward": 5.181930780410767,
"reward_std": 0.7388085126876831,
"rewards/mrr_reward": 0.1869481634348631,
"rewards/rank_analyze_format_reward": 0.7075169235467911,
"rewards/rank_answer_foramt_reward": 0.75,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9961231350898743,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9961231350898743,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 671.078125,
"epoch": 1.376,
"grad_norm": 0.02974073402583599,
"kl": 0.006779670715332031,
"learning_rate": 1.9992016631458774e-05,
"loss": -0.0135,
"reward": 5.27878475189209,
"reward_std": 0.6198801323771477,
"rewards/mrr_reward": 0.15861235558986664,
"rewards/rank_analyze_format_reward": 0.7224602103233337,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 643.046875,
"epoch": 1.384,
"grad_norm": 0.031048448756337166,
"kl": 0.006651878356933594,
"learning_rate": 1.9991915909518146e-05,
"loss": 0.0086,
"reward": 5.175734996795654,
"reward_std": 0.6161081194877625,
"rewards/mrr_reward": 0.14768726006150246,
"rewards/rank_analyze_format_reward": 0.788111001253128,
"rewards/rank_answer_foramt_reward": 0.84375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 649.421875,
"epoch": 1.392,
"grad_norm": 0.032564930617809296,
"kl": 0.006524085998535156,
"learning_rate": 1.9991814556433475e-05,
"loss": 0.0131,
"reward": 5.380582928657532,
"reward_std": 0.7940613478422165,
"rewards/mrr_reward": 0.1871589906513691,
"rewards/rank_analyze_format_reward": 0.7780078798532486,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 636.640625,
"epoch": 1.4,
"grad_norm": 0.027925679460167885,
"kl": 0.008939743041992188,
"learning_rate": 1.9991712572211163e-05,
"loss": 0.0071,
"reward": 4.99386203289032,
"reward_std": 0.8465652763843536,
"rewards/mrr_reward": 0.13313491828739643,
"rewards/rank_analyze_format_reward": 0.6585879027843475,
"rewards/rank_answer_foramt_reward": 0.857421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 631.265625,
"epoch": 1.408,
"grad_norm": 0.02996164932847023,
"kl": 0.0084228515625,
"learning_rate": 1.999160995685765e-05,
"loss": 0.004,
"reward": 5.5598554611206055,
"reward_std": 0.8114011436700821,
"rewards/mrr_reward": 0.22632689774036407,
"rewards/rank_analyze_format_reward": 0.7654528021812439,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 633.15625,
"epoch": 1.416,
"grad_norm": 0.03037683106958866,
"kl": 0.006896018981933594,
"learning_rate": 1.9991506710379424e-05,
"loss": 0.0169,
"reward": 5.7041707038879395,
"reward_std": 1.061623454093933,
"rewards/mrr_reward": 0.28074776753783226,
"rewards/rank_analyze_format_reward": 0.7275251597166061,
"rewards/rank_answer_foramt_reward": 0.87109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9912803918123245,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9912803918123245,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 642.078125,
"epoch": 1.424,
"grad_norm": 0.02834421582520008,
"kl": 0.006161689758300781,
"learning_rate": 1.9991402832783e-05,
"loss": -0.0165,
"reward": 5.452821254730225,
"reward_std": 0.7667314857244492,
"rewards/mrr_reward": 0.22672370821237564,
"rewards/rank_analyze_format_reward": 0.6621106863021851,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9965953528881073,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9809703528881073,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 611.515625,
"epoch": 1.432,
"grad_norm": 0.03392954543232918,
"kl": 0.007987022399902344,
"learning_rate": 1.9991298324074942e-05,
"loss": 0.0131,
"reward": 5.4409414529800415,
"reward_std": 0.9278188347816467,
"rewards/mrr_reward": 0.22803819552063942,
"rewards/rank_analyze_format_reward": 0.7239862233400345,
"rewards/rank_answer_foramt_reward": 0.830078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 665.4375,
"epoch": 1.44,
"grad_norm": 0.02850930020213127,
"kl": 0.006137847900390625,
"learning_rate": 1.999119318426185e-05,
"loss": 0.0109,
"reward": 5.493017673492432,
"reward_std": 0.5988549739122391,
"rewards/mrr_reward": 0.19139384850859642,
"rewards/rank_analyze_format_reward": 0.8054522722959518,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 652.328125,
"epoch": 1.448,
"grad_norm": 0.03058907762169838,
"kl": 0.006911277770996094,
"learning_rate": 1.9991087413350367e-05,
"loss": -0.0328,
"reward": 5.574859023094177,
"reward_std": 1.08903668820858,
"rewards/mrr_reward": 0.26563740335404873,
"rewards/rank_analyze_format_reward": 0.7526647448539734,
"rewards/rank_answer_foramt_reward": 0.8359375,
"rewards/rank_contrast_format_reward": 0.013681219890713692,
"rewards/rank_initial_format_reward": 0.99407559633255,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.96282559633255,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 629.359375,
"epoch": 1.456,
"grad_norm": 0.032083019614219666,
"kl": 0.007262229919433594,
"learning_rate": 1.9990981011347172e-05,
"loss": -0.0432,
"reward": 5.140234351158142,
"reward_std": 0.8412456661462784,
"rewards/mrr_reward": 0.14855531603097916,
"rewards/rank_analyze_format_reward": 0.7074387818574905,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.014168431982398033,
"rewards/rank_initial_format_reward": 0.9961873590946198,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9805623590946198,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 605.046875,
"epoch": 1.464,
"grad_norm": 0.03227703273296356,
"kl": 0.007939338684082031,
"learning_rate": 1.999087397825899e-05,
"loss": -0.0575,
"reward": 4.993819952011108,
"reward_std": 0.7700471132993698,
"rewards/mrr_reward": 0.13467882573604584,
"rewards/rank_analyze_format_reward": 0.70456662774086,
"rewards/rank_answer_foramt_reward": 0.763671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9973393976688385,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 631.796875,
"epoch": 1.472,
"grad_norm": 0.02891026809811592,
"kl": 0.00670623779296875,
"learning_rate": 1.9990766314092575e-05,
"loss": -0.0023,
"reward": 5.713563561439514,
"reward_std": 0.8618924953043461,
"rewards/mrr_reward": 0.2501860074698925,
"rewards/rank_analyze_format_reward": 0.8630950748920441,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 639.734375,
"epoch": 1.48,
"grad_norm": 0.030336197465658188,
"kl": 0.0074062347412109375,
"learning_rate": 1.9990658018854737e-05,
"loss": -0.0133,
"reward": 5.221981048583984,
"reward_std": 0.584196537733078,
"rewards/mrr_reward": 0.1581659186631441,
"rewards/rank_analyze_format_reward": 0.7678205668926239,
"rewards/rank_answer_foramt_reward": 0.85546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9986388385295868,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9986388385295868,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 629.21875,
"epoch": 1.488,
"grad_norm": 0.03278738632798195,
"kl": 0.0077762603759765625,
"learning_rate": 1.9990549092552307e-05,
"loss": 0.0038,
"reward": 5.634615182876587,
"reward_std": 0.598552655428648,
"rewards/mrr_reward": 0.23745040595531464,
"rewards/rank_analyze_format_reward": 0.7684896737337112,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 635.484375,
"epoch": 1.496,
"grad_norm": 0.03179846331477165,
"kl": 0.007142066955566406,
"learning_rate": 1.999043953519217e-05,
"loss": 0.0049,
"reward": 5.652546405792236,
"reward_std": 0.9589240476489067,
"rewards/mrr_reward": 0.2782428301870823,
"rewards/rank_analyze_format_reward": 0.6794685870409012,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 603.75,
"epoch": 1.504,
"grad_norm": 0.029573241248726845,
"kl": 0.00788116455078125,
"learning_rate": 1.999032934678125e-05,
"loss": -0.018,
"reward": 5.10038423538208,
"reward_std": 0.5980538204312325,
"rewards/mrr_reward": 0.14192708767950535,
"rewards/rank_analyze_format_reward": 0.627951592206955,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.995330885052681,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.995330885052681,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 646.3125,
"epoch": 1.512,
"grad_norm": 0.03184956684708595,
"kl": 0.007494926452636719,
"learning_rate": 1.99902185273265e-05,
"loss": 0.0342,
"reward": 5.1105430126190186,
"reward_std": 0.4383184686303139,
"rewards/mrr_reward": 0.10293898917734623,
"rewards/rank_analyze_format_reward": 0.8510159552097321,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 634.046875,
"epoch": 1.52,
"grad_norm": 0.03131790831685066,
"kl": 0.006718635559082031,
"learning_rate": 1.999010707683492e-05,
"loss": -0.0412,
"reward": 5.197941780090332,
"reward_std": 0.6700362041592598,
"rewards/mrr_reward": 0.16383928433060646,
"rewards/rank_analyze_format_reward": 0.6899384260177612,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9917527735233307,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9917527735233307,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 606.578125,
"epoch": 1.528,
"grad_norm": 0.031343378126621246,
"kl": 0.0063381195068359375,
"learning_rate": 1.998999499531356e-05,
"loss": -0.0108,
"reward": 6.47118866443634,
"reward_std": 1.109221488237381,
"rewards/mrr_reward": 0.4581039249897003,
"rewards/rank_analyze_format_reward": 0.6661168932914734,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 622.703125,
"epoch": 1.536,
"grad_norm": 0.03459320589900017,
"kl": 0.007761955261230469,
"learning_rate": 1.9989882282769485e-05,
"loss": -0.0315,
"reward": 5.452162504196167,
"reward_std": 0.8313007205724716,
"rewards/mrr_reward": 0.20660342648625374,
"rewards/rank_analyze_format_reward": 0.7848766297101974,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.010794081725180149,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 589.8125,
"epoch": 1.544,
"grad_norm": 0.034033406525850296,
"kl": 0.007235527038574219,
"learning_rate": 1.9989768939209826e-05,
"loss": -0.0153,
"reward": 5.301190137863159,
"reward_std": 0.7263774573802948,
"rewards/mrr_reward": 0.22665550373494625,
"rewards/rank_analyze_format_reward": 0.5403951182961464,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.012031249701976776,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9981800019741058,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 611.953125,
"epoch": 1.552,
"grad_norm": 0.03618309274315834,
"kl": 0.0074520111083984375,
"learning_rate": 1.9989654964641737e-05,
"loss": -0.0302,
"reward": 5.4693708419799805,
"reward_std": 1.3693420886993408,
"rewards/mrr_reward": 0.2628844305872917,
"rewards/rank_analyze_format_reward": 0.7108018025755882,
"rewards/rank_answer_foramt_reward": 0.77734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 622.015625,
"epoch": 1.56,
"grad_norm": 0.0314011313021183,
"kl": 0.008198738098144531,
"learning_rate": 1.998954035907242e-05,
"loss": -0.0073,
"reward": 5.535840272903442,
"reward_std": 0.847998857498169,
"rewards/mrr_reward": 0.2551587335765362,
"rewards/rank_analyze_format_reward": 0.7528696805238724,
"rewards/rank_answer_foramt_reward": 0.8203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 687.484375,
"epoch": 1.568,
"grad_norm": 0.029867514967918396,
"kl": 0.0064868927001953125,
"learning_rate": 1.9989425122509113e-05,
"loss": -0.0081,
"reward": 5.249784588813782,
"reward_std": 0.7226946577429771,
"rewards/mrr_reward": 0.14637276344001293,
"rewards/rank_analyze_format_reward": 0.8121927380561829,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9817143976688385,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 662.015625,
"epoch": 1.576,
"grad_norm": 0.029847877100110054,
"kl": 0.0074748992919921875,
"learning_rate": 1.9989309254959096e-05,
"loss": -0.0112,
"reward": 5.975355625152588,
"reward_std": 1.1915720701217651,
"rewards/mrr_reward": 0.32516741193830967,
"rewards/rank_analyze_format_reward": 0.8367954790592194,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 645.421875,
"epoch": 1.584,
"grad_norm": 0.030575547367334366,
"kl": 0.00701141357421875,
"learning_rate": 1.998919275642968e-05,
"loss": -0.0259,
"reward": 5.382705450057983,
"reward_std": 0.5365985631942749,
"rewards/mrr_reward": 0.17048611491918564,
"rewards/rank_analyze_format_reward": 0.7825622856616974,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 627.40625,
"epoch": 1.592,
"grad_norm": 0.0321117527782917,
"kl": 0.006863594055175781,
"learning_rate": 1.9989075626928237e-05,
"loss": -0.0073,
"reward": 5.219160199165344,
"reward_std": 0.6420910395681858,
"rewards/mrr_reward": 0.16282862052321434,
"rewards/rank_analyze_format_reward": 0.6769122779369354,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 656.796875,
"epoch": 1.6,
"grad_norm": 0.03242700546979904,
"kl": 0.007241249084472656,
"learning_rate": 1.9988957866462155e-05,
"loss": -0.0011,
"reward": 5.902221083641052,
"reward_std": 0.630526065826416,
"rewards/mrr_reward": 0.2948412746191025,
"rewards/rank_analyze_format_reward": 0.8107465952634811,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 654.90625,
"epoch": 1.608,
"grad_norm": 0.031312599778175354,
"kl": 0.006711006164550781,
"learning_rate": 1.998883947503888e-05,
"loss": -0.0324,
"reward": 5.3306708335876465,
"reward_std": 0.6714678555727005,
"rewards/mrr_reward": 0.18062996119260788,
"rewards/rank_analyze_format_reward": 0.7797176241874695,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 658.296875,
"epoch": 1.616,
"grad_norm": 0.03099161572754383,
"kl": 0.008016586303710938,
"learning_rate": 1.9988720452665885e-05,
"loss": -0.0357,
"reward": 5.8526880741119385,
"reward_std": 0.7714151293039322,
"rewards/mrr_reward": 0.2807725705206394,
"rewards/rank_analyze_format_reward": 0.7999103516340256,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 622.78125,
"epoch": 1.624,
"grad_norm": 0.031240420415997505,
"kl": 0.007191658020019531,
"learning_rate": 1.9988600799350685e-05,
"loss": -0.0077,
"reward": 5.453703999519348,
"reward_std": 0.7480319663882256,
"rewards/mrr_reward": 0.2080853171646595,
"rewards/rank_analyze_format_reward": 0.7671940922737122,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 626.09375,
"epoch": 1.6320000000000001,
"grad_norm": 0.033636048436164856,
"kl": 0.008371353149414062,
"learning_rate": 1.998848051510085e-05,
"loss": 0.0101,
"reward": 5.431292653083801,
"reward_std": 0.7651955038309097,
"rewards/mrr_reward": 0.20585318095982075,
"rewards/rank_analyze_format_reward": 0.7279091775417328,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9956494122743607,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9800244122743607,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 653.421875,
"epoch": 1.6400000000000001,
"grad_norm": 0.03052719309926033,
"kl": 0.008235931396484375,
"learning_rate": 1.9988359599923964e-05,
"loss": -0.0077,
"reward": 5.360659718513489,
"reward_std": 0.564825750887394,
"rewards/mrr_reward": 0.1643043179064989,
"rewards/rank_analyze_format_reward": 0.8024349361658096,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 640.65625,
"epoch": 1.6480000000000001,
"grad_norm": 0.030079178512096405,
"kl": 0.0066680908203125,
"learning_rate": 1.9988238053827677e-05,
"loss": -0.0209,
"reward": 5.6915318965911865,
"reward_std": 0.9940572530031204,
"rewards/mrr_reward": 0.24549851939082146,
"rewards/rank_analyze_format_reward": 0.8010262995958328,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 635.15625,
"epoch": 1.6560000000000001,
"grad_norm": 0.03138303756713867,
"kl": 0.008875846862792969,
"learning_rate": 1.9988115876819654e-05,
"loss": -0.0312,
"reward": 5.914277911186218,
"reward_std": 0.8364528864622116,
"rewards/mrr_reward": 0.3293774798512459,
"rewards/rank_analyze_format_reward": 0.7483799606561661,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9661861509084702,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 606.828125,
"epoch": 1.6640000000000001,
"grad_norm": 0.031958747655153275,
"kl": 0.009166717529296875,
"learning_rate": 1.9987993068907624e-05,
"loss": -0.0156,
"reward": 5.197036981582642,
"reward_std": 0.5283743739128113,
"rewards/mrr_reward": 0.1414062473922968,
"rewards/rank_analyze_format_reward": 0.7246968895196915,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 631.703125,
"epoch": 1.6720000000000002,
"grad_norm": 0.03019367717206478,
"kl": 0.007928848266601562,
"learning_rate": 1.9987869630099333e-05,
"loss": -0.0281,
"reward": 5.924240350723267,
"reward_std": 1.284628689289093,
"rewards/mrr_reward": 0.3399987667798996,
"rewards/rank_analyze_format_reward": 0.7304840087890625,
"rewards/rank_answer_foramt_reward": 0.849609375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9959821403026581,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9959821403026581,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 621.828125,
"epoch": 1.6800000000000002,
"grad_norm": 0.03171711415052414,
"kl": 0.008306503295898438,
"learning_rate": 1.998774556040259e-05,
"loss": -0.0146,
"reward": 5.582144498825073,
"reward_std": 0.7671663761138916,
"rewards/mrr_reward": 0.23291171342134476,
"rewards/rank_analyze_format_reward": 0.7457775175571442,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 678.453125,
"epoch": 1.688,
"grad_norm": 0.03076333925127983,
"kl": 0.0064449310302734375,
"learning_rate": 1.9987620859825225e-05,
"loss": -0.0194,
"reward": 5.3084797859191895,
"reward_std": 0.4021785408258438,
"rewards/mrr_reward": 0.13763020560145378,
"rewards/rank_analyze_format_reward": 0.8322708457708359,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.997023805975914,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.997023805975914,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 645.4375,
"epoch": 1.696,
"grad_norm": 0.03446948900818825,
"kl": 0.007775306701660156,
"learning_rate": 1.9987495528375115e-05,
"loss": 0.0253,
"reward": 5.778676629066467,
"reward_std": 0.4526245817542076,
"rewards/mrr_reward": 0.2587859593331814,
"rewards/rank_analyze_format_reward": 0.8156834691762924,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 686.171875,
"epoch": 1.704,
"grad_norm": 0.03106614388525486,
"kl": 0.0079193115234375,
"learning_rate": 1.998736956606018e-05,
"loss": -0.0063,
"reward": 5.9417431354522705,
"reward_std": 0.816119559109211,
"rewards/mrr_reward": 0.31315724551677704,
"rewards/rank_analyze_format_reward": 0.7785349041223526,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 666.65625,
"epoch": 1.712,
"grad_norm": 0.03145559877157211,
"kl": 0.0077838897705078125,
"learning_rate": 1.9987242972888368e-05,
"loss": 0.0266,
"reward": 5.924510478973389,
"reward_std": 1.2289659082889557,
"rewards/mrr_reward": 0.32067212648689747,
"rewards/rank_analyze_format_reward": 0.7929693013429642,
"rewards/rank_answer_foramt_reward": 0.875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9947387874126434,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9947387874126434,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 649.828125,
"epoch": 1.72,
"grad_norm": 0.030902279540896416,
"kl": 0.007624626159667969,
"learning_rate": 1.9987115748867685e-05,
"loss": 0.0034,
"reward": 5.730231523513794,
"reward_std": 0.720735490322113,
"rewards/mrr_reward": 0.24854290671646595,
"rewards/rank_analyze_format_reward": 0.8314545601606369,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 630.578125,
"epoch": 1.728,
"grad_norm": 0.03282896429300308,
"kl": 0.0073909759521484375,
"learning_rate": 1.9986987894006164e-05,
"loss": -0.0497,
"reward": 5.874699831008911,
"reward_std": 0.8969438448548317,
"rewards/mrr_reward": 0.2993737608194351,
"rewards/rank_analyze_format_reward": 0.7611890435218811,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 591.046875,
"epoch": 1.736,
"grad_norm": 0.031988587230443954,
"kl": 0.009072303771972656,
"learning_rate": 1.9986859408311878e-05,
"loss": -0.0426,
"reward": 5.34618878364563,
"reward_std": 0.9178062975406647,
"rewards/mrr_reward": 0.21951264888048172,
"rewards/rank_analyze_format_reward": 0.6615510508418083,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9941138625144958,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9941138625144958,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 651.53125,
"epoch": 1.744,
"grad_norm": 0.0311751589179039,
"kl": 0.0080718994140625,
"learning_rate": 1.9986730291792945e-05,
"loss": -0.0354,
"reward": 5.135533690452576,
"reward_std": 0.5661691799759865,
"rewards/mrr_reward": 0.12121155858039856,
"rewards/rank_analyze_format_reward": 0.8167032152414322,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 645.828125,
"epoch": 1.752,
"grad_norm": 0.03084523230791092,
"kl": 0.007534980773925781,
"learning_rate": 1.9986600544457524e-05,
"loss": -0.0277,
"reward": 5.917717456817627,
"reward_std": 0.8369560539722443,
"rewards/mrr_reward": 0.3187128081917763,
"rewards/rank_analyze_format_reward": 0.7128703743219376,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 654.984375,
"epoch": 1.76,
"grad_norm": 0.033047858625650406,
"kl": 0.0073394775390625,
"learning_rate": 1.9986470166313805e-05,
"loss": 0.0205,
"reward": 5.320756673812866,
"reward_std": 0.4847453236579895,
"rewards/mrr_reward": 0.14311756193637848,
"rewards/rank_analyze_format_reward": 0.857081413269043,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 635.125,
"epoch": 1.768,
"grad_norm": 0.035302143543958664,
"kl": 0.00738525390625,
"learning_rate": 1.9986339157370026e-05,
"loss": -0.049,
"reward": 5.3989468812942505,
"reward_std": 0.8359893411397934,
"rewards/mrr_reward": 0.19479167088866234,
"rewards/rank_analyze_format_reward": 0.7908818274736404,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9984335899353027,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9984335899353027,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 651.96875,
"epoch": 1.776,
"grad_norm": 0.03167646750807762,
"kl": 0.008357048034667969,
"learning_rate": 1.9986207517634466e-05,
"loss": -0.0245,
"reward": 5.336655378341675,
"reward_std": 0.7819190472364426,
"rewards/mrr_reward": 0.17885665223002434,
"rewards/rank_analyze_format_reward": 0.7340867817401886,
"rewards/rank_answer_foramt_reward": 0.8984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 651.890625,
"epoch": 1.784,
"grad_norm": 0.03162270411849022,
"kl": 0.007929801940917969,
"learning_rate": 1.998607524711543e-05,
"loss": -0.0394,
"reward": 5.528796672821045,
"reward_std": 0.6584100723266602,
"rewards/mrr_reward": 0.2038008477538824,
"rewards/rank_analyze_format_reward": 0.7580919712781906,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 635.4375,
"epoch": 1.792,
"grad_norm": 0.0319884791970253,
"kl": 0.008496284484863281,
"learning_rate": 1.9985942345821285e-05,
"loss": -0.0326,
"reward": 5.295137047767639,
"reward_std": 0.6776984333992004,
"rewards/mrr_reward": 0.15868675522506237,
"rewards/rank_analyze_format_reward": 0.7750860899686813,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9973393976688385,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 660.25,
"epoch": 1.8,
"grad_norm": 0.032160449773073196,
"kl": 0.0078582763671875,
"learning_rate": 1.998580881376042e-05,
"loss": -0.0341,
"reward": 5.92940092086792,
"reward_std": 1.2256246581673622,
"rewards/mrr_reward": 0.33193204551935196,
"rewards/rank_analyze_format_reward": 0.772309273481369,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9976895451545715,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9820645451545715,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 682.046875,
"epoch": 1.808,
"grad_norm": 0.0322372205555439,
"kl": 0.0072021484375,
"learning_rate": 1.9985674650941265e-05,
"loss": -0.0123,
"reward": 6.088730692863464,
"reward_std": 1.1017219424247742,
"rewards/mrr_reward": 0.3432415649294853,
"rewards/rank_analyze_format_reward": 0.7877216339111328,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 662.0,
"epoch": 1.8159999999999998,
"grad_norm": 0.03190344572067261,
"kl": 0.008536338806152344,
"learning_rate": 1.9985539857372303e-05,
"loss": -0.0181,
"reward": 6.0037089586257935,
"reward_std": 1.1030287593603134,
"rewards/mrr_reward": 0.31786955520510674,
"rewards/rank_analyze_format_reward": 0.7933959513902664,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9977376908063889,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9977376908063889,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 637.328125,
"epoch": 1.8239999999999998,
"grad_norm": 0.035804688930511475,
"kl": 0.008830070495605469,
"learning_rate": 1.998540443306204e-05,
"loss": -0.0227,
"reward": 5.816386938095093,
"reward_std": 0.5872849300503731,
"rewards/mrr_reward": 0.2789682596921921,
"rewards/rank_analyze_format_reward": 0.7708265483379364,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 662.359375,
"epoch": 1.8319999999999999,
"grad_norm": 0.030482899397611618,
"kl": 0.00791168212890625,
"learning_rate": 1.998526837801904e-05,
"loss": -0.0145,
"reward": 5.763516783714294,
"reward_std": 0.9057382866740227,
"rewards/mrr_reward": 0.2729290686547756,
"rewards/rank_analyze_format_reward": 0.7831287831068039,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 688.25,
"epoch": 1.8399999999999999,
"grad_norm": 0.03433312475681305,
"kl": 0.009449005126953125,
"learning_rate": 1.9985131692251887e-05,
"loss": 0.0284,
"reward": 5.669819235801697,
"reward_std": 0.6757724024355412,
"rewards/mrr_reward": 0.21458952501416206,
"rewards/rank_analyze_format_reward": 0.8934923410415649,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 658.875,
"epoch": 1.8479999999999999,
"grad_norm": 0.030251074582338333,
"kl": 0.008405685424804688,
"learning_rate": 1.9984994375769222e-05,
"loss": -0.0343,
"reward": 5.75708794593811,
"reward_std": 0.4988391697406769,
"rewards/mrr_reward": 0.2610367089509964,
"rewards/rank_analyze_format_reward": 0.7592362314462662,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 643.390625,
"epoch": 1.8559999999999999,
"grad_norm": 0.03255580738186836,
"kl": 0.010509490966796875,
"learning_rate": 1.9984856428579717e-05,
"loss": -0.0253,
"reward": 5.649736762046814,
"reward_std": 0.8045858144760132,
"rewards/mrr_reward": 0.23132441379129887,
"rewards/rank_analyze_format_reward": 0.8100680112838745,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 646.625,
"epoch": 1.8639999999999999,
"grad_norm": 0.03155457600951195,
"kl": 0.008493423461914062,
"learning_rate": 1.998471785069208e-05,
"loss": 0.0024,
"reward": 5.295361399650574,
"reward_std": 0.409699484705925,
"rewards/mrr_reward": 0.1384796667844057,
"rewards/rank_analyze_format_reward": 0.8116403520107269,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 639.734375,
"epoch": 1.8719999999999999,
"grad_norm": 0.03446485847234726,
"kl": 0.008045196533203125,
"learning_rate": 1.9984578642115072e-05,
"loss": 0.0044,
"reward": 5.8424142599105835,
"reward_std": 0.7500941399484873,
"rewards/mrr_reward": 0.28348215110599995,
"rewards/rank_analyze_format_reward": 0.7956449091434479,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 632.15625,
"epoch": 1.88,
"grad_norm": 0.03296066075563431,
"kl": 0.007839202880859375,
"learning_rate": 1.998443880285748e-05,
"loss": 0.0001,
"reward": 6.259778738021851,
"reward_std": 1.2440795004367828,
"rewards/mrr_reward": 0.4108507037162781,
"rewards/rank_analyze_format_reward": 0.7565357685089111,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 633.890625,
"epoch": 1.888,
"grad_norm": 0.03121933341026306,
"kl": 0.009092330932617188,
"learning_rate": 1.9984298332928142e-05,
"loss": -0.02,
"reward": 6.254051685333252,
"reward_std": 1.0369019284844398,
"rewards/mrr_reward": 0.3944692611694336,
"rewards/rank_analyze_format_reward": 0.780887171626091,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.01442819181829691,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 675.0,
"epoch": 1.896,
"grad_norm": 0.03492776304483414,
"kl": 0.008260726928710938,
"learning_rate": 1.9984157232335926e-05,
"loss": -0.0176,
"reward": 5.5136624574661255,
"reward_std": 0.5710221119225025,
"rewards/mrr_reward": 0.20375123620033264,
"rewards/rank_analyze_format_reward": 0.7764740437269211,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 677.265625,
"epoch": 1.904,
"grad_norm": 0.03103082999587059,
"kl": 0.009332656860351562,
"learning_rate": 1.998401550108975e-05,
"loss": -0.0236,
"reward": 5.937364459037781,
"reward_std": 0.6469563692808151,
"rewards/mrr_reward": 0.28705357387661934,
"rewards/rank_analyze_format_reward": 0.820400133728981,
"rewards/rank_answer_foramt_reward": 0.96875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 673.921875,
"epoch": 1.912,
"grad_norm": 0.030047811567783356,
"kl": 0.008413314819335938,
"learning_rate": 1.9983873139198565e-05,
"loss": 0.016,
"reward": 5.476260185241699,
"reward_std": 0.6603549867868423,
"rewards/mrr_reward": 0.18887649476528168,
"rewards/rank_analyze_format_reward": 0.8105982840061188,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 658.5,
"epoch": 1.92,
"grad_norm": 0.03159939870238304,
"kl": 0.007993698120117188,
"learning_rate": 1.9983730146671363e-05,
"loss": 0.0115,
"reward": 5.526045322418213,
"reward_std": 0.9571312367916107,
"rewards/mrr_reward": 0.23610491305589676,
"rewards/rank_analyze_format_reward": 0.7670577019453049,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 626.828125,
"epoch": 1.928,
"grad_norm": 0.029709013178944588,
"kl": 0.008957862854003906,
"learning_rate": 1.9983586523517175e-05,
"loss": -0.0143,
"reward": 6.118180155754089,
"reward_std": 0.942285418510437,
"rewards/mrr_reward": 0.36377108097076416,
"rewards/rank_analyze_format_reward": 0.7112708389759064,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964202791452408,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9964202791452408,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 686.921875,
"epoch": 1.936,
"grad_norm": 0.032562170177698135,
"kl": 0.00933074951171875,
"learning_rate": 1.9983442269745073e-05,
"loss": -0.0013,
"reward": 5.5490440130233765,
"reward_std": 0.6350295543670654,
"rewards/mrr_reward": 0.20087425410747528,
"rewards/rank_analyze_format_reward": 0.833014503121376,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 651.125,
"epoch": 1.944,
"grad_norm": 0.034032415598630905,
"kl": 0.008647918701171875,
"learning_rate": 1.9983297385364166e-05,
"loss": 0.0212,
"reward": 5.475777506828308,
"reward_std": 0.5296052135527134,
"rewards/mrr_reward": 0.19027157872915268,
"rewards/rank_analyze_format_reward": 0.7654724419116974,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 678.125,
"epoch": 1.952,
"grad_norm": 0.03090350329875946,
"kl": 0.009029388427734375,
"learning_rate": 1.9983151870383614e-05,
"loss": -0.0167,
"reward": 5.95075786113739,
"reward_std": 0.9621450752019882,
"rewards/mrr_reward": 0.3183469697833061,
"rewards/rank_analyze_format_reward": 0.8020616918802261,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 659.34375,
"epoch": 1.96,
"grad_norm": 0.031676217913627625,
"kl": 0.009099960327148438,
"learning_rate": 1.99830057248126e-05,
"loss": -0.0003,
"reward": 5.529332995414734,
"reward_std": 0.665337011218071,
"rewards/mrr_reward": 0.19913194328546524,
"rewards/rank_analyze_format_reward": 0.84761643409729,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 634.1875,
"epoch": 1.968,
"grad_norm": 0.03511827066540718,
"kl": 0.00841522216796875,
"learning_rate": 1.9982858948660363e-05,
"loss": -0.0093,
"reward": 5.8812315464019775,
"reward_std": 0.9640037417411804,
"rewards/mrr_reward": 0.3079365137964487,
"rewards/rank_analyze_format_reward": 0.8350324183702469,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 646.09375,
"epoch": 1.976,
"grad_norm": 0.03402575105428696,
"kl": 0.008482933044433594,
"learning_rate": 1.9982711541936167e-05,
"loss": 0.0024,
"reward": 5.7571070194244385,
"reward_std": 1.066563904285431,
"rewards/mrr_reward": 0.3015996962785721,
"rewards/rank_analyze_format_reward": 0.7728263139724731,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9660893976688385,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 640.421875,
"epoch": 1.984,
"grad_norm": 0.0321134515106678,
"kl": 0.010736465454101562,
"learning_rate": 1.9982563504649327e-05,
"loss": -0.0097,
"reward": 5.307972192764282,
"reward_std": 0.8248837888240814,
"rewards/mrr_reward": 0.1758122555911541,
"rewards/rank_analyze_format_reward": 0.7355825752019882,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 657.71875,
"epoch": 1.992,
"grad_norm": 0.030712375417351723,
"kl": 0.009143829345703125,
"learning_rate": 1.998241483680919e-05,
"loss": -0.0232,
"reward": 5.721879601478577,
"reward_std": 0.6676881909370422,
"rewards/mrr_reward": 0.2327939011156559,
"rewards/rank_analyze_format_reward": 0.8345478177070618,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.014133165590465069,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 678.15625,
"epoch": 2.0,
"grad_norm": 0.03419603407382965,
"kl": 0.010951995849609375,
"learning_rate": 1.9982265538425157e-05,
"loss": 0.026,
"reward": 5.291154146194458,
"reward_std": 0.8919351100921631,
"rewards/mrr_reward": 0.19232391379773617,
"rewards/rank_analyze_format_reward": 0.7730166912078857,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9964912384748459,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9652412384748459,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 671.21875,
"epoch": 2.008,
"grad_norm": 0.033153921365737915,
"kl": 0.010362625122070312,
"learning_rate": 1.9982115609506648e-05,
"loss": -0.0114,
"reward": 5.587164759635925,
"reward_std": 0.6778712831437588,
"rewards/mrr_reward": 0.20784350484609604,
"rewards/rank_analyze_format_reward": 0.8671189993619919,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 664.625,
"epoch": 2.016,
"grad_norm": 0.03214149549603462,
"kl": 0.00939178466796875,
"learning_rate": 1.9981965050063134e-05,
"loss": 0.0123,
"reward": 5.736871004104614,
"reward_std": 0.46378058195114136,
"rewards/mrr_reward": 0.2288566492497921,
"rewards/rank_analyze_format_reward": 0.8448817729949951,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 686.265625,
"epoch": 2.024,
"grad_norm": 0.031552232801914215,
"kl": 0.008321762084960938,
"learning_rate": 1.998181386010413e-05,
"loss": -0.0037,
"reward": 5.441847443580627,
"reward_std": 0.4925037622451782,
"rewards/mrr_reward": 0.16958706080913544,
"rewards/rank_analyze_format_reward": 0.83710116147995,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 674.078125,
"epoch": 2.032,
"grad_norm": 0.030457468703389168,
"kl": 0.0079193115234375,
"learning_rate": 1.9981662039639182e-05,
"loss": -0.0145,
"reward": 5.886712431907654,
"reward_std": 0.6978030279278755,
"rewards/mrr_reward": 0.29466763883829117,
"rewards/rank_analyze_format_reward": 0.8364828526973724,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 617.484375,
"epoch": 2.04,
"grad_norm": 0.033485788851976395,
"kl": 0.010133743286132812,
"learning_rate": 1.9981509588677883e-05,
"loss": -0.0244,
"reward": 5.449910879135132,
"reward_std": 0.9446172118186951,
"rewards/mrr_reward": 0.21412449702620506,
"rewards/rank_analyze_format_reward": 0.7960726916790009,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 639.15625,
"epoch": 2.048,
"grad_norm": 0.03260404244065285,
"kl": 0.011867523193359375,
"learning_rate": 1.9981356507229862e-05,
"loss": -0.0329,
"reward": 5.332253098487854,
"reward_std": 0.8807602822780609,
"rewards/mrr_reward": 0.2012710850685835,
"rewards/rank_analyze_format_reward": 0.7252326309680939,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9956946671009064,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9956946671009064,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 689.078125,
"epoch": 2.056,
"grad_norm": 0.03169155865907669,
"kl": 0.008187294006347656,
"learning_rate": 1.9981202795304787e-05,
"loss": -0.0265,
"reward": 5.487691640853882,
"reward_std": 0.5739990789443254,
"rewards/mrr_reward": 0.19051960110664368,
"rewards/rank_analyze_format_reward": 0.7804432064294815,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.009314903989434242,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 661.890625,
"epoch": 2.064,
"grad_norm": 0.03203202411532402,
"kl": 0.010366439819335938,
"learning_rate": 1.9981048452912364e-05,
"loss": -0.036,
"reward": 5.627110004425049,
"reward_std": 0.7383934706449509,
"rewards/mrr_reward": 0.21320684999227524,
"rewards/rank_analyze_format_reward": 0.8495243489742279,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 665.34375,
"epoch": 2.072,
"grad_norm": 0.031892187893390656,
"kl": 0.008783340454101562,
"learning_rate": 1.998089348006235e-05,
"loss": -0.0254,
"reward": 5.347836494445801,
"reward_std": 0.6561598926782608,
"rewards/mrr_reward": 0.16795635037124157,
"rewards/rank_analyze_format_reward": 0.8007026761770248,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 672.203125,
"epoch": 2.08,
"grad_norm": 0.02936052717268467,
"kl": 0.008337020874023438,
"learning_rate": 1.998073787676453e-05,
"loss": -0.0013,
"reward": 5.713126063346863,
"reward_std": 0.6685744076967239,
"rewards/mrr_reward": 0.2293526791036129,
"rewards/rank_analyze_format_reward": 0.8230589926242828,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 678.53125,
"epoch": 2.088,
"grad_norm": 0.03272569924592972,
"kl": 0.008749008178710938,
"learning_rate": 1.9980581643028732e-05,
"loss": -0.0006,
"reward": 5.963893890380859,
"reward_std": 0.8036399632692337,
"rewards/mrr_reward": 0.29525669291615486,
"rewards/rank_analyze_format_reward": 0.8180233091115952,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 689.125,
"epoch": 2.096,
"grad_norm": 0.032431650906801224,
"kl": 0.009164810180664062,
"learning_rate": 1.9980424778864825e-05,
"loss": 0.0079,
"reward": 5.4519102573394775,
"reward_std": 0.6584747061133385,
"rewards/mrr_reward": 0.17532242834568024,
"rewards/rank_analyze_format_reward": 0.8502301275730133,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 637.734375,
"epoch": 2.104,
"grad_norm": 0.033844608813524246,
"kl": 0.008821487426757812,
"learning_rate": 1.9980267284282718e-05,
"loss": -0.0156,
"reward": 5.51776909828186,
"reward_std": 0.661251924932003,
"rewards/mrr_reward": 0.21566840261220932,
"rewards/rank_analyze_format_reward": 0.7573655396699905,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 641.515625,
"epoch": 2.112,
"grad_norm": 0.03329053893685341,
"kl": 0.009187698364257812,
"learning_rate": 1.998010915929236e-05,
"loss": 0.0021,
"reward": 5.585361957550049,
"reward_std": 0.6540718153119087,
"rewards/mrr_reward": 0.20502851717174053,
"rewards/rank_analyze_format_reward": 0.8237265795469284,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 670.28125,
"epoch": 2.12,
"grad_norm": 0.03286939114332199,
"kl": 0.008611679077148438,
"learning_rate": 1.9979950403903732e-05,
"loss": -0.0075,
"reward": 5.393820524215698,
"reward_std": 0.587890163064003,
"rewards/mrr_reward": 0.1768353171646595,
"rewards/rank_analyze_format_reward": 0.795545905828476,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 651.484375,
"epoch": 2.128,
"grad_norm": 0.03381568565964699,
"kl": 0.009261131286621094,
"learning_rate": 1.9979791018126874e-05,
"loss": -0.0234,
"reward": 5.706428170204163,
"reward_std": 0.9137073345482349,
"rewards/mrr_reward": 0.25823412649333477,
"rewards/rank_analyze_format_reward": 0.8322576582431793,
"rewards/rank_answer_foramt_reward": 0.85546875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9967888593673706,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9967888593673706,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 664.21875,
"epoch": 2.136,
"grad_norm": 0.03102073445916176,
"kl": 0.0093231201171875,
"learning_rate": 1.9979631001971848e-05,
"loss": -0.0296,
"reward": 5.800906300544739,
"reward_std": 1.087342880666256,
"rewards/mrr_reward": 0.30744667910039425,
"rewards/rank_analyze_format_reward": 0.7718681544065475,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9826335161924362,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 662.421875,
"epoch": 2.144,
"grad_norm": 0.031345415860414505,
"kl": 0.008287429809570312,
"learning_rate": 1.9979470355448756e-05,
"loss": -0.0282,
"reward": 5.249317646026611,
"reward_std": 0.5138699784874916,
"rewards/mrr_reward": 0.144283227622509,
"rewards/rank_analyze_format_reward": 0.7632499039173126,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9818111509084702,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 673.96875,
"epoch": 2.152,
"grad_norm": 0.03057401068508625,
"kl": 0.00954437255859375,
"learning_rate": 1.9979309078567756e-05,
"loss": 0.0248,
"reward": 5.779935836791992,
"reward_std": 0.8248593732714653,
"rewards/mrr_reward": 0.28400298207998276,
"rewards/rank_analyze_format_reward": 0.7388757467269897,
"rewards/rank_answer_foramt_reward": 0.8984375,
"rewards/rank_contrast_format_reward": 0.014423076994717121,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 682.84375,
"epoch": 2.16,
"grad_norm": 0.02877761982381344,
"kl": 0.008660316467285156,
"learning_rate": 1.9979147171339022e-05,
"loss": 0.0112,
"reward": 5.953364610671997,
"reward_std": 1.0477607250213623,
"rewards/mrr_reward": 0.30703745037317276,
"rewards/rank_analyze_format_reward": 0.8443555235862732,
"rewards/rank_answer_foramt_reward": 0.896484375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 637.125,
"epoch": 2.168,
"grad_norm": 0.03201289847493172,
"kl": 0.011377334594726562,
"learning_rate": 1.9978984633772795e-05,
"loss": -0.0223,
"reward": 5.34253454208374,
"reward_std": 1.0002544522285461,
"rewards/mrr_reward": 0.2072482742369175,
"rewards/rank_analyze_format_reward": 0.7518228143453598,
"rewards/rank_answer_foramt_reward": 0.81640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 660.765625,
"epoch": 2.176,
"grad_norm": 0.033372726291418076,
"kl": 0.008876800537109375,
"learning_rate": 1.9978821465879332e-05,
"loss": 0.0035,
"reward": 5.907817602157593,
"reward_std": 0.8581305295228958,
"rewards/mrr_reward": 0.3209015391767025,
"rewards/rank_analyze_format_reward": 0.7235125303268433,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 638.609375,
"epoch": 2.184,
"grad_norm": 0.03363556042313576,
"kl": 0.009851455688476562,
"learning_rate": 1.9978657667668945e-05,
"loss": 0.0006,
"reward": 5.586699724197388,
"reward_std": 0.679840974509716,
"rewards/mrr_reward": 0.22039930522441864,
"rewards/rank_analyze_format_reward": 0.7949463129043579,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 652.359375,
"epoch": 2.192,
"grad_norm": 0.030939241871237755,
"kl": 0.008955001831054688,
"learning_rate": 1.9978493239151976e-05,
"loss": 0.0165,
"reward": 5.984506607055664,
"reward_std": 1.1070766001939774,
"rewards/mrr_reward": 0.35327382013201714,
"rewards/rank_analyze_format_reward": 0.7415075749158859,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9794049561023712,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9794049561023712,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 621.21875,
"epoch": 2.2,
"grad_norm": 0.03531678020954132,
"kl": 0.008274078369140625,
"learning_rate": 1.997832818033881e-05,
"loss": -0.0297,
"reward": 5.348074316978455,
"reward_std": 0.7185068726539612,
"rewards/mrr_reward": 0.1775855701416731,
"rewards/rank_analyze_format_reward": 0.7607788443565369,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 666.109375,
"epoch": 2.208,
"grad_norm": 0.03195611387491226,
"kl": 0.00879669189453125,
"learning_rate": 1.9978162491239882e-05,
"loss": -0.0057,
"reward": 5.110758066177368,
"reward_std": 0.6071035340428352,
"rewards/mrr_reward": 0.13315972313284874,
"rewards/rank_analyze_format_reward": 0.7337524592876434,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 661.46875,
"epoch": 2.216,
"grad_norm": 0.03165001794695854,
"kl": 0.009108543395996094,
"learning_rate": 1.997799617186565e-05,
"loss": 0.0002,
"reward": 5.748872637748718,
"reward_std": 0.6040460020303726,
"rewards/mrr_reward": 0.23888888955116272,
"rewards/rank_analyze_format_reward": 0.8343324214220047,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 650.9375,
"epoch": 2.224,
"grad_norm": 0.03219401836395264,
"kl": 0.008958816528320312,
"learning_rate": 1.9977829222226622e-05,
"loss": -0.0102,
"reward": 6.228976130485535,
"reward_std": 1.0323386192321777,
"rewards/mrr_reward": 0.3954737111926079,
"rewards/rank_analyze_format_reward": 0.7388781309127808,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 650.625,
"epoch": 2.232,
"grad_norm": 0.03455818444490433,
"kl": 0.008190155029296875,
"learning_rate": 1.9977661642333344e-05,
"loss": -0.0149,
"reward": 5.373299837112427,
"reward_std": 0.8532019183039665,
"rewards/mrr_reward": 0.17197420820593834,
"rewards/rank_analyze_format_reward": 0.8064966201782227,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 668.390625,
"epoch": 2.24,
"grad_norm": 0.029518628492951393,
"kl": 0.008675575256347656,
"learning_rate": 1.99774934321964e-05,
"loss": 0.0097,
"reward": 5.885105848312378,
"reward_std": 0.8752723336219788,
"rewards/mrr_reward": 0.29821430146694183,
"rewards/rank_analyze_format_reward": 0.8387329578399658,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 695.234375,
"epoch": 2.248,
"grad_norm": 0.0311732180416584,
"kl": 0.00806427001953125,
"learning_rate": 1.9977324591826415e-05,
"loss": 0.0088,
"reward": 5.816622257232666,
"reward_std": 0.8411147147417068,
"rewards/mrr_reward": 0.2692398317158222,
"rewards/rank_analyze_format_reward": 0.8213856071233749,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 662.3125,
"epoch": 2.2560000000000002,
"grad_norm": 0.032965514808893204,
"kl": 0.008653640747070312,
"learning_rate": 1.9977155121234056e-05,
"loss": -0.0151,
"reward": 5.504308104515076,
"reward_std": 0.7789564803242683,
"rewards/mrr_reward": 0.2098772320896387,
"rewards/rank_analyze_format_reward": 0.7714656293392181,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9954948574304581,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9954948574304581,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 658.296875,
"epoch": 2.2640000000000002,
"grad_norm": 0.033497847616672516,
"kl": 0.009673118591308594,
"learning_rate": 1.9976985020430022e-05,
"loss": -0.0045,
"reward": 6.024387836456299,
"reward_std": 0.6360235512256622,
"rewards/mrr_reward": 0.32351189479231834,
"rewards/rank_analyze_format_reward": 0.8103033602237701,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 692.0,
"epoch": 2.2720000000000002,
"grad_norm": 0.03325760364532471,
"kl": 0.008056640625,
"learning_rate": 1.9976814289425066e-05,
"loss": 0.0057,
"reward": 5.442414402961731,
"reward_std": 0.6741410419344902,
"rewards/mrr_reward": 0.16527777537703514,
"rewards/rank_analyze_format_reward": 0.8516157567501068,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 622.953125,
"epoch": 2.2800000000000002,
"grad_norm": 0.035167232155799866,
"kl": 0.009555816650390625,
"learning_rate": 1.9976642928229965e-05,
"loss": 0.0079,
"reward": 5.635341167449951,
"reward_std": 0.738533541560173,
"rewards/mrr_reward": 0.22927208244800568,
"rewards/rank_analyze_format_reward": 0.8002839833498001,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 689.6875,
"epoch": 2.288,
"grad_norm": 0.031026914715766907,
"kl": 0.009326934814453125,
"learning_rate": 1.997647093685555e-05,
"loss": -0.0035,
"reward": 6.1463258266448975,
"reward_std": 0.7740766424685717,
"rewards/mrr_reward": 0.34895833767950535,
"rewards/rank_analyze_format_reward": 0.8179265707731247,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975328892469406,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9975328892469406,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 661.875,
"epoch": 2.296,
"grad_norm": 0.03047414869070053,
"kl": 0.0081024169921875,
"learning_rate": 1.9976298315312675e-05,
"loss": -0.0073,
"reward": 5.633584260940552,
"reward_std": 0.7135986983776093,
"rewards/mrr_reward": 0.22126116044819355,
"rewards/rank_analyze_format_reward": 0.816898986697197,
"rewards/rank_answer_foramt_reward": 0.939453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 622.734375,
"epoch": 2.304,
"grad_norm": 0.033806800842285156,
"kl": 0.008437156677246094,
"learning_rate": 1.9976125063612254e-05,
"loss": -0.0368,
"reward": 5.522616624832153,
"reward_std": 1.1119669452309608,
"rewards/mrr_reward": 0.2499070018529892,
"rewards/rank_analyze_format_reward": 0.6830301284790039,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 653.484375,
"epoch": 2.312,
"grad_norm": 0.031600214540958405,
"kl": 0.008672714233398438,
"learning_rate": 1.9975951181765226e-05,
"loss": -0.0142,
"reward": 5.305689573287964,
"reward_std": 0.5672206580638885,
"rewards/mrr_reward": 0.1442832387983799,
"rewards/rank_analyze_format_reward": 0.7910565435886383,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 623.421875,
"epoch": 2.32,
"grad_norm": 0.08158010989427567,
"kl": 0.02387237548828125,
"learning_rate": 1.9975776669782572e-05,
"loss": -0.0073,
"reward": 5.95476496219635,
"reward_std": 1.296246200799942,
"rewards/mrr_reward": 0.35463789105415344,
"rewards/rank_analyze_format_reward": 0.7332646250724792,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9962009787559509,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9962009787559509,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 625.859375,
"epoch": 2.328,
"grad_norm": 0.031481146812438965,
"kl": 0.008309364318847656,
"learning_rate": 1.997560152767532e-05,
"loss": -0.0067,
"reward": 5.631876707077026,
"reward_std": 0.6546346843242645,
"rewards/mrr_reward": 0.23791542649269104,
"rewards/rank_analyze_format_reward": 0.7969877421855927,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.013663419522345066,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9670085161924362,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 623.84375,
"epoch": 2.336,
"grad_norm": 0.037191130220890045,
"kl": 0.007427215576171875,
"learning_rate": 1.997542575545453e-05,
"loss": -0.0003,
"reward": 5.354902744293213,
"reward_std": 0.7260187715291977,
"rewards/mrr_reward": 0.19120163097977638,
"rewards/rank_analyze_format_reward": 0.7232708260416985,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.994936153292656,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.994936153292656,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 654.140625,
"epoch": 2.344,
"grad_norm": 0.030975518748164177,
"kl": 0.006802558898925781,
"learning_rate": 1.9975249353131304e-05,
"loss": 0.0211,
"reward": 6.162993669509888,
"reward_std": 0.28368850238621235,
"rewards/mrr_reward": 0.3227430731058121,
"rewards/rank_analyze_format_reward": 0.8875313103199005,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 687.78125,
"epoch": 2.352,
"grad_norm": 0.03459201753139496,
"kl": 0.011088371276855469,
"learning_rate": 1.9975072320716785e-05,
"loss": 0.0123,
"reward": 5.5102492570877075,
"reward_std": 1.0369550734758377,
"rewards/mrr_reward": 0.25905878841876984,
"rewards/rank_analyze_format_reward": 0.7301295399665833,
"rewards/rank_answer_foramt_reward": 0.826171875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9979188442230225,
"rewards/rank_overall_format_reward_more": 0.9375,
"rewards/rank_verify_format_reward": 0.9822938442230225,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 627.734375,
"epoch": 2.36,
"grad_norm": 0.031542547047138214,
"kl": 0.008196830749511719,
"learning_rate": 1.997489465822216e-05,
"loss": 0.0003,
"reward": 6.191069960594177,
"reward_std": 0.9258007109165192,
"rewards/mrr_reward": 0.3952629007399082,
"rewards/rank_analyze_format_reward": 0.7525966763496399,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 638.3125,
"epoch": 2.368,
"grad_norm": 0.03178432211279869,
"kl": 0.008670806884765625,
"learning_rate": 1.9974716365658646e-05,
"loss": -0.0171,
"reward": 5.212967276573181,
"reward_std": 0.41466130316257477,
"rewards/mrr_reward": 0.1444692499935627,
"rewards/rank_analyze_format_reward": 0.7190747410058975,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 617.453125,
"epoch": 2.376,
"grad_norm": 0.03690677136182785,
"kl": 0.008281707763671875,
"learning_rate": 1.9974537443037504e-05,
"loss": -0.0293,
"reward": 5.413174152374268,
"reward_std": 0.7004074454307556,
"rewards/mrr_reward": 0.18989335373044014,
"rewards/rank_analyze_format_reward": 0.798017293214798,
"rewards/rank_answer_foramt_reward": 0.857421875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 677.140625,
"epoch": 2.384,
"grad_norm": 0.03291052579879761,
"kl": 0.008054733276367188,
"learning_rate": 1.9974357890370038e-05,
"loss": -0.0051,
"reward": 5.8681100606918335,
"reward_std": 0.7517407834529877,
"rewards/mrr_reward": 0.2819134518504143,
"rewards/rank_analyze_format_reward": 0.8143666237592697,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 662.84375,
"epoch": 2.392,
"grad_norm": 0.03135626018047333,
"kl": 0.00891876220703125,
"learning_rate": 1.9974177707667594e-05,
"loss": -0.0061,
"reward": 5.645070552825928,
"reward_std": 0.483820416033268,
"rewards/mrr_reward": 0.22468998655676842,
"rewards/rank_analyze_format_reward": 0.7951385527849197,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 630.03125,
"epoch": 2.4,
"grad_norm": 0.03234223648905754,
"kl": 0.008310317993164062,
"learning_rate": 1.9973996894941545e-05,
"loss": -0.0066,
"reward": 6.414054274559021,
"reward_std": 0.9432376772165298,
"rewards/mrr_reward": 0.40456970781087875,
"rewards/rank_analyze_format_reward": 0.8504632115364075,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 619.9375,
"epoch": 2.408,
"grad_norm": 0.03262539207935333,
"kl": 0.0074558258056640625,
"learning_rate": 1.9973815452203314e-05,
"loss": -0.0375,
"reward": 5.410915374755859,
"reward_std": 0.6377813890576363,
"rewards/mrr_reward": 0.2176587451249361,
"rewards/rank_analyze_format_reward": 0.7108280807733536,
"rewards/rank_answer_foramt_reward": 0.845703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9957809001207352,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9957809001207352,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 649.0,
"epoch": 2.416,
"grad_norm": 0.03086378425359726,
"kl": 0.008134841918945312,
"learning_rate": 1.997363337946437e-05,
"loss": -0.0071,
"reward": 5.332266926765442,
"reward_std": 0.7226725369691849,
"rewards/mrr_reward": 0.16749132610857487,
"rewards/rank_analyze_format_reward": 0.7792825102806091,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9961971044540405,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9961971044540405,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 638.296875,
"epoch": 2.424,
"grad_norm": 0.03537590056657791,
"kl": 0.008157730102539062,
"learning_rate": 1.9973450676736205e-05,
"loss": -0.0204,
"reward": 5.607993245124817,
"reward_std": 0.5448657497763634,
"rewards/mrr_reward": 0.2038070484995842,
"rewards/rank_analyze_format_reward": 0.8434313237667084,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 651.46875,
"epoch": 2.432,
"grad_norm": 0.030700810253620148,
"kl": 0.008941650390625,
"learning_rate": 1.997326734403036e-05,
"loss": -0.0217,
"reward": 5.499999642372131,
"reward_std": 0.7909096032381058,
"rewards/mrr_reward": 0.20515872910618782,
"rewards/rank_analyze_format_reward": 0.8101091831922531,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 661.234375,
"epoch": 2.44,
"grad_norm": 0.031224045902490616,
"kl": 0.007833480834960938,
"learning_rate": 1.997308338135842e-05,
"loss": 0.0108,
"reward": 5.637946367263794,
"reward_std": 0.6872468590736389,
"rewards/mrr_reward": 0.2270585335791111,
"rewards/rank_analyze_format_reward": 0.8155348151922226,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 626.90625,
"epoch": 2.448,
"grad_norm": 0.034319475293159485,
"kl": 0.009645462036132812,
"learning_rate": 1.9972898788732e-05,
"loss": -0.0273,
"reward": 5.709458708763123,
"reward_std": 0.8589234948158264,
"rewards/mrr_reward": 0.2706225086003542,
"rewards/rank_analyze_format_reward": 0.7733380496501923,
"rewards/rank_answer_foramt_reward": 0.86328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 673.84375,
"epoch": 2.456,
"grad_norm": 0.03254551440477371,
"kl": 0.00882720947265625,
"learning_rate": 1.9972713566162763e-05,
"loss": -0.0144,
"reward": 5.992625951766968,
"reward_std": 0.5823017284274101,
"rewards/mrr_reward": 0.2946366611868143,
"rewards/rank_analyze_format_reward": 0.8765792399644852,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 648.28125,
"epoch": 2.464,
"grad_norm": 0.03191380575299263,
"kl": 0.008279800415039062,
"learning_rate": 1.997252771366241e-05,
"loss": -0.0266,
"reward": 5.676502346992493,
"reward_std": 0.8015426993370056,
"rewards/mrr_reward": 0.2450086809694767,
"rewards/rank_analyze_format_reward": 0.7492017894983292,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.984375,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 652.546875,
"epoch": 2.472,
"grad_norm": 0.03314831480383873,
"kl": 0.008116722106933594,
"learning_rate": 1.9972341231242675e-05,
"loss": -0.0008,
"reward": 5.710240483283997,
"reward_std": 0.6632269471883774,
"rewards/mrr_reward": 0.2543836794793606,
"rewards/rank_analyze_format_reward": 0.7747371196746826,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 642.03125,
"epoch": 2.48,
"grad_norm": 0.03377654030919075,
"kl": 0.009763717651367188,
"learning_rate": 1.9972154118915344e-05,
"loss": -0.0154,
"reward": 5.755140542984009,
"reward_std": 0.6953508257865906,
"rewards/mrr_reward": 0.27708953991532326,
"rewards/rank_analyze_format_reward": 0.699516773223877,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 647.53125,
"epoch": 2.488,
"grad_norm": 0.03434586524963379,
"kl": 0.009517669677734375,
"learning_rate": 1.997196637669223e-05,
"loss": -0.0056,
"reward": 5.399365782737732,
"reward_std": 0.6744739785790443,
"rewards/mrr_reward": 0.1757440436631441,
"rewards/rank_analyze_format_reward": 0.8096708953380585,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 615.125,
"epoch": 2.496,
"grad_norm": 0.03466728329658508,
"kl": 0.010303497314453125,
"learning_rate": 1.99717780045852e-05,
"loss": -0.0225,
"reward": 5.5405789613723755,
"reward_std": 0.7150269001722336,
"rewards/mrr_reward": 0.21937625110149384,
"rewards/rank_analyze_format_reward": 0.7294801473617554,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 664.421875,
"epoch": 2.504,
"grad_norm": 0.030946679413318634,
"kl": 0.007822036743164062,
"learning_rate": 1.997158900260614e-05,
"loss": -0.0271,
"reward": 5.6557512283325195,
"reward_std": 0.5378784239292145,
"rewards/mrr_reward": 0.21952505223453045,
"rewards/rank_analyze_format_reward": 0.799135148525238,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 670.078125,
"epoch": 2.512,
"grad_norm": 0.03168868273496628,
"kl": 0.007956504821777344,
"learning_rate": 1.9971399370767e-05,
"loss": -0.0138,
"reward": 5.643744587898254,
"reward_std": 0.6119559705257416,
"rewards/mrr_reward": 0.21652406081557274,
"rewards/rank_analyze_format_reward": 0.8040668964385986,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975329041481018,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9975329041481018,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 644.828125,
"epoch": 2.52,
"grad_norm": 0.03241531923413277,
"kl": 0.009516716003417969,
"learning_rate": 1.9971209109079752e-05,
"loss": -0.0025,
"reward": 5.7852044105529785,
"reward_std": 0.7106733173131943,
"rewards/mrr_reward": 0.2660466283559799,
"rewards/rank_analyze_format_reward": 0.7932835072278976,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 659.78125,
"epoch": 2.528,
"grad_norm": 0.032261837273836136,
"kl": 0.009158134460449219,
"learning_rate": 1.9971018217556416e-05,
"loss": -0.0131,
"reward": 5.741572380065918,
"reward_std": 0.9340634196996689,
"rewards/mrr_reward": 0.252250749617815,
"rewards/rank_analyze_format_reward": 0.8470719158649445,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 633.46875,
"epoch": 2.536,
"grad_norm": 0.03570343554019928,
"kl": 0.008861541748046875,
"learning_rate": 1.997082669620905e-05,
"loss": -0.0283,
"reward": 5.575627684593201,
"reward_std": 0.59528449177742,
"rewards/mrr_reward": 0.20295760035514832,
"rewards/rank_analyze_format_reward": 0.814463660120964,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 666.09375,
"epoch": 2.544,
"grad_norm": 0.031063677743077278,
"kl": 0.007955551147460938,
"learning_rate": 1.997063454504975e-05,
"loss": -0.0086,
"reward": 5.3937273025512695,
"reward_std": 0.3589708264917135,
"rewards/mrr_reward": 0.14628596417605877,
"rewards/rank_analyze_format_reward": 0.853196918964386,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 633.078125,
"epoch": 2.552,
"grad_norm": 0.03542445972561836,
"kl": 0.008488655090332031,
"learning_rate": 1.9970441764090654e-05,
"loss": 0.0057,
"reward": 5.481135725975037,
"reward_std": 0.5193400681018829,
"rewards/mrr_reward": 0.1921502985060215,
"rewards/rank_analyze_format_reward": 0.7550382316112518,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 625.046875,
"epoch": 2.56,
"grad_norm": 0.033270470798015594,
"kl": 0.010507583618164062,
"learning_rate": 1.9970248353343943e-05,
"loss": -0.0402,
"reward": 5.634747266769409,
"reward_std": 0.6059275269508362,
"rewards/mrr_reward": 0.22289186716079712,
"rewards/rank_analyze_format_reward": 0.8007473796606064,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985600560903549,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9985600560903549,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 637.25,
"epoch": 2.568,
"grad_norm": 0.03648350015282631,
"kl": 0.009977340698242188,
"learning_rate": 1.997005431282183e-05,
"loss": -0.0263,
"reward": 5.567351460456848,
"reward_std": 0.9927941262722015,
"rewards/mrr_reward": 0.21014384925365448,
"rewards/rank_analyze_format_reward": 0.795135423541069,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 694.09375,
"epoch": 2.576,
"grad_norm": 0.030762799084186554,
"kl": 0.009020805358886719,
"learning_rate": 1.996985964253657e-05,
"loss": -0.0093,
"reward": 5.783640742301941,
"reward_std": 0.7187513560056686,
"rewards/mrr_reward": 0.2635354623198509,
"rewards/rank_analyze_format_reward": 0.8368059247732162,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 671.828125,
"epoch": 2.584,
"grad_norm": 0.034205637872219086,
"kl": 0.008504867553710938,
"learning_rate": 1.996966434250046e-05,
"loss": 0.0016,
"reward": 5.436808228492737,
"reward_std": 0.7342798858880997,
"rewards/mrr_reward": 0.18624752573668957,
"rewards/rank_analyze_format_reward": 0.8596720993518829,
"rewards/rank_answer_foramt_reward": 0.833984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 659.828125,
"epoch": 2.592,
"grad_norm": 0.03484297916293144,
"kl": 0.011442184448242188,
"learning_rate": 1.996946841272584e-05,
"loss": -0.0141,
"reward": 5.498148679733276,
"reward_std": 0.6755934655666351,
"rewards/mrr_reward": 0.1958395354449749,
"rewards/rank_analyze_format_reward": 0.8358840942382812,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 649.140625,
"epoch": 2.6,
"grad_norm": 0.032932061702013016,
"kl": 0.00991058349609375,
"learning_rate": 1.9969271853225083e-05,
"loss": -0.0066,
"reward": 5.963220715522766,
"reward_std": 0.5944485515356064,
"rewards/mrr_reward": 0.29027777537703514,
"rewards/rank_analyze_format_reward": 0.8310981541872025,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 671.640625,
"epoch": 2.608,
"grad_norm": 0.032074637711048126,
"kl": 0.008647918701171875,
"learning_rate": 1.9969074664010605e-05,
"loss": 0.0031,
"reward": 5.655932188034058,
"reward_std": 0.41921700816601515,
"rewards/mrr_reward": 0.20592138171195984,
"rewards/rank_analyze_format_reward": 0.8595906496047974,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 641.78125,
"epoch": 2.616,
"grad_norm": 0.033006053417921066,
"kl": 0.00919342041015625,
"learning_rate": 1.9968876845094864e-05,
"loss": 0.0,
"reward": 5.528472542762756,
"reward_std": 0.41449059918522835,
"rewards/mrr_reward": 0.17598586156964302,
"rewards/rank_analyze_format_reward": 0.8398456275463104,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 666.734375,
"epoch": 2.624,
"grad_norm": 0.034119799733161926,
"kl": 0.009557723999023438,
"learning_rate": 1.996867839649035e-05,
"loss": -0.0152,
"reward": 5.444994330406189,
"reward_std": 0.7444438338279724,
"rewards/mrr_reward": 0.18702257797122002,
"rewards/rank_analyze_format_reward": 0.7671014666557312,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 649.71875,
"epoch": 2.632,
"grad_norm": 0.034841958433389664,
"kl": 0.009944915771484375,
"learning_rate": 1.9968479318209603e-05,
"loss": 0.0103,
"reward": 6.070975661277771,
"reward_std": 1.140429526567459,
"rewards/mrr_reward": 0.3466765880584717,
"rewards/rank_analyze_format_reward": 0.8070079386234283,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 700.140625,
"epoch": 2.64,
"grad_norm": 0.03080068528652191,
"kl": 0.009492874145507812,
"learning_rate": 1.9968279610265194e-05,
"loss": 0.0229,
"reward": 5.357762455940247,
"reward_std": 0.6760208085179329,
"rewards/mrr_reward": 0.16169394925236702,
"rewards/rank_analyze_format_reward": 0.8436842709779739,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 655.25,
"epoch": 2.648,
"grad_norm": 0.03430721163749695,
"kl": 0.008755683898925781,
"learning_rate": 1.9968079272669744e-05,
"loss": 0.0123,
"reward": 5.500509142875671,
"reward_std": 0.5423839017748833,
"rewards/mrr_reward": 0.19489708170294762,
"rewards/rank_analyze_format_reward": 0.8066286146640778,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 616.5625,
"epoch": 2.656,
"grad_norm": 0.03663242235779762,
"kl": 0.011625289916992188,
"learning_rate": 1.9967878305435902e-05,
"loss": -0.0071,
"reward": 5.21767783164978,
"reward_std": 0.6230617165565491,
"rewards/mrr_reward": 0.15186012163758278,
"rewards/rank_analyze_format_reward": 0.7214507311582565,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 671.84375,
"epoch": 2.664,
"grad_norm": 0.032446544617414474,
"kl": 0.010744094848632812,
"learning_rate": 1.9967676708576362e-05,
"loss": -0.0252,
"reward": 5.431139588356018,
"reward_std": 0.38856903836131096,
"rewards/mrr_reward": 0.1630394347012043,
"rewards/rank_analyze_format_reward": 0.820786789059639,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.014835858717560768,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 662.96875,
"epoch": 2.672,
"grad_norm": 0.03309661149978638,
"kl": 0.00853729248046875,
"learning_rate": 1.9967474482103863e-05,
"loss": 0.0207,
"reward": 5.812266111373901,
"reward_std": 0.6310017332434654,
"rewards/mrr_reward": 0.2564794160425663,
"rewards/rank_analyze_format_reward": 0.8384659141302109,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 638.546875,
"epoch": 2.68,
"grad_norm": 0.032147496938705444,
"kl": 0.009319305419921875,
"learning_rate": 1.996727162603117e-05,
"loss": -0.0164,
"reward": 5.495377421379089,
"reward_std": 0.5436971858143806,
"rewards/mrr_reward": 0.22277406230568886,
"rewards/rank_analyze_format_reward": 0.6531094089150429,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 671.296875,
"epoch": 2.6879999999999997,
"grad_norm": 0.03020451031625271,
"kl": 0.008257865905761719,
"learning_rate": 1.9967068140371103e-05,
"loss": -0.023,
"reward": 6.15105414390564,
"reward_std": 0.7604061029851437,
"rewards/mrr_reward": 0.35879215970635414,
"rewards/rank_analyze_format_reward": 0.7471354156732559,
"rewards/rank_answer_foramt_reward": 0.984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 669.15625,
"epoch": 2.6959999999999997,
"grad_norm": 0.03703196346759796,
"kl": 0.009157180786132812,
"learning_rate": 1.9966864025136518e-05,
"loss": 0.0097,
"reward": 5.549162268638611,
"reward_std": 0.5662261173129082,
"rewards/mrr_reward": 0.1858568899333477,
"rewards/rank_analyze_format_reward": 0.8500397950410843,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 647.375,
"epoch": 2.7039999999999997,
"grad_norm": 0.03430277109146118,
"kl": 0.009218215942382812,
"learning_rate": 1.99666592803403e-05,
"loss": 0.0008,
"reward": 6.126144886016846,
"reward_std": 0.626649871468544,
"rewards/mrr_reward": 0.3454737141728401,
"rewards/rank_analyze_format_reward": 0.7743111848831177,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998641312122345,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.998641312122345,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 688.765625,
"epoch": 2.7119999999999997,
"grad_norm": 0.03879823163151741,
"kl": 0.008647918701171875,
"learning_rate": 1.9966453905995386e-05,
"loss": 0.0293,
"reward": 5.698531866073608,
"reward_std": 0.5574172139167786,
"rewards/mrr_reward": 0.22999751940369606,
"rewards/rank_analyze_format_reward": 0.8449480086565018,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 631.390625,
"epoch": 2.7199999999999998,
"grad_norm": 0.034662626683712006,
"kl": 0.009153366088867188,
"learning_rate": 1.996624790211475e-05,
"loss": -0.0002,
"reward": 6.773137092590332,
"reward_std": 0.912096843123436,
"rewards/mrr_reward": 0.500465027987957,
"rewards/rank_analyze_format_reward": 0.7869019955396652,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 642.796875,
"epoch": 2.7279999999999998,
"grad_norm": 0.035187844187021255,
"kl": 0.009860992431640625,
"learning_rate": 1.9966041268711404e-05,
"loss": -0.001,
"reward": 6.187240362167358,
"reward_std": 1.2494878768920898,
"rewards/mrr_reward": 0.3659474216401577,
"rewards/rank_analyze_format_reward": 0.8201817274093628,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975329041481018,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9975329041481018,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 616.015625,
"epoch": 2.7359999999999998,
"grad_norm": 0.03603256121277809,
"kl": 0.009283065795898438,
"learning_rate": 1.9965834005798395e-05,
"loss": -0.0136,
"reward": 5.895509123802185,
"reward_std": 0.9243122488260269,
"rewards/mrr_reward": 0.2991505488753319,
"rewards/rank_analyze_format_reward": 0.7767235189676285,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 652.671875,
"epoch": 2.7439999999999998,
"grad_norm": 0.032031431794166565,
"kl": 0.007723808288574219,
"learning_rate": 1.9965626113388823e-05,
"loss": 0.004,
"reward": 5.493781566619873,
"reward_std": 0.8891884908080101,
"rewards/mrr_reward": 0.2125496082007885,
"rewards/rank_analyze_format_reward": 0.7606558352708817,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 647.375,
"epoch": 2.752,
"grad_norm": 0.03751445189118385,
"kl": 0.009563446044921875,
"learning_rate": 1.9965417591495813e-05,
"loss": -0.0398,
"reward": 5.696443438529968,
"reward_std": 0.8934725448489189,
"rewards/mrr_reward": 0.2595982141792774,
"rewards/rank_analyze_format_reward": 0.8279723674058914,
"rewards/rank_answer_foramt_reward": 0.876953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 666.75,
"epoch": 2.76,
"grad_norm": 0.03237143158912659,
"kl": 0.0087738037109375,
"learning_rate": 1.9965208440132538e-05,
"loss": 0.0013,
"reward": 5.7427204847335815,
"reward_std": 0.41047997772693634,
"rewards/mrr_reward": 0.23925472237169743,
"rewards/rank_analyze_format_reward": 0.8612567484378815,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 658.296875,
"epoch": 2.768,
"grad_norm": 0.03239751234650612,
"kl": 0.00971221923828125,
"learning_rate": 1.9964998659312212e-05,
"loss": -0.0036,
"reward": 6.0654884576797485,
"reward_std": 1.099491998553276,
"rewards/mrr_reward": 0.3638826832175255,
"rewards/rank_analyze_format_reward": 0.7552082240581512,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9810855388641357,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9810855388641357,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 656.46875,
"epoch": 2.776,
"grad_norm": 0.032796673476696014,
"kl": 0.008184432983398438,
"learning_rate": 1.996478824904808e-05,
"loss": -0.0104,
"reward": 5.7753273248672485,
"reward_std": 0.6123448684811592,
"rewards/mrr_reward": 0.2482638955116272,
"rewards/rank_analyze_format_reward": 0.7992332726716995,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 637.734375,
"epoch": 2.784,
"grad_norm": 0.03421541303396225,
"kl": 0.009485244750976562,
"learning_rate": 1.9964577209353438e-05,
"loss": -0.0268,
"reward": 5.9183748960494995,
"reward_std": 0.7210484445095062,
"rewards/mrr_reward": 0.29007937386631966,
"rewards/rank_analyze_format_reward": 0.8342294245958328,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 591.3125,
"epoch": 2.792,
"grad_norm": 0.037076905369758606,
"kl": 0.008688926696777344,
"learning_rate": 1.9964365540241614e-05,
"loss": -0.0433,
"reward": 5.8531190156936646,
"reward_std": 0.9352162629365921,
"rewards/mrr_reward": 0.3134424611926079,
"rewards/rank_analyze_format_reward": 0.6715000495314598,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 628.421875,
"epoch": 2.8,
"grad_norm": 0.035160817205905914,
"kl": 0.009990692138671875,
"learning_rate": 1.9964153241725984e-05,
"loss": -0.0108,
"reward": 6.071021556854248,
"reward_std": 0.7491893395781517,
"rewards/mrr_reward": 0.34606895968317986,
"rewards/rank_analyze_format_reward": 0.7609644383192062,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 657.671875,
"epoch": 2.808,
"grad_norm": 0.032391324639320374,
"kl": 0.008762359619140625,
"learning_rate": 1.996394031381995e-05,
"loss": -0.0273,
"reward": 5.768019914627075,
"reward_std": 0.6652617454528809,
"rewards/mrr_reward": 0.24676959216594696,
"rewards/rank_analyze_format_reward": 0.8082853406667709,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 667.265625,
"epoch": 2.816,
"grad_norm": 0.030825432389974594,
"kl": 0.008817672729492188,
"learning_rate": 1.996372675653696e-05,
"loss": -0.0243,
"reward": 5.394962310791016,
"reward_std": 0.48561568558216095,
"rewards/mrr_reward": 0.14479166641831398,
"rewards/rank_analyze_format_reward": 0.8704832494258881,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 653.015625,
"epoch": 2.824,
"grad_norm": 0.035048164427280426,
"kl": 0.008731842041015625,
"learning_rate": 1.9963512569890512e-05,
"loss": -0.0201,
"reward": 5.514656662940979,
"reward_std": 0.5665445066988468,
"rewards/mrr_reward": 0.19130083918571472,
"rewards/rank_analyze_format_reward": 0.8020728975534439,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 636.859375,
"epoch": 2.832,
"grad_norm": 0.03681398183107376,
"kl": 0.009633064270019531,
"learning_rate": 1.9963297753894134e-05,
"loss": 0.0131,
"reward": 5.816080689430237,
"reward_std": 0.7230090275406837,
"rewards/mrr_reward": 0.26795635372400284,
"rewards/rank_analyze_format_reward": 0.8138361871242523,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974361509084702,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9974361509084702,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 673.0625,
"epoch": 2.84,
"grad_norm": 0.03574398159980774,
"kl": 0.009863853454589844,
"learning_rate": 1.9963082308561386e-05,
"loss": -0.0167,
"reward": 5.45677387714386,
"reward_std": 0.7667002454400063,
"rewards/mrr_reward": 0.19638517871499062,
"rewards/rank_analyze_format_reward": 0.8426849991083145,
"rewards/rank_answer_foramt_reward": 0.83203125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 618.8125,
"epoch": 2.848,
"grad_norm": 0.03704219311475754,
"kl": 0.009098052978515625,
"learning_rate": 1.9962866233905887e-05,
"loss": -0.0226,
"reward": 5.530822277069092,
"reward_std": 0.7445577755570412,
"rewards/mrr_reward": 0.19813368655741215,
"rewards/rank_analyze_format_reward": 0.837896928191185,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 651.765625,
"epoch": 2.856,
"grad_norm": 0.034613966941833496,
"kl": 0.00957489013671875,
"learning_rate": 1.9962649529941283e-05,
"loss": 0.0124,
"reward": 5.686863660812378,
"reward_std": 0.8740081563591957,
"rewards/mrr_reward": 0.23311011120676994,
"rewards/rank_analyze_format_reward": 0.862964078783989,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.014082618057727814,
"rewards/rank_initial_format_reward": 0.9826335161924362,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 648.453125,
"epoch": 2.864,
"grad_norm": 0.033967334777116776,
"kl": 0.00858306884765625,
"learning_rate": 1.996243219668126e-05,
"loss": -0.0098,
"reward": 5.471218466758728,
"reward_std": 0.6651558130979538,
"rewards/mrr_reward": 0.20168030634522438,
"rewards/rank_analyze_format_reward": 0.7719189673662186,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 1.0,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 643.296875,
"epoch": 2.872,
"grad_norm": 0.033462896943092346,
"kl": 0.0072383880615234375,
"learning_rate": 1.996221423413954e-05,
"loss": -0.0058,
"reward": 6.096756100654602,
"reward_std": 0.7008651196956635,
"rewards/mrr_reward": 0.3258432671427727,
"rewards/rank_analyze_format_reward": 0.8519767969846725,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 643.25,
"epoch": 2.88,
"grad_norm": 0.03083069995045662,
"kl": 0.008129119873046875,
"learning_rate": 1.9961995642329905e-05,
"loss": -0.0077,
"reward": 5.6521806716918945,
"reward_std": 0.3540456146001816,
"rewards/mrr_reward": 0.2099454402923584,
"rewards/rank_analyze_format_reward": 0.8338831663131714,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 631.015625,
"epoch": 2.888,
"grad_norm": 0.03494837507605553,
"kl": 0.0089569091796875,
"learning_rate": 1.996177642126615e-05,
"loss": -0.0372,
"reward": 6.048594832420349,
"reward_std": 0.5146159902215004,
"rewards/mrr_reward": 0.32186879962682724,
"rewards/rank_analyze_format_reward": 0.7611195892095566,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 666.5,
"epoch": 2.896,
"grad_norm": 0.03123210370540619,
"kl": 0.00858306884765625,
"learning_rate": 1.996155657096213e-05,
"loss": -0.0066,
"reward": 5.597678542137146,
"reward_std": 0.6386058628559113,
"rewards/mrr_reward": 0.20701265148818493,
"rewards/rank_analyze_format_reward": 0.8321279138326645,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 636.078125,
"epoch": 2.904,
"grad_norm": 0.03348281979560852,
"kl": 0.00963592529296875,
"learning_rate": 1.9961336091431728e-05,
"loss": -0.0201,
"reward": 5.798085331916809,
"reward_std": 0.7690745741128922,
"rewards/mrr_reward": 0.2617683596909046,
"rewards/rank_analyze_format_reward": 0.8075376749038696,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 656.109375,
"epoch": 2.912,
"grad_norm": 0.03250405564904213,
"kl": 0.0079803466796875,
"learning_rate": 1.9961114982688868e-05,
"loss": -0.0069,
"reward": 5.859158277511597,
"reward_std": 0.6455894485116005,
"rewards/mrr_reward": 0.2464347779750824,
"rewards/rank_analyze_format_reward": 0.8734191805124283,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 645.96875,
"epoch": 2.92,
"grad_norm": 0.03663971275091171,
"kl": 0.010545730590820312,
"learning_rate": 1.9960893244747525e-05,
"loss": -0.0396,
"reward": 5.503406643867493,
"reward_std": 0.8078130483627319,
"rewards/mrr_reward": 0.20906499400734901,
"rewards/rank_analyze_format_reward": 0.7881255447864532,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 655.03125,
"epoch": 2.928,
"grad_norm": 0.035040080547332764,
"kl": 0.009366989135742188,
"learning_rate": 1.9960670877621697e-05,
"loss": 0.0398,
"reward": 6.2447816133499146,
"reward_std": 0.8318488001823425,
"rewards/mrr_reward": 0.36858259700238705,
"rewards/rank_analyze_format_reward": 0.8759200721979141,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 1.0,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 654.484375,
"epoch": 2.936,
"grad_norm": 0.0322866216301918,
"kl": 0.009279251098632812,
"learning_rate": 1.9960447881325433e-05,
"loss": 0.0056,
"reward": 5.498760938644409,
"reward_std": 0.4926854334771633,
"rewards/mrr_reward": 0.18302952125668526,
"rewards/rank_analyze_format_reward": 0.8291427195072174,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.96875,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 693.359375,
"epoch": 2.944,
"grad_norm": 0.031908247619867325,
"kl": 0.007828712463378906,
"learning_rate": 1.996022425587282e-05,
"loss": 0.0028,
"reward": 5.857112407684326,
"reward_std": 0.5642570108175278,
"rewards/mrr_reward": 0.26006944477558136,
"rewards/rank_analyze_format_reward": 0.8460166752338409,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 653.9375,
"epoch": 2.952,
"grad_norm": 0.033450644463300705,
"kl": 0.008747100830078125,
"learning_rate": 1.9960000001277985e-05,
"loss": 0.0023,
"reward": 6.380629658699036,
"reward_std": 0.8351171687245369,
"rewards/mrr_reward": 0.4134734645485878,
"rewards/rank_analyze_format_reward": 0.7850210219621658,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 661.171875,
"epoch": 2.96,
"grad_norm": 0.03648662939667702,
"kl": 0.009329795837402344,
"learning_rate": 1.9959775117555085e-05,
"loss": 0.0345,
"reward": 6.414909482002258,
"reward_std": 0.45721762999892235,
"rewards/mrr_reward": 0.41873140074312687,
"rewards/rank_analyze_format_reward": 0.7966244220733643,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 617.609375,
"epoch": 2.968,
"grad_norm": 0.037536896765232086,
"kl": 0.010850906372070312,
"learning_rate": 1.995954960471833e-05,
"loss": -0.0382,
"reward": 5.33310854434967,
"reward_std": 0.5616030171513557,
"rewards/mrr_reward": 0.16109870746731758,
"rewards/rank_analyze_format_reward": 0.8039480000734329,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 642.15625,
"epoch": 2.976,
"grad_norm": 0.03529645875096321,
"kl": 0.009998321533203125,
"learning_rate": 1.995932346278197e-05,
"loss": -0.037,
"reward": 5.771807551383972,
"reward_std": 0.5844480693340302,
"rewards/mrr_reward": 0.2539186589419842,
"rewards/rank_analyze_format_reward": 0.8147266507148743,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 587.5625,
"epoch": 2.984,
"grad_norm": 0.034666452556848526,
"kl": 0.01047515869140625,
"learning_rate": 1.9959096691760284e-05,
"loss": -0.0132,
"reward": 6.080157160758972,
"reward_std": 0.6419508755207062,
"rewards/mrr_reward": 0.34099702909588814,
"rewards/rank_analyze_format_reward": 0.8141245543956757,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9978972524404526,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9978972524404526,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 654.375,
"epoch": 2.992,
"grad_norm": 0.03392947465181351,
"kl": 0.008699417114257812,
"learning_rate": 1.995886929166759e-05,
"loss": -0.0404,
"reward": 5.7819143533706665,
"reward_std": 0.47098034992814064,
"rewards/mrr_reward": 0.260416679084301,
"rewards/rank_analyze_format_reward": 0.8203257471323013,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 656.75,
"epoch": 3.0,
"grad_norm": 0.03569958359003067,
"kl": 0.009565353393554688,
"learning_rate": 1.9958641262518263e-05,
"loss": -0.0069,
"reward": 5.463603854179382,
"reward_std": 0.41404130309820175,
"rewards/mrr_reward": 0.17387152649462223,
"rewards/rank_analyze_format_reward": 0.832147404551506,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 661.671875,
"epoch": 3.008,
"grad_norm": 0.03453488275408745,
"kl": 0.008286476135253906,
"learning_rate": 3.4816627469912147e-06,
"loss": -0.019,
"reward": 5.870022416114807,
"reward_std": 0.5319755226373672,
"rewards/mrr_reward": 0.2689298205077648,
"rewards/rank_analyze_format_reward": 0.8235997408628464,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 626.734375,
"epoch": 3.016,
"grad_norm": 0.03683853894472122,
"kl": 0.009878158569335938,
"learning_rate": 3.4341424424704373e-06,
"loss": -0.0352,
"reward": 6.375778317451477,
"reward_std": 0.8507000654935837,
"rewards/mrr_reward": 0.4045138992369175,
"rewards/rank_analyze_format_reward": 0.8163162767887115,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 674.953125,
"epoch": 3.024,
"grad_norm": 0.03564409166574478,
"kl": 0.009000778198242188,
"learning_rate": 3.3868813467634833e-06,
"loss": 0.0035,
"reward": 6.099611282348633,
"reward_std": 0.6811064556241035,
"rewards/mrr_reward": 0.326202891767025,
"rewards/rank_analyze_format_reward": 0.8689035177230835,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 646.40625,
"epoch": 3.032,
"grad_norm": 0.033847253769636154,
"kl": 0.009696006774902344,
"learning_rate": 3.3398813256574847e-06,
"loss": -0.0138,
"reward": 5.70564591884613,
"reward_std": 0.7488968372344971,
"rewards/mrr_reward": 0.2354228664189577,
"rewards/rank_analyze_format_reward": 0.814735621213913,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 662.53125,
"epoch": 3.04,
"grad_norm": 0.03472684323787689,
"kl": 0.009616851806640625,
"learning_rate": 3.2931442346328e-06,
"loss": 0.005,
"reward": 5.8208394050598145,
"reward_std": 0.8216791450977325,
"rewards/mrr_reward": 0.2550409249961376,
"rewards/rank_analyze_format_reward": 0.8592693954706192,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 670.015625,
"epoch": 3.048,
"grad_norm": 0.03275707736611366,
"kl": 0.009731292724609375,
"learning_rate": 3.2466719187897555e-06,
"loss": 0.009,
"reward": 5.976478338241577,
"reward_std": 0.9694596379995346,
"rewards/mrr_reward": 0.3182477727532387,
"rewards/rank_analyze_format_reward": 0.8373227268457413,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9985119104385376,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9985119104385376,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 662.234375,
"epoch": 3.056,
"grad_norm": 0.03374367952346802,
"kl": 0.009586334228515625,
"learning_rate": 3.200466212775808e-06,
"loss": -0.0045,
"reward": 5.7755879163742065,
"reward_std": 0.5715985968708992,
"rewards/mrr_reward": 0.2742931507527828,
"rewards/rank_analyze_format_reward": 0.8286910802125931,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 634.234375,
"epoch": 3.064,
"grad_norm": 0.03337114676833153,
"kl": 0.00981903076171875,
"learning_rate": 3.1545289407131128e-06,
"loss": 0.0193,
"reward": 6.042637348175049,
"reward_std": 0.8147040233016014,
"rewards/mrr_reward": 0.3159040194004774,
"rewards/rank_analyze_format_reward": 0.7925428599119186,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.013822115026414394,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 637.3125,
"epoch": 3.072,
"grad_norm": 0.03374440222978592,
"kl": 0.0095977783203125,
"learning_rate": 3.108861916126518e-06,
"loss": -0.0052,
"reward": 5.525590181350708,
"reward_std": 0.4414802975952625,
"rewards/mrr_reward": 0.1869729682803154,
"rewards/rank_analyze_format_reward": 0.843795970082283,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 647.640625,
"epoch": 3.08,
"grad_norm": 0.03527143970131874,
"kl": 0.0100250244140625,
"learning_rate": 3.063466941871952e-06,
"loss": -0.0349,
"reward": 5.4740070104599,
"reward_std": 0.39598348736763,
"rewards/mrr_reward": 0.17225322499871254,
"rewards/rank_analyze_format_reward": 0.8160143941640854,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 641.03125,
"epoch": 3.088,
"grad_norm": 0.03699163347482681,
"kl": 0.01068115234375,
"learning_rate": 3.0183458100652752e-06,
"loss": -0.0098,
"reward": 5.632686495780945,
"reward_std": 0.8547279201447964,
"rewards/mrr_reward": 0.21130332723259926,
"rewards/rank_analyze_format_reward": 0.836301326751709,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 615.25,
"epoch": 3.096,
"grad_norm": 0.035420242697000504,
"kl": 0.013090133666992188,
"learning_rate": 2.9735003020115095e-06,
"loss": 0.0072,
"reward": 5.7270954847335815,
"reward_std": 0.8438360095024109,
"rewards/mrr_reward": 0.26177455112338066,
"rewards/rank_analyze_format_reward": 0.7755855619907379,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 661.71875,
"epoch": 3.104,
"grad_norm": 0.033768534660339355,
"kl": 0.008324623107910156,
"learning_rate": 2.9289321881345257e-06,
"loss": -0.0074,
"reward": 5.456307530403137,
"reward_std": 0.2819017954170704,
"rewards/mrr_reward": 0.1582651287317276,
"rewards/rank_analyze_format_reward": 0.8524289280176163,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 623.921875,
"epoch": 3.112,
"grad_norm": 0.03795436769723892,
"kl": 0.011144638061523438,
"learning_rate": 2.884643227907147e-06,
"loss": -0.02,
"reward": 6.103384494781494,
"reward_std": 0.7188520580530167,
"rewards/mrr_reward": 0.3517671152949333,
"rewards/rank_analyze_format_reward": 0.7562461942434311,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 662.984375,
"epoch": 3.12,
"grad_norm": 0.03798002377152443,
"kl": 0.009586334228515625,
"learning_rate": 2.840635169781688e-06,
"loss": 0.0009,
"reward": 5.6863319873809814,
"reward_std": 0.8139217495918274,
"rewards/mrr_reward": 0.24700520560145378,
"rewards/rank_analyze_format_reward": 0.7954290956258774,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9973393976688385,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 640.625,
"epoch": 3.128,
"grad_norm": 0.03476894274353981,
"kl": 0.009393692016601562,
"learning_rate": 2.796909751120931e-06,
"loss": -0.0152,
"reward": 5.828433513641357,
"reward_std": 0.6863922253251076,
"rewards/mrr_reward": 0.2774987518787384,
"rewards/rank_analyze_format_reward": 0.8160946071147919,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 672.328125,
"epoch": 3.136,
"grad_norm": 0.03496420755982399,
"kl": 0.01062774658203125,
"learning_rate": 2.7534686981295335e-06,
"loss": 0.0069,
"reward": 5.767983317375183,
"reward_std": 0.6909240707755089,
"rewards/mrr_reward": 0.24921875447034836,
"rewards/rank_analyze_format_reward": 0.8529097139835358,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9981617629528046,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 639.734375,
"epoch": 3.144,
"grad_norm": 0.03465255722403526,
"kl": 0.009387969970703125,
"learning_rate": 2.7103137257858867e-06,
"loss": -0.0177,
"reward": 5.525047659873962,
"reward_std": 0.7425801493227482,
"rewards/mrr_reward": 0.21982267126441002,
"rewards/rank_analyze_format_reward": 0.7590381950139999,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 655.5,
"epoch": 3.152,
"grad_norm": 0.03880779445171356,
"kl": 0.012248992919921875,
"learning_rate": 2.667446537774402e-06,
"loss": -0.0006,
"reward": 5.65397036075592,
"reward_std": 0.5698041021823883,
"rewards/mrr_reward": 0.22565104067325592,
"rewards/rank_analyze_format_reward": 0.786522388458252,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 655.9375,
"epoch": 3.16,
"grad_norm": 0.03340061381459236,
"kl": 0.009691238403320312,
"learning_rate": 2.624868826418262e-06,
"loss": 0.0072,
"reward": 6.097415328025818,
"reward_std": 0.7755601853132248,
"rewards/mrr_reward": 0.330853171646595,
"rewards/rank_analyze_format_reward": 0.8189245611429214,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 618.46875,
"epoch": 3.168,
"grad_norm": 0.037465766072273254,
"kl": 0.009607315063476562,
"learning_rate": 2.5825822726126095e-06,
"loss": -0.0024,
"reward": 5.320816397666931,
"reward_std": 0.679179236292839,
"rewards/mrr_reward": 0.17338790372014046,
"rewards/rank_analyze_format_reward": 0.7400810569524765,
"rewards/rank_answer_foramt_reward": 0.912109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9992559552192688,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 640.21875,
"epoch": 3.176,
"grad_norm": 0.033823512494564056,
"kl": 0.008966445922851562,
"learning_rate": 2.5405885457581793e-06,
"loss": -0.0125,
"reward": 6.29050076007843,
"reward_std": 0.8139433264732361,
"rewards/mrr_reward": 0.3840401843190193,
"rewards/rank_analyze_format_reward": 0.806766077876091,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 631.34375,
"epoch": 3.184,
"grad_norm": 0.034099679440259933,
"kl": 0.009246826171875,
"learning_rate": 2.4988893036954045e-06,
"loss": -0.0128,
"reward": 5.263689160346985,
"reward_std": 0.5176863595843315,
"rewards/mrr_reward": 0.13877107948064804,
"rewards/rank_analyze_format_reward": 0.8295836299657822,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9678308814764023,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 622.40625,
"epoch": 3.192,
"grad_norm": 0.03965727239847183,
"kl": 0.00982666015625,
"learning_rate": 2.4574861926389615e-06,
"loss": 0.0202,
"reward": 5.415614366531372,
"reward_std": 0.5104451552033424,
"rewards/mrr_reward": 0.1703559048473835,
"rewards/rank_analyze_format_reward": 0.8391944617033005,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 675.515625,
"epoch": 3.2,
"grad_norm": 0.03522520139813423,
"kl": 0.009145736694335938,
"learning_rate": 2.4163808471127815e-06,
"loss": 0.0108,
"reward": 6.274830937385559,
"reward_std": 0.9628144055604935,
"rewards/mrr_reward": 0.3546936884522438,
"rewards/rank_analyze_format_reward": 0.8702747970819473,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.013124999590218067,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 671.734375,
"epoch": 3.208,
"grad_norm": 0.03297824412584305,
"kl": 0.010050773620605469,
"learning_rate": 2.37557488988552e-06,
"loss": -0.0181,
"reward": 5.695779204368591,
"reward_std": 0.6984521001577377,
"rewards/mrr_reward": 0.23389137163758278,
"rewards/rank_analyze_format_reward": 0.8285732418298721,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 611.90625,
"epoch": 3.216,
"grad_norm": 0.03468155115842819,
"kl": 0.011409759521484375,
"learning_rate": 2.335069931906503e-06,
"loss": 0.016,
"reward": 5.605130910873413,
"reward_std": 0.5643002241849899,
"rewards/mrr_reward": 0.23312251828610897,
"rewards/rank_analyze_format_reward": 0.7599145472049713,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 652.015625,
"epoch": 3.224,
"grad_norm": 0.03274522349238396,
"kl": 0.0077266693115234375,
"learning_rate": 2.2948675722421086e-06,
"loss": -0.0008,
"reward": 5.618125796318054,
"reward_std": 0.4926639534533024,
"rewards/mrr_reward": 0.2079303190112114,
"rewards/rank_analyze_format_reward": 0.8157014697790146,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 630.703125,
"epoch": 3.232,
"grad_norm": 0.032266825437545776,
"kl": 0.008646011352539062,
"learning_rate": 2.254969398012663e-06,
"loss": 0.0062,
"reward": 5.917828798294067,
"reward_std": 1.038107082247734,
"rewards/mrr_reward": 0.31374628096818924,
"rewards/rank_analyze_format_reward": 0.723275676369667,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 685.0,
"epoch": 3.24,
"grad_norm": 0.033160846680402756,
"kl": 0.007889747619628906,
"learning_rate": 2.215376984329767e-06,
"loss": 0.0108,
"reward": 5.611984491348267,
"reward_std": 0.5307941734790802,
"rewards/mrr_reward": 0.20152530074119568,
"rewards/rank_analyze_format_reward": 0.8371334373950958,
"rewards/rank_answer_foramt_reward": 0.96875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 651.421875,
"epoch": 3.248,
"grad_norm": 0.032735127955675125,
"kl": 0.009138107299804688,
"learning_rate": 2.1760918942341193e-06,
"loss": -0.018,
"reward": 5.920067191123962,
"reward_std": 0.4110058397054672,
"rewards/mrr_reward": 0.28723959624767303,
"rewards/rank_analyze_format_reward": 0.8022439330816269,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 661.703125,
"epoch": 3.2560000000000002,
"grad_norm": 0.0345609113574028,
"kl": 0.008047103881835938,
"learning_rate": 2.1371156786338108e-06,
"loss": -0.0177,
"reward": 6.009241461753845,
"reward_std": 0.7121211290359497,
"rewards/mrr_reward": 0.3027033731341362,
"rewards/rank_analyze_format_reward": 0.8433498591184616,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 643.953125,
"epoch": 3.2640000000000002,
"grad_norm": 0.034407492727041245,
"kl": 0.009765625,
"learning_rate": 2.098449876243096e-06,
"loss": -0.0046,
"reward": 5.621447324752808,
"reward_std": 0.7794221378862858,
"rewards/mrr_reward": 0.22875124216079712,
"rewards/rank_analyze_format_reward": 0.8310191482305527,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 588.671875,
"epoch": 3.2720000000000002,
"grad_norm": 0.03622359409928322,
"kl": 0.008343696594238281,
"learning_rate": 2.0600960135216463e-06,
"loss": -0.0396,
"reward": 5.996233105659485,
"reward_std": 0.6990708820521832,
"rewards/mrr_reward": 0.3586743548512459,
"rewards/rank_analyze_format_reward": 0.678608126938343,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 642.015625,
"epoch": 3.2800000000000002,
"grad_norm": 0.03268953040242195,
"kl": 0.009521484375,
"learning_rate": 2.022055604614289e-06,
"loss": -0.0041,
"reward": 5.610453367233276,
"reward_std": 0.608606144785881,
"rewards/mrr_reward": 0.21897321939468384,
"rewards/rank_analyze_format_reward": 0.7794823199510574,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 1.0,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 663.40625,
"epoch": 3.288,
"grad_norm": 0.03196902945637703,
"kl": 0.009124755859375,
"learning_rate": 1.984330151291233e-06,
"loss": -0.0202,
"reward": 5.701419472694397,
"reward_std": 0.3155892379581928,
"rewards/mrr_reward": 0.2233320865780115,
"rewards/rank_analyze_format_reward": 0.8373879790306091,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 612.28125,
"epoch": 3.296,
"grad_norm": 0.03455911576747894,
"kl": 0.010396957397460938,
"learning_rate": 1.9469211428887813e-06,
"loss": -0.0327,
"reward": 5.262094497680664,
"reward_std": 0.5627113878726959,
"rewards/mrr_reward": 0.15016741678118706,
"rewards/rank_analyze_format_reward": 0.7412733286619186,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9981617629528046,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9825367629528046,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 668.09375,
"epoch": 3.304,
"grad_norm": 0.0337919220328331,
"kl": 0.00904083251953125,
"learning_rate": 1.9098300562505266e-06,
"loss": -0.0192,
"reward": 5.6808494329452515,
"reward_std": 0.5475155636668205,
"rewards/mrr_reward": 0.21426091715693474,
"rewards/rank_analyze_format_reward": 0.8667744994163513,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 603.0625,
"epoch": 3.312,
"grad_norm": 0.039892613887786865,
"kl": 0.009840965270996094,
"learning_rate": 1.8730583556690607e-06,
"loss": 0.002,
"reward": 5.8083416223526,
"reward_std": 1.1347919255495071,
"rewards/mrr_reward": 0.2994481772184372,
"rewards/rank_analyze_format_reward": 0.7936044484376907,
"rewards/rank_answer_foramt_reward": 0.861328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9973393976688385,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9817143976688385,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 637.953125,
"epoch": 3.32,
"grad_norm": 0.03493841364979744,
"kl": 0.010486602783203125,
"learning_rate": 1.8366074928281608e-06,
"loss": -0.0211,
"reward": 6.134096622467041,
"reward_std": 1.0277494341135025,
"rewards/mrr_reward": 0.3545076847076416,
"rewards/rank_analyze_format_reward": 0.8130272477865219,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9974177181720734,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9974177181720734,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 641.890625,
"epoch": 3.328,
"grad_norm": 0.034219007939100266,
"kl": 0.008632659912109375,
"learning_rate": 1.8004789067454763e-06,
"loss": -0.0222,
"reward": 6.18049168586731,
"reward_std": 0.5877049472182989,
"rewards/mrr_reward": 0.36963665671646595,
"rewards/rank_analyze_format_reward": 0.7174552381038666,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 637.578125,
"epoch": 3.336,
"grad_norm": 0.03582116961479187,
"kl": 0.010225296020507812,
"learning_rate": 1.7646740237157256e-06,
"loss": -0.017,
"reward": 5.773987531661987,
"reward_std": 0.6770635172724724,
"rewards/mrr_reward": 0.25188492238521576,
"rewards/rank_analyze_format_reward": 0.8191821128129959,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 617.609375,
"epoch": 3.344,
"grad_norm": 0.03502979502081871,
"kl": 0.010501861572265625,
"learning_rate": 1.7291942572543806e-06,
"loss": -0.0141,
"reward": 6.250616192817688,
"reward_std": 0.9646867886185646,
"rewards/mrr_reward": 0.3685888033360243,
"rewards/rank_analyze_format_reward": 0.8325932174921036,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 646.5,
"epoch": 3.352,
"grad_norm": 0.03461702913045883,
"kl": 0.009029388427734375,
"learning_rate": 1.6940410080418723e-06,
"loss": 0.002,
"reward": 5.604981422424316,
"reward_std": 0.6194628737866879,
"rewards/mrr_reward": 0.21886160969734192,
"rewards/rank_analyze_format_reward": 0.7708402574062347,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.013382176868617535,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 629.5625,
"epoch": 3.36,
"grad_norm": 0.14745499193668365,
"kl": 0.036579132080078125,
"learning_rate": 1.6592156638682887e-06,
"loss": -0.011,
"reward": 5.543843626976013,
"reward_std": 0.6722467541694641,
"rewards/mrr_reward": 0.22320189327001572,
"rewards/rank_analyze_format_reward": 0.7316861301660538,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.013099747709929943,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 1.0,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 638.125,
"epoch": 3.368,
"grad_norm": 0.03604007884860039,
"kl": 0.009619712829589844,
"learning_rate": 1.6247195995785836e-06,
"loss": -0.0095,
"reward": 5.799249887466431,
"reward_std": 0.9532105177640915,
"rewards/mrr_reward": 0.2738591395318508,
"rewards/rank_analyze_format_reward": 0.8365109711885452,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 646.09375,
"epoch": 3.376,
"grad_norm": 0.035499464720487595,
"kl": 0.012187957763671875,
"learning_rate": 1.5905541770183096e-06,
"loss": 0.007,
"reward": 5.608644723892212,
"reward_std": 0.8124164063483477,
"rewards/mrr_reward": 0.23054935038089752,
"rewards/rank_analyze_format_reward": 0.8543011993169785,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9453125,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 645.90625,
"epoch": 3.384,
"grad_norm": 0.031818851828575134,
"kl": 0.008767127990722656,
"learning_rate": 1.5567207449798517e-06,
"loss": -0.0073,
"reward": 5.9224079847335815,
"reward_std": 1.0137402415275574,
"rewards/mrr_reward": 0.3127914294600487,
"rewards/rank_analyze_format_reward": 0.8292285054922104,
"rewards/rank_answer_foramt_reward": 0.953125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9678819477558136,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9678819477558136,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 669.125,
"epoch": 3.392,
"grad_norm": 0.032386232167482376,
"kl": 0.009622573852539062,
"learning_rate": 1.52322063914917e-06,
"loss": -0.0083,
"reward": 5.772087574005127,
"reward_std": 1.0963159650564194,
"rewards/mrr_reward": 0.2973834425210953,
"rewards/rank_analyze_format_reward": 0.7504077702760696,
"rewards/rank_answer_foramt_reward": 0.873046875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 636.953125,
"epoch": 3.4,
"grad_norm": 0.03259604051709175,
"kl": 0.008920669555664062,
"learning_rate": 1.490055182053083e-06,
"loss": -0.0153,
"reward": 6.132978916168213,
"reward_std": 0.8273980095982552,
"rewards/mrr_reward": 0.349144347012043,
"rewards/rank_analyze_format_reward": 0.7989014983177185,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 647.078125,
"epoch": 3.408,
"grad_norm": 0.034432653337717056,
"kl": 0.008844375610351562,
"learning_rate": 1.4572256830070497e-06,
"loss": 0.0307,
"reward": 5.3963083028793335,
"reward_std": 0.7059964388608932,
"rewards/mrr_reward": 0.17215402238070965,
"rewards/rank_analyze_format_reward": 0.8245711624622345,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 645.421875,
"epoch": 3.416,
"grad_norm": 0.03640694543719292,
"kl": 0.009423255920410156,
"learning_rate": 1.4247334380634792e-06,
"loss": -0.0005,
"reward": 5.745667219161987,
"reward_std": 1.3311158269643784,
"rewards/mrr_reward": 0.26646826043725014,
"rewards/rank_analyze_format_reward": 0.875150740146637,
"rewards/rank_answer_foramt_reward": 0.794921875,
"rewards/rank_contrast_format_reward": 0.015236318111419678,
"rewards/rank_initial_format_reward": 0.9972426444292068,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9972426444292068,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 642.109375,
"epoch": 3.424,
"grad_norm": 0.034119006246328354,
"kl": 0.008829116821289062,
"learning_rate": 1.3925797299605649e-06,
"loss": -0.0087,
"reward": 5.416736364364624,
"reward_std": 0.5500404462218285,
"rewards/mrr_reward": 0.16524678096175194,
"rewards/rank_analyze_format_reward": 0.8143431395292282,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 682.46875,
"epoch": 3.432,
"grad_norm": 0.03242919594049454,
"kl": 0.008722305297851562,
"learning_rate": 1.3607658280716474e-06,
"loss": -0.0019,
"reward": 5.716560482978821,
"reward_std": 0.5728246569633484,
"rewards/mrr_reward": 0.23904389142990112,
"rewards/rank_analyze_format_reward": 0.8736661523580551,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 626.25,
"epoch": 3.44,
"grad_norm": 0.03218907490372658,
"kl": 0.009031295776367188,
"learning_rate": 1.3292929883550998e-06,
"loss": -0.006,
"reward": 6.318280220031738,
"reward_std": 0.6036234200000763,
"rewards/mrr_reward": 0.38396577537059784,
"rewards/rank_analyze_format_reward": 0.8253858089447021,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 670.171875,
"epoch": 3.448,
"grad_norm": 0.03488701581954956,
"kl": 0.009675979614257812,
"learning_rate": 1.2981624533047432e-06,
"loss": -0.0275,
"reward": 5.669755578041077,
"reward_std": 1.1037492379546165,
"rewards/mrr_reward": 0.25412946194410324,
"rewards/rank_analyze_format_reward": 0.7779058814048767,
"rewards/rank_answer_foramt_reward": 0.88671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982128292322159,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982128292322159,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 649.71875,
"epoch": 3.456,
"grad_norm": 0.03478289395570755,
"kl": 0.009863853454589844,
"learning_rate": 1.2673754519008008e-06,
"loss": -0.0365,
"reward": 5.450997948646545,
"reward_std": 0.6340261902660131,
"rewards/mrr_reward": 0.1804935522377491,
"rewards/rank_analyze_format_reward": 0.8247267752885818,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 632.875,
"epoch": 3.464,
"grad_norm": 0.03279885649681091,
"kl": 0.0091705322265625,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.0036,
"reward": 5.708705902099609,
"reward_std": 0.6798514872789383,
"rewards/mrr_reward": 0.24531250447034836,
"rewards/rank_analyze_format_reward": 0.7816782742738724,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9992559552192688,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9836309552192688,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 631.09375,
"epoch": 3.472,
"grad_norm": 0.03269350156188011,
"kl": 0.00931549072265625,
"learning_rate": 1.206836898094439e-06,
"loss": 0.0103,
"reward": 6.359462022781372,
"reward_std": 0.8851971626281738,
"rewards/mrr_reward": 0.3808903694152832,
"rewards/rank_analyze_format_reward": 0.8730098009109497,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 609.71875,
"epoch": 3.48,
"grad_norm": 0.03501614183187485,
"kl": 0.010263442993164062,
"learning_rate": 1.1770877356504684e-06,
"loss": 0.0062,
"reward": 6.128593564033508,
"reward_std": 0.9365501217544079,
"rewards/mrr_reward": 0.361879987642169,
"rewards/rank_analyze_format_reward": 0.7884955406188965,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 1.0,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 664.265625,
"epoch": 3.488,
"grad_norm": 0.033260658383369446,
"kl": 0.008501052856445312,
"learning_rate": 1.1476868866754488e-06,
"loss": -0.0187,
"reward": 5.864734411239624,
"reward_std": 0.4621109887957573,
"rewards/mrr_reward": 0.25895338132977486,
"rewards/rank_analyze_format_reward": 0.8562646806240082,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 639.625,
"epoch": 3.496,
"grad_norm": 0.03378913179039955,
"kl": 0.009352684020996094,
"learning_rate": 1.1186355118645552e-06,
"loss": -0.0349,
"reward": 5.749386191368103,
"reward_std": 0.3994421735405922,
"rewards/mrr_reward": 0.2401475664228201,
"rewards/rank_analyze_format_reward": 0.8180928528308868,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 682.265625,
"epoch": 3.504,
"grad_norm": 0.03421083465218544,
"kl": 0.009157180786132812,
"learning_rate": 1.0899347581163222e-06,
"loss": -0.005,
"reward": 5.491220116615295,
"reward_std": 0.40433138608932495,
"rewards/mrr_reward": 0.15951761417090893,
"rewards/rank_analyze_format_reward": 0.8843995481729507,
"rewards/rank_answer_foramt_reward": 0.984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 643.046875,
"epoch": 3.512,
"grad_norm": 0.03325556218624115,
"kl": 0.008181571960449219,
"learning_rate": 1.0615857584873624e-06,
"loss": 0.0038,
"reward": 5.652897953987122,
"reward_std": 0.42538975179195404,
"rewards/mrr_reward": 0.2011718824505806,
"rewards/rank_analyze_format_reward": 0.8716480582952499,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 664.34375,
"epoch": 3.52,
"grad_norm": 0.03412213921546936,
"kl": 0.009923934936523438,
"learning_rate": 1.0335896321476413e-06,
"loss": 0.0084,
"reward": 5.463203430175781,
"reward_std": 0.47837162390351295,
"rewards/mrr_reward": 0.17101315408945084,
"rewards/rank_analyze_format_reward": 0.8377447873353958,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 665.5625,
"epoch": 3.528,
"grad_norm": 0.033871665596961975,
"kl": 0.008985519409179688,
"learning_rate": 1.0059474843362893e-06,
"loss": -0.0253,
"reward": 5.863033652305603,
"reward_std": 0.856530025601387,
"rewards/mrr_reward": 0.2802455462515354,
"rewards/rank_analyze_format_reward": 0.8234658539295197,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9983552694320679,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 614.828125,
"epoch": 3.536,
"grad_norm": 0.03464524820446968,
"kl": 0.010242462158203125,
"learning_rate": 9.786604063179728e-07,
"loss": -0.0112,
"reward": 6.178846478462219,
"reward_std": 0.7333296239376068,
"rewards/mrr_reward": 0.35404886677861214,
"rewards/rank_analyze_format_reward": 0.8036665320396423,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 630.484375,
"epoch": 3.544,
"grad_norm": 0.039022162556648254,
"kl": 0.010075569152832031,
"learning_rate": 9.517294753398066e-07,
"loss": -0.0103,
"reward": 6.17569887638092,
"reward_std": 0.805585939437151,
"rewards/mrr_reward": 0.3522135466337204,
"rewards/rank_analyze_format_reward": 0.8039542138576508,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 655.3125,
"epoch": 3.552,
"grad_norm": 0.03630434721708298,
"kl": 0.011911392211914062,
"learning_rate": 9.251557545888312e-07,
"loss": 0.0073,
"reward": 5.2746394872665405,
"reward_std": 0.6974633485078812,
"rewards/mrr_reward": 0.142398314550519,
"rewards/rank_analyze_format_reward": 0.8372418582439423,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983552694320679,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9827302694320679,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 614.171875,
"epoch": 3.56,
"grad_norm": 0.03797624632716179,
"kl": 0.009185791015625,
"learning_rate": 8.989402931500434e-07,
"loss": -0.0257,
"reward": 5.71190345287323,
"reward_std": 0.7688554152846336,
"rewards/mrr_reward": 0.2587921619415283,
"rewards/rank_analyze_format_reward": 0.7587659955024719,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 661.84375,
"epoch": 3.568,
"grad_norm": 0.036750566214323044,
"kl": 0.009357452392578125,
"learning_rate": 8.730841259649725e-07,
"loss": 0.0165,
"reward": 6.172403573989868,
"reward_std": 0.5994044467806816,
"rewards/mrr_reward": 0.3739459365606308,
"rewards/rank_analyze_format_reward": 0.7410728335380554,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 661.359375,
"epoch": 3.576,
"grad_norm": 0.030956851318478584,
"kl": 0.009416580200195312,
"learning_rate": 8.475882737908248e-07,
"loss": -0.0069,
"reward": 5.37591028213501,
"reward_std": 0.33451657742261887,
"rewards/mrr_reward": 0.1444692499935627,
"rewards/rank_analyze_format_reward": 0.8814376294612885,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9827118366956711,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 631.265625,
"epoch": 3.584,
"grad_norm": 0.03611503168940544,
"kl": 0.010535240173339844,
"learning_rate": 8.224537431601886e-07,
"loss": 0.0162,
"reward": 5.798620223999023,
"reward_std": 0.4908381961286068,
"rewards/mrr_reward": 0.23276909813284874,
"rewards/rank_analyze_format_reward": 0.8812157958745956,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 632.671875,
"epoch": 3.592,
"grad_norm": 0.03399351239204407,
"kl": 0.009492874145507812,
"learning_rate": 7.976815263412963e-07,
"loss": -0.0118,
"reward": 6.341515421867371,
"reward_std": 0.6863338127732277,
"rewards/mrr_reward": 0.3844680190086365,
"rewards/rank_analyze_format_reward": 0.8559543788433075,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 618.03125,
"epoch": 3.6,
"grad_norm": 0.039749711751937866,
"kl": 0.009737014770507812,
"learning_rate": 7.732726012988512e-07,
"loss": -0.0146,
"reward": 5.313909411430359,
"reward_std": 0.4731576666235924,
"rewards/mrr_reward": 0.13981274887919426,
"rewards/rank_analyze_format_reward": 0.8499381393194199,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9982585161924362,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9982585161924362,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 629.46875,
"epoch": 3.608,
"grad_norm": 0.03814476728439331,
"kl": 0.009777069091796875,
"learning_rate": 7.492279316554207e-07,
"loss": 0.002,
"reward": 5.441387295722961,
"reward_std": 0.964412122964859,
"rewards/mrr_reward": 0.2072792761027813,
"rewards/rank_analyze_format_reward": 0.7820773273706436,
"rewards/rank_answer_foramt_reward": 0.90234375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 621.78125,
"epoch": 3.616,
"grad_norm": 0.03884231299161911,
"kl": 0.010179519653320312,
"learning_rate": 7.255484666533874e-07,
"loss": -0.0293,
"reward": 6.066041827201843,
"reward_std": 1.1558443158864975,
"rewards/mrr_reward": 0.35128968954086304,
"rewards/rank_analyze_format_reward": 0.7974868565797806,
"rewards/rank_answer_foramt_reward": 0.888671875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 642.5625,
"epoch": 3.624,
"grad_norm": 0.035573799163103104,
"kl": 0.009339332580566406,
"learning_rate": 7.022351411174866e-07,
"loss": 0.0195,
"reward": 5.562433242797852,
"reward_std": 0.8050966486334801,
"rewards/mrr_reward": 0.21009425073862076,
"rewards/rank_analyze_format_reward": 0.816357433795929,
"rewards/rank_answer_foramt_reward": 0.916015625,
"rewards/rank_contrast_format_reward": 0.010822821408510208,
"rewards/rank_initial_format_reward": 0.9972426444292068,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9972426444292068,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 606.40625,
"epoch": 3.632,
"grad_norm": 0.03481682762503624,
"kl": 0.008967399597167969,
"learning_rate": 6.792888754178906e-07,
"loss": 0.0103,
"reward": 5.574246048927307,
"reward_std": 0.6400253660976887,
"rewards/mrr_reward": 0.20182291604578495,
"rewards/rank_analyze_format_reward": 0.8294544816017151,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 669.28125,
"epoch": 3.64,
"grad_norm": 0.031867899000644684,
"kl": 0.009153366088867188,
"learning_rate": 6.567105754338798e-07,
"loss": -0.0139,
"reward": 5.777210593223572,
"reward_std": 0.6435952112078667,
"rewards/mrr_reward": 0.2407862152904272,
"rewards/rank_analyze_format_reward": 0.8998882919549942,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 647.46875,
"epoch": 3.648,
"grad_norm": 0.03578875586390495,
"kl": 0.009320259094238281,
"learning_rate": 6.345011325180772e-07,
"loss": 0.0063,
"reward": 5.383034586906433,
"reward_std": 0.5498589277267456,
"rewards/mrr_reward": 0.15843254141509533,
"rewards/rank_analyze_format_reward": 0.882116824388504,
"rewards/rank_answer_foramt_reward": 0.890625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 663.125,
"epoch": 3.656,
"grad_norm": 0.03667246922850609,
"kl": 0.009763717651367188,
"learning_rate": 6.126614234612593e-07,
"loss": -0.0018,
"reward": 5.633982062339783,
"reward_std": 0.5885076597332954,
"rewards/mrr_reward": 0.20900297909975052,
"rewards/rank_analyze_format_reward": 0.8507044613361359,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 660.265625,
"epoch": 3.664,
"grad_norm": 0.03183294087648392,
"kl": 0.009653091430664062,
"learning_rate": 5.911923104577455e-07,
"loss": -0.03,
"reward": 6.144891262054443,
"reward_std": 0.7887123003602028,
"rewards/mrr_reward": 0.35075025632977486,
"rewards/rank_analyze_format_reward": 0.784550666809082,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 653.640625,
"epoch": 3.672,
"grad_norm": 0.03396729752421379,
"kl": 0.009286880493164062,
"learning_rate": 5.700946410713548e-07,
"loss": -0.0222,
"reward": 5.685562252998352,
"reward_std": 0.6213907264173031,
"rewards/mrr_reward": 0.24081102386116982,
"rewards/rank_analyze_format_reward": 0.7749374657869339,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 633.984375,
"epoch": 3.68,
"grad_norm": 0.03488519787788391,
"kl": 0.008077621459960938,
"learning_rate": 5.49369248201953e-07,
"loss": -0.0312,
"reward": 5.230130910873413,
"reward_std": 0.3854878172278404,
"rewards/mrr_reward": 0.1375558041036129,
"rewards/rank_analyze_format_reward": 0.7379215955734253,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9983368366956711,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9983368366956711,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 638.0625,
"epoch": 3.6879999999999997,
"grad_norm": 0.035259000957012177,
"kl": 0.009763717651367188,
"learning_rate": 5.290169500525577e-07,
"loss": -0.0113,
"reward": 5.952216863632202,
"reward_std": 0.7991086803376675,
"rewards/mrr_reward": 0.3176587447524071,
"rewards/rank_analyze_format_reward": 0.7538475692272186,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 631.921875,
"epoch": 3.6959999999999997,
"grad_norm": 0.03507812321186066,
"kl": 0.008602142333984375,
"learning_rate": 5.090385500970551e-07,
"loss": -0.0063,
"reward": 5.87649405002594,
"reward_std": 0.4639568105340004,
"rewards/mrr_reward": 0.2718812022358179,
"rewards/rank_analyze_format_reward": 0.8397503942251205,
"rewards/rank_answer_foramt_reward": 0.95703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 643.6875,
"epoch": 3.7039999999999997,
"grad_norm": 0.033048536628484726,
"kl": 0.008604049682617188,
"learning_rate": 4.894348370484648e-07,
"loss": 0.0067,
"reward": 5.392473220825195,
"reward_std": 0.3769769836217165,
"rewards/mrr_reward": 0.15872395411133766,
"rewards/rank_analyze_format_reward": 0.8239836394786835,
"rewards/rank_answer_foramt_reward": 0.94140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 644.796875,
"epoch": 3.7119999999999997,
"grad_norm": 0.0345633290708065,
"kl": 0.008055686950683594,
"learning_rate": 4.702065848278126e-07,
"loss": -0.0091,
"reward": 5.816651225090027,
"reward_std": 0.8016383498907089,
"rewards/mrr_reward": 0.27346230298280716,
"rewards/rank_analyze_format_reward": 0.822411373257637,
"rewards/rank_answer_foramt_reward": 0.900390625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 645.140625,
"epoch": 3.7199999999999998,
"grad_norm": 0.03394823893904686,
"kl": 0.0093536376953125,
"learning_rate": 4.5135455253357053e-07,
"loss": 0.0025,
"reward": 5.927626967430115,
"reward_std": 0.6637560278177261,
"rewards/mrr_reward": 0.29126983508467674,
"rewards/rank_analyze_format_reward": 0.7999017089605331,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.009406094439327717,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 631.921875,
"epoch": 3.7279999999999998,
"grad_norm": 0.03364517539739609,
"kl": 0.009876251220703125,
"learning_rate": 4.3287948441169457e-07,
"loss": -0.0222,
"reward": 5.698633670806885,
"reward_std": 0.6399536728858948,
"rewards/mrr_reward": 0.2584015466272831,
"rewards/rank_analyze_format_reward": 0.760730504989624,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 677.609375,
"epoch": 3.7359999999999998,
"grad_norm": 0.033980004489421844,
"kl": 0.009052276611328125,
"learning_rate": 4.1478210982624055e-07,
"loss": -0.0266,
"reward": 5.83451247215271,
"reward_std": 0.7994016855955124,
"rewards/mrr_reward": 0.2674727253615856,
"rewards/rank_analyze_format_reward": 0.8564186096191406,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 632.203125,
"epoch": 3.7439999999999998,
"grad_norm": 0.03298342972993851,
"kl": 0.010210037231445312,
"learning_rate": 3.9706314323056936e-07,
"loss": -0.0358,
"reward": 5.20824921131134,
"reward_std": 0.5714588239789009,
"rewards/mrr_reward": 0.13792782835662365,
"rewards/rank_analyze_format_reward": 0.7483347654342651,
"rewards/rank_answer_foramt_reward": 0.955078125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.984375,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 639.03125,
"epoch": 3.752,
"grad_norm": 0.03382823243737221,
"kl": 0.008636474609375,
"learning_rate": 3.7972328413914074e-07,
"loss": 0.0041,
"reward": 5.7235270738601685,
"reward_std": 0.36125198751688004,
"rewards/mrr_reward": 0.220572916790843,
"rewards/rank_analyze_format_reward": 0.8685792237520218,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 642.21875,
"epoch": 3.76,
"grad_norm": 0.03393082320690155,
"kl": 0.0091400146484375,
"learning_rate": 3.627632170999029e-07,
"loss": -0.0087,
"reward": 5.6611692905426025,
"reward_std": 0.46365745551884174,
"rewards/mrr_reward": 0.20757068321108818,
"rewards/rank_analyze_format_reward": 0.8445583134889603,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 644.671875,
"epoch": 3.768,
"grad_norm": 0.03466450795531273,
"kl": 0.011438369750976562,
"learning_rate": 3.4618361166726123e-07,
"loss": -0.0145,
"reward": 5.37939190864563,
"reward_std": 0.3622877076268196,
"rewards/mrr_reward": 0.15358383394777775,
"rewards/rank_analyze_format_reward": 0.8029301166534424,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.998641312122345,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.998641312122345,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 645.171875,
"epoch": 3.776,
"grad_norm": 0.03238410875201225,
"kl": 0.009354591369628906,
"learning_rate": 3.2998512237565005e-07,
"loss": -0.0078,
"reward": 5.8616310358047485,
"reward_std": 0.5439105778932571,
"rewards/mrr_reward": 0.2814174108207226,
"rewards/rank_analyze_format_reward": 0.810180202126503,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.984375,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 636.328125,
"epoch": 3.784,
"grad_norm": 0.033094972372055054,
"kl": 0.008489608764648438,
"learning_rate": 3.1416838871368925e-07,
"loss": -0.0358,
"reward": 5.760077238082886,
"reward_std": 0.7393556013703346,
"rewards/mrr_reward": 0.26919643953442574,
"rewards/rank_analyze_format_reward": 0.7711820602416992,
"rewards/rank_answer_foramt_reward": 0.912109375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 654.078125,
"epoch": 3.792,
"grad_norm": 0.07576505839824677,
"kl": 0.020990371704101562,
"learning_rate": 2.987340350989421e-07,
"loss": -0.0251,
"reward": 5.671926021575928,
"reward_std": 0.7974754720926285,
"rewards/mrr_reward": 0.23645834252238274,
"rewards/rank_analyze_format_reward": 0.8246570378541946,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975927919149399,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9975927919149399,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 647.640625,
"epoch": 3.8,
"grad_norm": 0.03585861995816231,
"kl": 0.009164810180664062,
"learning_rate": 2.836826708532603e-07,
"loss": 0.0167,
"reward": 5.570275187492371,
"reward_std": 0.47667882964015007,
"rewards/mrr_reward": 0.18828125298023224,
"rewards/rank_analyze_format_reward": 0.8715294301509857,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 650.09375,
"epoch": 3.808,
"grad_norm": 0.032184626907110214,
"kl": 0.008043289184570312,
"learning_rate": 2.6901489017873375e-07,
"loss": -0.015,
"reward": 5.561799049377441,
"reward_std": 0.6698030084371567,
"rewards/mrr_reward": 0.21584821678698063,
"rewards/rank_analyze_format_reward": 0.7451662421226501,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 639.921875,
"epoch": 3.816,
"grad_norm": 0.0390472486615181,
"kl": 0.009185791015625,
"learning_rate": 2.547312721342277e-07,
"loss": 0.0123,
"reward": 5.636685132980347,
"reward_std": 0.7052134126424789,
"rewards/mrr_reward": 0.2304439563304186,
"rewards/rank_analyze_format_reward": 0.8242843002080917,
"rewards/rank_answer_foramt_reward": 0.9140625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 1.0,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 671.875,
"epoch": 3.824,
"grad_norm": 0.03435278683900833,
"kl": 0.008884429931640625,
"learning_rate": 2.4083238061252565e-07,
"loss": 0.0292,
"reward": 5.697912573814392,
"reward_std": 0.8254741281270981,
"rewards/mrr_reward": 0.25242435559630394,
"rewards/rank_analyze_format_reward": 0.7663401514291763,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 609.203125,
"epoch": 3.832,
"grad_norm": 0.03665322810411453,
"kl": 0.010316848754882812,
"learning_rate": 2.273187643180652e-07,
"loss": -0.0089,
"reward": 5.517371296882629,
"reward_std": 0.5238508731126785,
"rewards/mrr_reward": 0.2027529776096344,
"rewards/rank_analyze_format_reward": 0.747374877333641,
"rewards/rank_answer_foramt_reward": 0.958984375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 671.40625,
"epoch": 3.84,
"grad_norm": 0.033383507281541824,
"kl": 0.007233619689941406,
"learning_rate": 2.1419095674527934e-07,
"loss": 0.0034,
"reward": 5.486920118331909,
"reward_std": 0.46073780953884125,
"rewards/mrr_reward": 0.17415675148367882,
"rewards/rank_analyze_format_reward": 0.8176367580890656,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 642.09375,
"epoch": 3.848,
"grad_norm": 0.03508320823311806,
"kl": 0.00882720947265625,
"learning_rate": 2.014494761575314e-07,
"loss": -0.0212,
"reward": 6.144862055778503,
"reward_std": 0.6686284840106964,
"rewards/mrr_reward": 0.3460751511156559,
"rewards/rank_analyze_format_reward": 0.846498966217041,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 650.984375,
"epoch": 3.856,
"grad_norm": 0.0337209552526474,
"kl": 0.009634017944335938,
"learning_rate": 1.8909482556666026e-07,
"loss": 0.0118,
"reward": 5.237658619880676,
"reward_std": 0.35679256170988083,
"rewards/mrr_reward": 0.12162698619067669,
"rewards/rank_analyze_format_reward": 0.8467388600111008,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 645.1875,
"epoch": 3.864,
"grad_norm": 0.034814029932022095,
"kl": 0.0094757080078125,
"learning_rate": 1.7712749271311392e-07,
"loss": -0.0091,
"reward": 5.641056180000305,
"reward_std": 0.48677169997245073,
"rewards/mrr_reward": 0.19029638543725014,
"rewards/rank_analyze_format_reward": 0.8798703849315643,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 1.0,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 619.53125,
"epoch": 3.872,
"grad_norm": 0.040520522743463516,
"kl": 0.0150299072265625,
"learning_rate": 1.6554795004670389e-07,
"loss": -0.0202,
"reward": 6.0107786655426025,
"reward_std": 0.7183677442371845,
"rewards/mrr_reward": 0.3212549705058336,
"rewards/rank_analyze_format_reward": 0.81850266456604,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9995265156030655,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9995265156030655,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 654.921875,
"epoch": 3.88,
"grad_norm": 0.03340727090835571,
"kl": 0.008794784545898438,
"learning_rate": 1.543566547079467e-07,
"loss": 0.0042,
"reward": 6.221985816955566,
"reward_std": 0.7052921280264854,
"rewards/mrr_reward": 0.34005457162857056,
"rewards/rank_analyze_format_reward": 0.871224895119667,
"rewards/rank_answer_foramt_reward": 1.0,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9991776347160339,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 659.453125,
"epoch": 3.888,
"grad_norm": 0.033866625279188156,
"kl": 0.008701324462890625,
"learning_rate": 1.4355404851001953e-07,
"loss": -0.0056,
"reward": 5.805374503135681,
"reward_std": 0.8619978576898575,
"rewards/mrr_reward": 0.2608507052063942,
"rewards/rank_analyze_format_reward": 0.832169234752655,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 634.03125,
"epoch": 3.896,
"grad_norm": 0.03655744716525078,
"kl": 0.009066581726074219,
"learning_rate": 1.3314055792131964e-07,
"loss": -0.0147,
"reward": 5.252833724021912,
"reward_std": 0.5554525479674339,
"rewards/mrr_reward": 0.15148189663887024,
"rewards/rank_analyze_format_reward": 0.7708722352981567,
"rewards/rank_answer_foramt_reward": 0.931640625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.984375,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 641.859375,
"epoch": 3.904,
"grad_norm": 0.035562798380851746,
"kl": 0.008677482604980469,
"learning_rate": 1.231165940486234e-07,
"loss": -0.01,
"reward": 5.259171485900879,
"reward_std": 0.41013093292713165,
"rewards/mrr_reward": 0.14392981678247452,
"rewards/rank_analyze_format_reward": 0.7830992192029953,
"rewards/rank_answer_foramt_reward": 0.91796875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9911921620368958,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9911921620368958,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 656.796875,
"epoch": 3.912,
"grad_norm": 0.049858458340168,
"kl": 0.011943817138671875,
"learning_rate": 1.134825526208605e-07,
"loss": 0.0081,
"reward": 5.8922260999679565,
"reward_std": 0.6895529553294182,
"rewards/mrr_reward": 0.2795138917863369,
"rewards/rank_analyze_format_reward": 0.8414849489927292,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9975927919149399,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9975927919149399,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 630.03125,
"epoch": 3.92,
"grad_norm": 0.03522248566150665,
"kl": 0.009510040283203125,
"learning_rate": 1.0423881397349067e-07,
"loss": -0.0013,
"reward": 6.210868835449219,
"reward_std": 0.7074924185872078,
"rewards/mrr_reward": 0.3486483208835125,
"rewards/rank_analyze_format_reward": 0.8533848524093628,
"rewards/rank_answer_foramt_reward": 0.970703125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 620.046875,
"epoch": 3.928,
"grad_norm": 0.033086903393268585,
"kl": 0.009950637817382812,
"learning_rate": 9.538574303348813e-08,
"loss": -0.0079,
"reward": 6.068167686462402,
"reward_std": 1.18388731777668,
"rewards/mrr_reward": 0.35264757089316845,
"rewards/rank_analyze_format_reward": 0.7939876317977905,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9835526347160339,
"rewards/rank_overall_format_reward_more": 0.953125,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 620.5625,
"epoch": 3.936,
"grad_norm": 0.03736709803342819,
"kl": 0.009596824645996094,
"learning_rate": 8.692368930493522e-08,
"loss": -0.0075,
"reward": 6.095898747444153,
"reward_std": 0.8004505969583988,
"rewards/mrr_reward": 0.3334883488714695,
"rewards/rank_analyze_format_reward": 0.8049141466617584,
"rewards/rank_answer_foramt_reward": 0.97265625,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.984375,
"rewards/rank_verify_format_reward": 1.0,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 634.53125,
"epoch": 3.944,
"grad_norm": 0.036802154034376144,
"kl": 0.010059356689453125,
"learning_rate": 7.885298685522235e-08,
"loss": -0.0271,
"reward": 5.225739121437073,
"reward_std": 0.7527187168598175,
"rewards/mrr_reward": 0.1519965250045061,
"rewards/rank_analyze_format_reward": 0.7541633769869804,
"rewards/rank_answer_foramt_reward": 0.904296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9991776347160339,
"rewards/rank_overall_format_reward_more": 0.9765625,
"rewards/rank_verify_format_reward": 0.9835526347160339,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 651.234375,
"epoch": 3.952,
"grad_norm": 0.03432526811957359,
"kl": 0.009029388427734375,
"learning_rate": 7.117395430186414e-08,
"loss": 0.0325,
"reward": 5.624658584594727,
"reward_std": 0.5472202897071838,
"rewards/mrr_reward": 0.24223089963197708,
"rewards/rank_analyze_format_reward": 0.7981982976198196,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9834558814764023,
"rewards/rank_overall_format_reward_more": 0.9609375,
"rewards/rank_verify_format_reward": 0.9834558814764023,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 680.015625,
"epoch": 3.96,
"grad_norm": 0.0339614674448967,
"kl": 0.009700775146484375,
"learning_rate": 6.388689479991606e-08,
"loss": -0.0091,
"reward": 6.06891131401062,
"reward_std": 0.9122689664363861,
"rewards/mrr_reward": 0.3243551626801491,
"rewards/rank_analyze_format_reward": 0.8358288407325745,
"rewards/rank_answer_foramt_reward": 0.9453125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 666.359375,
"epoch": 3.968,
"grad_norm": 0.03345409035682678,
"kl": 0.00970458984375,
"learning_rate": 5.699209603001077e-08,
"loss": 0.0057,
"reward": 5.823601126670837,
"reward_std": 0.8781716674566269,
"rewards/mrr_reward": 0.2674107179045677,
"rewards/rank_analyze_format_reward": 0.8760750144720078,
"rewards/rank_answer_foramt_reward": 0.943359375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9828869104385376,
"rewards/rank_overall_format_reward_more": 0.96875,
"rewards/rank_verify_format_reward": 0.9828869104385376,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 640.28125,
"epoch": 3.976,
"grad_norm": 0.0337567999958992,
"kl": 0.008623123168945312,
"learning_rate": 5.048983018699827e-08,
"loss": -0.0062,
"reward": 6.0533905029296875,
"reward_std": 0.9021812565624714,
"rewards/mrr_reward": 0.329222459346056,
"rewards/rank_analyze_format_reward": 0.8165788054466248,
"rewards/rank_answer_foramt_reward": 0.927734375,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 644.0,
"epoch": 3.984,
"grad_norm": 0.03243754804134369,
"kl": 0.01001739501953125,
"learning_rate": 4.438035396920004e-08,
"loss": -0.0205,
"reward": 6.078030347824097,
"reward_std": 0.4891853742301464,
"rewards/mrr_reward": 0.31452134251594543,
"rewards/rank_analyze_format_reward": 0.8414293229579926,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 653.953125,
"epoch": 3.992,
"grad_norm": 0.032491762191057205,
"kl": 0.009099960327148438,
"learning_rate": 3.866390856827495e-08,
"loss": -0.0154,
"reward": 5.358732223510742,
"reward_std": 0.5416415482759476,
"rewards/mrr_reward": 0.15270957723259926,
"rewards/rank_analyze_format_reward": 0.8260188400745392,
"rewards/rank_answer_foramt_reward": 0.9296875,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 1.0,
"rewards/rank_overall_format_reward_more": 0.9921875,
"rewards/rank_verify_format_reward": 1.0,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 644.09375,
"epoch": 4.0,
"grad_norm": 0.03646353259682655,
"kl": 0.009246826171875,
"learning_rate": 3.3340719659701315e-08,
"loss": -0.0217,
"reward": 5.45065975189209,
"reward_std": 0.54698271676898,
"rewards/mrr_reward": 0.19483507424592972,
"rewards/rank_analyze_format_reward": 0.6868295818567276,
"rewards/rank_answer_foramt_reward": 0.986328125,
"rewards/rank_contrast_format_reward": 0.0,
"rewards/rank_initial_format_reward": 0.9990808814764023,
"rewards/rank_overall_format_reward_more": 1.0,
"rewards/rank_verify_format_reward": 0.9990808814764023,
"step": 500
},
{
"epoch": 4.0,
"step": 500,
"total_flos": 0.0,
"train_loss": -0.0017955374517478048,
"train_runtime": 38748.2534,
"train_samples_per_second": 0.826,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}