| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 528.203125, |
| "epoch": 0.008, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0249, |
| "reward": 4.16574627161026, |
| "reward_std": 1.0997275561094284, |
| "rewards/mrr_reward": 0.13278149627149105, |
| "rewards/rank_analyze_format_reward": 0.20883905701339245, |
| "rewards/rank_answer_foramt_reward": 0.59765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 482.859375, |
| "epoch": 0.016, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0459, |
| "reward": 4.315212845802307, |
| "reward_std": 1.325284257531166, |
| "rewards/mrr_reward": 0.18193825148046017, |
| "rewards/rank_analyze_format_reward": 0.16742298379540443, |
| "rewards/rank_answer_foramt_reward": 0.59375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 504.859375, |
| "epoch": 0.024, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0326, |
| "reward": 4.513009071350098, |
| "reward_std": 1.3176036477088928, |
| "rewards/mrr_reward": 0.23714657500386238, |
| "rewards/rank_analyze_format_reward": 0.1366883972659707, |
| "rewards/rank_answer_foramt_reward": 0.662109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.828125, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.5625, |
| "epoch": 0.032, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0358, |
| "reward": 4.18767774105072, |
| "reward_std": 0.9816120713949203, |
| "rewards/mrr_reward": 0.1309461873024702, |
| "rewards/rank_analyze_format_reward": 0.190928403288126, |
| "rewards/rank_answer_foramt_reward": 0.646484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.828125, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.453125, |
| "epoch": 0.04, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0459, |
| "reward": 4.4382476806640625, |
| "reward_std": 1.167427971959114, |
| "rewards/mrr_reward": 0.18655134364962578, |
| "rewards/rank_analyze_format_reward": 0.23099003173410892, |
| "rewards/rank_answer_foramt_reward": 0.634765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.921875, |
| "epoch": 0.048, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0513, |
| "reward": 4.052561104297638, |
| "reward_std": 1.0998588353395462, |
| "rewards/mrr_reward": 0.09990699402987957, |
| "rewards/rank_analyze_format_reward": 0.3598833493888378, |
| "rewards/rank_answer_foramt_reward": 0.490234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9912513643503189, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.9756263643503189, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.203125, |
| "epoch": 0.056, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0118, |
| "reward": 3.817609965801239, |
| "reward_std": 1.3295144587755203, |
| "rewards/mrr_reward": 0.10291418805718422, |
| "rewards/rank_analyze_format_reward": 0.22595737129449844, |
| "rewards/rank_answer_foramt_reward": 0.509765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9523026347160339, |
| "rewards/rank_overall_format_reward_more": 0.78125, |
| "rewards/rank_verify_format_reward": 0.9366776347160339, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.234375, |
| "epoch": 0.064, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0317, |
| "reward": 3.9616005420684814, |
| "reward_std": 1.4605186134576797, |
| "rewards/mrr_reward": 0.12518600933253765, |
| "rewards/rank_analyze_format_reward": 0.24394467286765575, |
| "rewards/rank_answer_foramt_reward": 0.4765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.7890625, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.953125, |
| "epoch": 0.072, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 2e-05, |
| "loss": -0.0058, |
| "reward": 3.873459815979004, |
| "reward_std": 1.0441433489322662, |
| "rewards/mrr_reward": 0.10626240447163582, |
| "rewards/rank_analyze_format_reward": 0.14176952932029963, |
| "rewards/rank_answer_foramt_reward": 0.525390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.828125, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 515.296875, |
| "epoch": 0.08, |
| "grad_norm": 0.01915793865919113, |
| "kl": 0.0, |
| "learning_rate": 1.9999999684172664e-05, |
| "loss": -0.0341, |
| "reward": 4.031236290931702, |
| "reward_std": 1.048377439379692, |
| "rewards/mrr_reward": 0.09990079700946808, |
| "rewards/rank_analyze_format_reward": 0.19022684637457132, |
| "rewards/rank_answer_foramt_reward": 0.55078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.359375, |
| "epoch": 0.088, |
| "grad_norm": 0.02235390432178974, |
| "kl": -6.154179573059082e-06, |
| "learning_rate": 1.9999998736690666e-05, |
| "loss": -0.0483, |
| "reward": 4.058919072151184, |
| "reward_std": 1.0137622952461243, |
| "rewards/mrr_reward": 0.12898686341941357, |
| "rewards/rank_analyze_format_reward": 0.1387843620032072, |
| "rewards/rank_answer_foramt_reward": 0.603515625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9940857887268066, |
| "rewards/rank_overall_format_reward_more": 0.8125, |
| "rewards/rank_verify_format_reward": 0.9940857887268066, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.703125, |
| "epoch": 0.096, |
| "grad_norm": 0.019736869260668755, |
| "kl": -5.140900611877441e-06, |
| "learning_rate": 1.999999715755407e-05, |
| "loss": -0.0413, |
| "reward": 4.11133998632431, |
| "reward_std": 1.2341832220554352, |
| "rewards/mrr_reward": 0.12740575149655342, |
| "rewards/rank_analyze_format_reward": 0.3057297058403492, |
| "rewards/rank_answer_foramt_reward": 0.556640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9829545468091965, |
| "rewards/rank_overall_format_reward_more": 0.7890625, |
| "rewards/rank_verify_format_reward": 0.9673295468091965, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 499.734375, |
| "epoch": 0.104, |
| "grad_norm": 0.019736869260668755, |
| "kl": -3.725290298461914e-06, |
| "learning_rate": 1.999999715755407e-05, |
| "loss": -0.0211, |
| "reward": 4.32198166847229, |
| "reward_std": 0.9904958009719849, |
| "rewards/mrr_reward": 0.13051215931773186, |
| "rewards/rank_analyze_format_reward": 0.2530582267791033, |
| "rewards/rank_answer_foramt_reward": 0.65625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.59375, |
| "epoch": 0.112, |
| "grad_norm": 0.020436184480786324, |
| "kl": -3.859400749206543e-06, |
| "learning_rate": 1.9999994946762974e-05, |
| "loss": -0.0097, |
| "reward": 4.348296344280243, |
| "reward_std": 1.4071729183197021, |
| "rewards/mrr_reward": 0.19649058394134045, |
| "rewards/rank_analyze_format_reward": 0.12483388930559158, |
| "rewards/rank_answer_foramt_reward": 0.5546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.84375, |
| "epoch": 0.12, |
| "grad_norm": 0.019303128123283386, |
| "kl": -2.339482307434082e-06, |
| "learning_rate": 1.999999210431752e-05, |
| "loss": -0.0125, |
| "reward": 4.1298569440841675, |
| "reward_std": 1.0909616947174072, |
| "rewards/mrr_reward": 0.11587301827967167, |
| "rewards/rank_analyze_format_reward": 0.19317593052983284, |
| "rewards/rank_answer_foramt_reward": 0.607421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992897808551788, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.9992897808551788, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 489.984375, |
| "epoch": 0.128, |
| "grad_norm": 0.02006162703037262, |
| "kl": -5.513429641723633e-07, |
| "learning_rate": 1.9999988630217885e-05, |
| "loss": 0.004, |
| "reward": 4.19925457239151, |
| "reward_std": 1.0583490580320358, |
| "rewards/mrr_reward": 0.15414806827902794, |
| "rewards/rank_analyze_format_reward": 0.2584436684846878, |
| "rewards/rank_answer_foramt_reward": 0.55859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.984375, |
| "epoch": 0.136, |
| "grad_norm": 0.02006162703037262, |
| "kl": -3.8743019104003906e-07, |
| "learning_rate": 1.9999988630217885e-05, |
| "loss": -0.0332, |
| "reward": 3.918194353580475, |
| "reward_std": 1.2305240333080292, |
| "rewards/mrr_reward": 0.12146577425301075, |
| "rewards/rank_analyze_format_reward": 0.1569407321512699, |
| "rewards/rank_answer_foramt_reward": 0.556640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.953125, |
| "rewards/rank_overall_format_reward_more": 0.8125, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.96875, |
| "epoch": 0.144, |
| "grad_norm": 0.01881454698741436, |
| "kl": 1.4901161193847656e-08, |
| "learning_rate": 1.999998452446429e-05, |
| "loss": -0.0496, |
| "reward": 4.462850987911224, |
| "reward_std": 1.4440096318721771, |
| "rewards/mrr_reward": 0.19556052424013615, |
| "rewards/rank_analyze_format_reward": 0.2645931877195835, |
| "rewards/rank_answer_foramt_reward": 0.689453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.8046875, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.109375, |
| "epoch": 0.152, |
| "grad_norm": 0.01960228569805622, |
| "kl": 2.5480985641479492e-06, |
| "learning_rate": 1.9999979787056998e-05, |
| "loss": -0.0073, |
| "reward": 4.165937960147858, |
| "reward_std": 1.3161405473947525, |
| "rewards/mrr_reward": 0.11880580708384514, |
| "rewards/rank_analyze_format_reward": 0.3187452331185341, |
| "rewards/rank_answer_foramt_reward": 0.568359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9682112038135529, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.9682112038135529, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.09375, |
| "epoch": 0.16, |
| "grad_norm": 0.02011404000222683, |
| "kl": 5.4389238357543945e-06, |
| "learning_rate": 1.9999974417996303e-05, |
| "loss": -0.0173, |
| "reward": 4.031284391880035, |
| "reward_std": 1.1271260976791382, |
| "rewards/mrr_reward": 0.09977678954601288, |
| "rewards/rank_analyze_format_reward": 0.31458218209445477, |
| "rewards/rank_answer_foramt_reward": 0.4765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998641312122345, |
| "rewards/rank_overall_format_reward_more": 0.859375, |
| "rewards/rank_verify_format_reward": 0.983016312122345, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.15625, |
| "epoch": 0.168, |
| "grad_norm": 0.020767828449606895, |
| "kl": 1.1399388313293457e-05, |
| "learning_rate": 1.9999968417282542e-05, |
| "loss": 0.0079, |
| "reward": 4.090369284152985, |
| "reward_std": 0.8935733437538147, |
| "rewards/mrr_reward": 0.10092386044561863, |
| "rewards/rank_analyze_format_reward": 0.18777898885309696, |
| "rewards/rank_answer_foramt_reward": 0.6328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.995541125535965, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.995541125535965, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 497.125, |
| "epoch": 0.176, |
| "grad_norm": 0.020388908684253693, |
| "kl": 1.4647841453552246e-05, |
| "learning_rate": 1.99999617849161e-05, |
| "loss": -0.0264, |
| "reward": 4.866297721862793, |
| "reward_std": 1.3940207660198212, |
| "rewards/mrr_reward": 0.27614088356494904, |
| "rewards/rank_analyze_format_reward": 0.23634351417422295, |
| "rewards/rank_answer_foramt_reward": 0.705078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 493.0, |
| "epoch": 0.184, |
| "grad_norm": 0.021081620827317238, |
| "kl": 2.060830593109131e-05, |
| "learning_rate": 1.9999954520897394e-05, |
| "loss": -0.0198, |
| "reward": 4.1482550501823425, |
| "reward_std": 0.7457377761602402, |
| "rewards/mrr_reward": 0.09423363115638494, |
| "rewards/rank_analyze_format_reward": 0.24007043987512589, |
| "rewards/rank_answer_foramt_reward": 0.640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.25, |
| "epoch": 0.192, |
| "grad_norm": 0.021528450772166252, |
| "kl": 2.8684735298156738e-05, |
| "learning_rate": 1.999994662522688e-05, |
| "loss": -0.0459, |
| "reward": 4.055303335189819, |
| "reward_std": 1.1271640360355377, |
| "rewards/mrr_reward": 0.12201760895550251, |
| "rewards/rank_analyze_format_reward": 0.23910778760910034, |
| "rewards/rank_answer_foramt_reward": 0.5546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8046875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 514.78125, |
| "epoch": 0.2, |
| "grad_norm": 0.020565951243042946, |
| "kl": 3.625452518463135e-05, |
| "learning_rate": 1.9999938097905064e-05, |
| "loss": 0.022, |
| "reward": 4.250920534133911, |
| "reward_std": 1.3971717804670334, |
| "rewards/mrr_reward": 0.1861669160425663, |
| "rewards/rank_analyze_format_reward": 0.20351847913116217, |
| "rewards/rank_answer_foramt_reward": 0.638671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.7578125, |
| "rewards/rank_verify_format_reward": 0.9375, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.25, |
| "epoch": 0.208, |
| "grad_norm": 0.01966329663991928, |
| "kl": 3.674626350402832e-05, |
| "learning_rate": 1.9999928938932473e-05, |
| "loss": -0.0257, |
| "reward": 4.075399398803711, |
| "reward_std": 1.2551968395709991, |
| "rewards/mrr_reward": 0.10590897873044014, |
| "rewards/rank_analyze_format_reward": 0.28262292593717575, |
| "rewards/rank_answer_foramt_reward": 0.564453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 477.421875, |
| "epoch": 0.216, |
| "grad_norm": 0.02318265102803707, |
| "kl": 5.188584327697754e-05, |
| "learning_rate": 1.99999191483097e-05, |
| "loss": -0.0383, |
| "reward": 4.07093209028244, |
| "reward_std": 1.1806218922138214, |
| "rewards/mrr_reward": 0.13093997910618782, |
| "rewards/rank_analyze_format_reward": 0.20123931858688593, |
| "rewards/rank_answer_foramt_reward": 0.498046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.8515625, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.625, |
| "epoch": 0.224, |
| "grad_norm": 0.02106453664600849, |
| "kl": 5.2034854888916016e-05, |
| "learning_rate": 1.999990872603735e-05, |
| "loss": -0.0142, |
| "reward": 4.171126127243042, |
| "reward_std": 0.9384299516677856, |
| "rewards/mrr_reward": 0.10311880148947239, |
| "rewards/rank_analyze_format_reward": 0.24095792695879936, |
| "rewards/rank_answer_foramt_reward": 0.66015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.96875, |
| "epoch": 0.232, |
| "grad_norm": 0.021827075630426407, |
| "kl": 6.267428398132324e-05, |
| "learning_rate": 1.999989767211609e-05, |
| "loss": -0.0406, |
| "reward": 4.739075601100922, |
| "reward_std": 1.122992992401123, |
| "rewards/mrr_reward": 0.2242001499980688, |
| "rewards/rank_analyze_format_reward": 0.1782125374302268, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.5625, |
| "epoch": 0.24, |
| "grad_norm": 0.019971711561083794, |
| "kl": 6.35683536529541e-05, |
| "learning_rate": 1.9999885986546613e-05, |
| "loss": -0.0358, |
| "reward": 4.3094329833984375, |
| "reward_std": 0.8498065173625946, |
| "rewards/mrr_reward": 0.09677579626441002, |
| "rewards/rank_analyze_format_reward": 0.29673536494374275, |
| "rewards/rank_answer_foramt_reward": 0.705078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9993206560611725, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9993206560611725, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.0, |
| "epoch": 0.248, |
| "grad_norm": 0.02139338292181492, |
| "kl": 0.00010059773921966553, |
| "learning_rate": 1.999987366932966e-05, |
| "loss": -0.0341, |
| "reward": 4.280443549156189, |
| "reward_std": 1.3128504306077957, |
| "rewards/mrr_reward": 0.1447172649204731, |
| "rewards/rank_analyze_format_reward": 0.3246212564408779, |
| "rewards/rank_answer_foramt_reward": 0.572265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.8359375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.296875, |
| "epoch": 0.256, |
| "grad_norm": 0.02021283283829689, |
| "kl": 9.936094284057617e-05, |
| "learning_rate": 1.9999860720466007e-05, |
| "loss": -0.0208, |
| "reward": 4.884114027023315, |
| "reward_std": 1.141958087682724, |
| "rewards/mrr_reward": 0.22592385485768318, |
| "rewards/rank_analyze_format_reward": 0.31244974583387375, |
| "rewards/rank_answer_foramt_reward": 0.80078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 500.859375, |
| "epoch": 0.264, |
| "grad_norm": 0.02373325079679489, |
| "kl": 0.00011742115020751953, |
| "learning_rate": 1.9999847139956477e-05, |
| "loss": -0.0074, |
| "reward": 4.154786288738251, |
| "reward_std": 1.1190759539604187, |
| "rewards/mrr_reward": 0.15658481419086456, |
| "rewards/rank_analyze_format_reward": 0.17102508060634136, |
| "rewards/rank_answer_foramt_reward": 0.552734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.6875, |
| "epoch": 0.272, |
| "grad_norm": 0.024350695312023163, |
| "kl": 0.00012856721878051758, |
| "learning_rate": 1.9999832927801922e-05, |
| "loss": -0.0463, |
| "reward": 4.32455313205719, |
| "reward_std": 1.5562799572944641, |
| "rewards/mrr_reward": 0.20357143133878708, |
| "rewards/rank_analyze_format_reward": 0.10987668856978416, |
| "rewards/rank_answer_foramt_reward": 0.525390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.375, |
| "epoch": 0.28, |
| "grad_norm": 0.019665928557515144, |
| "kl": 0.00014293193817138672, |
| "learning_rate": 1.9999818084003243e-05, |
| "loss": -0.0164, |
| "reward": 4.593672394752502, |
| "reward_std": 1.1871068179607391, |
| "rewards/mrr_reward": 0.1902901791036129, |
| "rewards/rank_analyze_format_reward": 0.28130697179585695, |
| "rewards/rank_answer_foramt_reward": 0.6875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.8828125, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.890625, |
| "epoch": 0.288, |
| "grad_norm": 0.02193881757557392, |
| "kl": 0.00016960501670837402, |
| "learning_rate": 1.999980260856137e-05, |
| "loss": -0.0276, |
| "reward": 4.122673153877258, |
| "reward_std": 0.916993722319603, |
| "rewards/mrr_reward": 0.11755332630127668, |
| "rewards/rank_analyze_format_reward": 0.20117305219173431, |
| "rewards/rank_answer_foramt_reward": 0.59375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 501.9375, |
| "epoch": 0.296, |
| "grad_norm": 0.020937107503414154, |
| "kl": 0.00017371773719787598, |
| "learning_rate": 1.9999786501477298e-05, |
| "loss": 0.002, |
| "reward": 4.0404258370399475, |
| "reward_std": 1.075580656528473, |
| "rewards/mrr_reward": 0.10114087350666523, |
| "rewards/rank_analyze_format_reward": 0.22352311667054892, |
| "rewards/rank_answer_foramt_reward": 0.619140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9825367629528046, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.9669117629528046, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.5625, |
| "epoch": 0.304, |
| "grad_norm": 0.022719040513038635, |
| "kl": 0.00017246603965759277, |
| "learning_rate": 1.9999769762752024e-05, |
| "loss": -0.0087, |
| "reward": 4.427178978919983, |
| "reward_std": 1.1200510263442993, |
| "rewards/mrr_reward": 0.14256572909653187, |
| "rewards/rank_analyze_format_reward": 0.3164584683254361, |
| "rewards/rank_answer_foramt_reward": 0.646484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977678656578064, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.9821428656578064, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 486.53125, |
| "epoch": 0.312, |
| "grad_norm": 0.023954233154654503, |
| "kl": 0.00021010637283325195, |
| "learning_rate": 1.999975239238662e-05, |
| "loss": 0.0227, |
| "reward": 4.685232400894165, |
| "reward_std": 1.559970200061798, |
| "rewards/mrr_reward": 0.2501426115632057, |
| "rewards/rank_analyze_format_reward": 0.19994227308779955, |
| "rewards/rank_answer_foramt_reward": 0.599609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9972426444292068, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9816176444292068, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.546875, |
| "epoch": 0.32, |
| "grad_norm": 0.021838972344994545, |
| "kl": 0.00023385882377624512, |
| "learning_rate": 1.999973439038218e-05, |
| "loss": -0.0279, |
| "reward": 4.507627367973328, |
| "reward_std": 1.3758054077625275, |
| "rewards/mrr_reward": 0.1917472742497921, |
| "rewards/rank_analyze_format_reward": 0.2892364487051964, |
| "rewards/rank_answer_foramt_reward": 0.564453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9825367629528046, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.25, |
| "epoch": 0.328, |
| "grad_norm": 0.02145545743405819, |
| "kl": 0.00022032856941223145, |
| "learning_rate": 1.9999715756739833e-05, |
| "loss": -0.0426, |
| "reward": 4.766237854957581, |
| "reward_std": 1.3724510371685028, |
| "rewards/mrr_reward": 0.24400422349572182, |
| "rewards/rank_analyze_format_reward": 0.22967395186424255, |
| "rewards/rank_answer_foramt_reward": 0.677734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.625, |
| "epoch": 0.336, |
| "grad_norm": 0.02246415615081787, |
| "kl": 0.0002751350402832031, |
| "learning_rate": 1.9999696491460764e-05, |
| "loss": -0.0425, |
| "reward": 4.64515745639801, |
| "reward_std": 0.9094655960798264, |
| "rewards/mrr_reward": 0.16312624514102936, |
| "rewards/rank_analyze_format_reward": 0.3751567006111145, |
| "rewards/rank_answer_foramt_reward": 0.744140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.890625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 502.515625, |
| "epoch": 0.344, |
| "grad_norm": 0.024739902466535568, |
| "kl": 0.0003191530704498291, |
| "learning_rate": 1.9999676594546187e-05, |
| "loss": -0.038, |
| "reward": 4.759453654289246, |
| "reward_std": 1.3180456161499023, |
| "rewards/mrr_reward": 0.2570870481431484, |
| "rewards/rank_analyze_format_reward": 0.16787387989461422, |
| "rewards/rank_answer_foramt_reward": 0.646484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 529.53125, |
| "epoch": 0.352, |
| "grad_norm": 0.022288991138339043, |
| "kl": 0.0002784132957458496, |
| "learning_rate": 1.999965606599736e-05, |
| "loss": -0.0421, |
| "reward": 4.620839357376099, |
| "reward_std": 1.0206461399793625, |
| "rewards/mrr_reward": 0.16845857724547386, |
| "rewards/rank_analyze_format_reward": 0.30247366055846214, |
| "rewards/rank_answer_foramt_reward": 0.74609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.03125, |
| "epoch": 0.36, |
| "grad_norm": 0.022286290302872658, |
| "kl": 0.00043213367462158203, |
| "learning_rate": 1.999963490581558e-05, |
| "loss": -0.0056, |
| "reward": 4.52034318447113, |
| "reward_std": 0.9805040061473846, |
| "rewards/mrr_reward": 0.16817336902022362, |
| "rewards/rank_analyze_format_reward": 0.2824648283421993, |
| "rewards/rank_answer_foramt_reward": 0.6484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.46875, |
| "epoch": 0.368, |
| "grad_norm": 0.023201555013656616, |
| "kl": 0.00047147274017333984, |
| "learning_rate": 1.9999613114002184e-05, |
| "loss": -0.0461, |
| "reward": 4.407416224479675, |
| "reward_std": 1.2047448754310608, |
| "rewards/mrr_reward": 0.1641245037317276, |
| "rewards/rank_analyze_format_reward": 0.3212462067604065, |
| "rewards/rank_answer_foramt_reward": 0.64453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9667892158031464, |
| "rewards/rank_overall_format_reward_more": 0.8671875, |
| "rewards/rank_verify_format_reward": 0.9511642158031464, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 524.46875, |
| "epoch": 0.376, |
| "grad_norm": 0.023201555013656616, |
| "kl": 0.00044339895248413086, |
| "learning_rate": 1.9999613114002184e-05, |
| "loss": -0.0536, |
| "reward": 4.553882956504822, |
| "reward_std": 1.4308572709560394, |
| "rewards/mrr_reward": 0.17046131566166878, |
| "rewards/rank_analyze_format_reward": 0.4204007051885128, |
| "rewards/rank_answer_foramt_reward": 0.609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.84375, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 510.984375, |
| "epoch": 0.384, |
| "grad_norm": 0.02642284519970417, |
| "kl": 0.00043976306915283203, |
| "learning_rate": 1.9999590690558545e-05, |
| "loss": 0.0066, |
| "reward": 4.590193271636963, |
| "reward_std": 1.2087296098470688, |
| "rewards/mrr_reward": 0.20808532275259495, |
| "rewards/rank_analyze_format_reward": 0.13480500131845474, |
| "rewards/rank_answer_foramt_reward": 0.755859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 539.734375, |
| "epoch": 0.392, |
| "grad_norm": 0.0221265759319067, |
| "kl": 0.00046002864837646484, |
| "learning_rate": 1.9999567635486086e-05, |
| "loss": -0.0091, |
| "reward": 4.26533442735672, |
| "reward_std": 1.1292133778333664, |
| "rewards/mrr_reward": 0.1356088798493147, |
| "rewards/rank_analyze_format_reward": 0.27040086686611176, |
| "rewards/rank_answer_foramt_reward": 0.568359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967568069696426, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9811318069696426, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 507.84375, |
| "epoch": 0.4, |
| "grad_norm": 0.02278602309525013, |
| "kl": 0.0005121231079101562, |
| "learning_rate": 1.9999543948786258e-05, |
| "loss": -0.0093, |
| "reward": 4.813909411430359, |
| "reward_std": 1.5017302483320236, |
| "rewards/mrr_reward": 0.25014261342585087, |
| "rewards/rank_analyze_format_reward": 0.2544366829097271, |
| "rewards/rank_answer_foramt_reward": 0.693359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.8984375, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.59375, |
| "epoch": 0.408, |
| "grad_norm": 0.022586733102798462, |
| "kl": 0.0003803372383117676, |
| "learning_rate": 1.9999519630460554e-05, |
| "loss": -0.0174, |
| "reward": 4.623661637306213, |
| "reward_std": 0.6876689344644547, |
| "rewards/mrr_reward": 0.14153646305203438, |
| "rewards/rank_analyze_format_reward": 0.31521353125572205, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 535.28125, |
| "epoch": 0.416, |
| "grad_norm": 0.022564509883522987, |
| "kl": 0.0006885528564453125, |
| "learning_rate": 1.999949468051052e-05, |
| "loss": -0.0146, |
| "reward": 4.366745591163635, |
| "reward_std": 1.1478500664234161, |
| "rewards/mrr_reward": 0.1415860652923584, |
| "rewards/rank_analyze_format_reward": 0.39610453229397535, |
| "rewards/rank_answer_foramt_reward": 0.529296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.671875, |
| "epoch": 0.424, |
| "grad_norm": 0.02404443360865116, |
| "kl": 0.0006675124168395996, |
| "learning_rate": 1.9999469098937726e-05, |
| "loss": -0.0412, |
| "reward": 4.3376225233078, |
| "reward_std": 0.9486726224422455, |
| "rewards/mrr_reward": 0.10815352387726307, |
| "rewards/rank_analyze_format_reward": 0.36356932669878006, |
| "rewards/rank_answer_foramt_reward": 0.630859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.125, |
| "epoch": 0.432, |
| "grad_norm": 0.024322351440787315, |
| "kl": 0.000644683837890625, |
| "learning_rate": 1.9999442885743785e-05, |
| "loss": -0.0459, |
| "reward": 4.411644458770752, |
| "reward_std": 1.2008217573165894, |
| "rewards/mrr_reward": 0.12810019869357347, |
| "rewards/rank_analyze_format_reward": 0.3992435559630394, |
| "rewards/rank_answer_foramt_reward": 0.6484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.125, |
| "epoch": 0.44, |
| "grad_norm": 0.02503729611635208, |
| "kl": 0.0007516145706176758, |
| "learning_rate": 1.9999416040930354e-05, |
| "loss": -0.0562, |
| "reward": 4.842287182807922, |
| "reward_std": 1.241357833147049, |
| "rewards/mrr_reward": 0.20032241940498352, |
| "rewards/rank_analyze_format_reward": 0.4095212556421757, |
| "rewards/rank_answer_foramt_reward": 0.736328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9827302694320679, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 539.4375, |
| "epoch": 0.448, |
| "grad_norm": 0.02243031933903694, |
| "kl": 0.0006896257400512695, |
| "learning_rate": 1.9999388564499135e-05, |
| "loss": -0.0226, |
| "reward": 4.513183832168579, |
| "reward_std": 1.038706436753273, |
| "rewards/mrr_reward": 0.13224826380610466, |
| "rewards/rank_analyze_format_reward": 0.35126328840851784, |
| "rewards/rank_answer_foramt_reward": 0.705078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.796875, |
| "epoch": 0.456, |
| "grad_norm": 0.026454076170921326, |
| "kl": 0.0011850595474243164, |
| "learning_rate": 1.999936045645186e-05, |
| "loss": 0.0105, |
| "reward": 4.213799595832825, |
| "reward_std": 0.925405740737915, |
| "rewards/mrr_reward": 0.09674479439854622, |
| "rewards/rank_analyze_format_reward": 0.32846502028405666, |
| "rewards/rank_answer_foramt_reward": 0.59375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 521.25, |
| "epoch": 0.464, |
| "grad_norm": 0.0244609247893095, |
| "kl": 0.0009926557540893555, |
| "learning_rate": 1.9999331716790303e-05, |
| "loss": -0.0587, |
| "reward": 4.537912011146545, |
| "reward_std": 1.1836341470479965, |
| "rewards/mrr_reward": 0.16721230559051037, |
| "rewards/rank_analyze_format_reward": 0.22257816419005394, |
| "rewards/rank_answer_foramt_reward": 0.748046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 505.328125, |
| "epoch": 0.472, |
| "grad_norm": 0.02554202266037464, |
| "kl": 0.0009477138519287109, |
| "learning_rate": 1.9999302345516278e-05, |
| "loss": -0.04, |
| "reward": 4.834690093994141, |
| "reward_std": 1.2422936260700226, |
| "rewards/mrr_reward": 0.23472222685813904, |
| "rewards/rank_analyze_format_reward": 0.2765456959605217, |
| "rewards/rank_answer_foramt_reward": 0.67578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 525.5, |
| "epoch": 0.48, |
| "grad_norm": 0.02595827914774418, |
| "kl": 0.0011942386627197266, |
| "learning_rate": 1.9999272342631644e-05, |
| "loss": -0.0433, |
| "reward": 4.970677137374878, |
| "reward_std": 1.340297669172287, |
| "rewards/mrr_reward": 0.24613716453313828, |
| "rewards/rank_analyze_format_reward": 0.4353472888469696, |
| "rewards/rank_answer_foramt_reward": 0.65234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.796875, |
| "epoch": 0.488, |
| "grad_norm": 0.023065784946084023, |
| "kl": 0.0008342266082763672, |
| "learning_rate": 1.9999241708138296e-05, |
| "loss": -0.0182, |
| "reward": 5.042990684509277, |
| "reward_std": 1.06068916618824, |
| "rewards/mrr_reward": 0.21142732724547386, |
| "rewards/rank_analyze_format_reward": 0.5171288028359413, |
| "rewards/rank_answer_foramt_reward": 0.720703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 544.265625, |
| "epoch": 0.496, |
| "grad_norm": 0.02461962401866913, |
| "kl": 0.0009695291519165039, |
| "learning_rate": 1.9999210442038164e-05, |
| "loss": -0.0215, |
| "reward": 4.853347659111023, |
| "reward_std": 0.9557467103004456, |
| "rewards/mrr_reward": 0.19696180522441864, |
| "rewards/rank_analyze_format_reward": 0.3876512125134468, |
| "rewards/rank_answer_foramt_reward": 0.7734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 537.484375, |
| "epoch": 0.504, |
| "grad_norm": 0.024909336119890213, |
| "kl": 0.0018236637115478516, |
| "learning_rate": 1.9999178544333228e-05, |
| "loss": -0.0161, |
| "reward": 4.388993203639984, |
| "reward_std": 1.0270372480154037, |
| "rewards/mrr_reward": 0.09973958693444729, |
| "rewards/rank_analyze_format_reward": 0.47624821215867996, |
| "rewards/rank_answer_foramt_reward": 0.6640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9140625, |
| "rewards/rank_verify_format_reward": 0.9522058814764023, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 542.828125, |
| "epoch": 0.512, |
| "grad_norm": 0.024513162672519684, |
| "kl": 0.0010799169540405273, |
| "learning_rate": 1.9999146015025503e-05, |
| "loss": -0.0241, |
| "reward": 4.562963366508484, |
| "reward_std": 0.8578247427940369, |
| "rewards/mrr_reward": 0.12247024103999138, |
| "rewards/rank_analyze_format_reward": 0.3348013088107109, |
| "rewards/rank_answer_foramt_reward": 0.80078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 551.046875, |
| "epoch": 0.52, |
| "grad_norm": 0.025237975642085075, |
| "kl": 0.0012438297271728516, |
| "learning_rate": 1.999911285411704e-05, |
| "loss": -0.032, |
| "reward": 4.392790853977203, |
| "reward_std": 0.9753015786409378, |
| "rewards/mrr_reward": 0.11674107611179352, |
| "rewards/rank_analyze_format_reward": 0.36466294899582863, |
| "rewards/rank_answer_foramt_reward": 0.705078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9827302694320679, |
| "rewards/rank_overall_format_reward_more": 0.90625, |
| "rewards/rank_verify_format_reward": 0.9671052694320679, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.96875, |
| "epoch": 0.528, |
| "grad_norm": 0.025285648182034492, |
| "kl": 0.0014480352401733398, |
| "learning_rate": 1.9999079061609933e-05, |
| "loss": -0.0342, |
| "reward": 4.65506386756897, |
| "reward_std": 1.2143934965133667, |
| "rewards/mrr_reward": 0.16987847164273262, |
| "rewards/rank_analyze_format_reward": 0.4049289934337139, |
| "rewards/rank_answer_foramt_reward": 0.634765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 520.1875, |
| "epoch": 0.536, |
| "grad_norm": 0.02740044705569744, |
| "kl": 0.001478433609008789, |
| "learning_rate": 1.999904463750632e-05, |
| "loss": -0.0097, |
| "reward": 4.9612908363342285, |
| "reward_std": 1.1755748093128204, |
| "rewards/mrr_reward": 0.24047619476914406, |
| "rewards/rank_analyze_format_reward": 0.309462770819664, |
| "rewards/rank_answer_foramt_reward": 0.734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.993399053812027, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.993399053812027, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 561.734375, |
| "epoch": 0.544, |
| "grad_norm": 0.026321450248360634, |
| "kl": 0.001252889633178711, |
| "learning_rate": 1.999900958180838e-05, |
| "loss": -0.0504, |
| "reward": 5.649916648864746, |
| "reward_std": 1.1421409100294113, |
| "rewards/mrr_reward": 0.3350074402987957, |
| "rewards/rank_analyze_format_reward": 0.4505118057131767, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 559.59375, |
| "epoch": 0.552, |
| "grad_norm": 0.02405831590294838, |
| "kl": 0.0014369487762451172, |
| "learning_rate": 1.9998973894518318e-05, |
| "loss": 0.0015, |
| "reward": 4.788713574409485, |
| "reward_std": 0.9084204286336899, |
| "rewards/mrr_reward": 0.1370349731296301, |
| "rewards/rank_analyze_format_reward": 0.5101048266515136, |
| "rewards/rank_answer_foramt_reward": 0.81640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.03125, |
| "epoch": 0.56, |
| "grad_norm": 0.0266768429428339, |
| "kl": 0.001832723617553711, |
| "learning_rate": 1.999893757563839e-05, |
| "loss": -0.0401, |
| "reward": 4.823967456817627, |
| "reward_std": 1.1708943247795105, |
| "rewards/mrr_reward": 0.20266617462038994, |
| "rewards/rank_analyze_format_reward": 0.36864544451236725, |
| "rewards/rank_answer_foramt_reward": 0.705078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9971333742141724, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9815083742141724, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 583.609375, |
| "epoch": 0.568, |
| "grad_norm": 0.025261225178837776, |
| "kl": 0.001687765121459961, |
| "learning_rate": 1.9998900625170897e-05, |
| "loss": -0.0282, |
| "reward": 5.068133354187012, |
| "reward_std": 1.3164568394422531, |
| "rewards/mrr_reward": 0.24970858544111252, |
| "rewards/rank_analyze_format_reward": 0.37027864158153534, |
| "rewards/rank_answer_foramt_reward": 0.74609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959945678710938, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9959945678710938, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 579.515625, |
| "epoch": 0.576, |
| "grad_norm": 0.024856774136424065, |
| "kl": 0.0020842552185058594, |
| "learning_rate": 1.9998863043118163e-05, |
| "loss": -0.0296, |
| "reward": 4.706835389137268, |
| "reward_std": 0.9307773113250732, |
| "rewards/mrr_reward": 0.12280506081879139, |
| "rewards/rank_analyze_format_reward": 0.5460995584726334, |
| "rewards/rank_answer_foramt_reward": 0.70703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968671798706055, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9968671798706055, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.359375, |
| "epoch": 0.584, |
| "grad_norm": 0.025794433429837227, |
| "kl": 0.002407550811767578, |
| "learning_rate": 1.999882482948257e-05, |
| "loss": -0.0073, |
| "reward": 4.6565152406692505, |
| "reward_std": 0.9558501094579697, |
| "rewards/mrr_reward": 0.15843874588608742, |
| "rewards/rank_analyze_format_reward": 0.38018228113651276, |
| "rewards/rank_answer_foramt_reward": 0.759765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 545.03125, |
| "epoch": 0.592, |
| "grad_norm": 0.025461561977863312, |
| "kl": 0.0017633438110351562, |
| "learning_rate": 1.999878598426653e-05, |
| "loss": -0.0305, |
| "reward": 5.3728920221328735, |
| "reward_std": 0.9629544615745544, |
| "rewards/mrr_reward": 0.28620412945747375, |
| "rewards/rank_analyze_format_reward": 0.46884432435035706, |
| "rewards/rank_answer_foramt_reward": 0.78125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9968030601739883, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9968030601739883, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 508.765625, |
| "epoch": 0.6, |
| "grad_norm": 0.03176024928689003, |
| "kl": 0.002418041229248047, |
| "learning_rate": 1.9998746507472493e-05, |
| "loss": -0.046, |
| "reward": 4.1894320249557495, |
| "reward_std": 0.9330323338508606, |
| "rewards/mrr_reward": 0.08410218358039856, |
| "rewards/rank_analyze_format_reward": 0.287723608314991, |
| "rewards/rank_answer_foramt_reward": 0.626953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9808920323848724, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9808920323848724, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.21875, |
| "epoch": 0.608, |
| "grad_norm": 0.0266107227653265, |
| "kl": 0.0020155906677246094, |
| "learning_rate": 1.999870639910296e-05, |
| "loss": -0.0428, |
| "reward": 4.562186181545258, |
| "reward_std": 0.9480538219213486, |
| "rewards/mrr_reward": 0.13458581641316414, |
| "rewards/rank_analyze_format_reward": 0.4180658236145973, |
| "rewards/rank_answer_foramt_reward": 0.669921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.546875, |
| "epoch": 0.616, |
| "grad_norm": 0.027047261595726013, |
| "kl": 0.002165555953979492, |
| "learning_rate": 1.9998665659160453e-05, |
| "loss": 0.003, |
| "reward": 5.2997212409973145, |
| "reward_std": 1.2724156975746155, |
| "rewards/mrr_reward": 0.2895585522055626, |
| "rewards/rank_analyze_format_reward": 0.3944082595407963, |
| "rewards/rank_answer_foramt_reward": 0.771484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9917034357786179, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9917034357786179, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 539.265625, |
| "epoch": 0.624, |
| "grad_norm": 0.026604430750012398, |
| "kl": 0.0022330284118652344, |
| "learning_rate": 1.999862428764756e-05, |
| "loss": -0.051, |
| "reward": 4.927618980407715, |
| "reward_std": 1.0934423208236694, |
| "rewards/mrr_reward": 0.18596850894391537, |
| "rewards/rank_analyze_format_reward": 0.4122604951262474, |
| "rewards/rank_answer_foramt_reward": 0.794921875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 553.671875, |
| "epoch": 0.632, |
| "grad_norm": 0.025129646062850952, |
| "kl": 0.001984834671020508, |
| "learning_rate": 1.9998582284566878e-05, |
| "loss": -0.0399, |
| "reward": 4.638971567153931, |
| "reward_std": 1.0905082076787949, |
| "rewards/mrr_reward": 0.13253968209028244, |
| "rewards/rank_analyze_format_reward": 0.4443269595503807, |
| "rewards/rank_answer_foramt_reward": 0.68359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.5625, |
| "epoch": 0.64, |
| "grad_norm": 0.025474058464169502, |
| "kl": 0.0023970603942871094, |
| "learning_rate": 1.999853964992107e-05, |
| "loss": -0.0208, |
| "reward": 5.141489744186401, |
| "reward_std": 0.9898062199354172, |
| "rewards/mrr_reward": 0.19254092685878277, |
| "rewards/rank_analyze_format_reward": 0.6555207520723343, |
| "rewards/rank_answer_foramt_reward": 0.787109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9955979138612747, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9799729138612747, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.5625, |
| "epoch": 0.648, |
| "grad_norm": 0.029823826625943184, |
| "kl": 0.002605438232421875, |
| "learning_rate": 1.9998496383712828e-05, |
| "loss": -0.0459, |
| "reward": 4.708982348442078, |
| "reward_std": 0.832055389881134, |
| "rewards/mrr_reward": 0.11339906044304371, |
| "rewards/rank_analyze_format_reward": 0.5364454686641693, |
| "rewards/rank_answer_foramt_reward": 0.75390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9903296828269958, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9903296828269958, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.6875, |
| "epoch": 0.656, |
| "grad_norm": 0.02752668969333172, |
| "kl": 0.002631664276123047, |
| "learning_rate": 1.999845248594489e-05, |
| "loss": -0.0398, |
| "reward": 4.8155412673950195, |
| "reward_std": 0.9520252794027328, |
| "rewards/mrr_reward": 0.14484747499227524, |
| "rewards/rank_analyze_format_reward": 0.5442800670862198, |
| "rewards/rank_answer_foramt_reward": 0.724609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 573.71875, |
| "epoch": 0.664, |
| "grad_norm": 0.028707344084978104, |
| "kl": 0.002571582794189453, |
| "learning_rate": 1.9998407956620017e-05, |
| "loss": -0.0306, |
| "reward": 5.175315976142883, |
| "reward_std": 1.2197599858045578, |
| "rewards/mrr_reward": 0.24100322648882866, |
| "rewards/rank_analyze_format_reward": 0.45521388202905655, |
| "rewards/rank_answer_foramt_reward": 0.775390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 557.078125, |
| "epoch": 0.672, |
| "grad_norm": 0.02770584635436535, |
| "kl": 0.003272533416748047, |
| "learning_rate": 1.9998362795741027e-05, |
| "loss": 0.0127, |
| "reward": 4.81751024723053, |
| "reward_std": 1.0414631068706512, |
| "rewards/mrr_reward": 0.18727059103548527, |
| "rewards/rank_analyze_format_reward": 0.4546085884794593, |
| "rewards/rank_answer_foramt_reward": 0.681640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9817143976688385, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9817143976688385, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 562.15625, |
| "epoch": 0.68, |
| "grad_norm": 0.029665743932127953, |
| "kl": 0.002528667449951172, |
| "learning_rate": 1.9998317003310775e-05, |
| "loss": -0.0482, |
| "reward": 4.359201908111572, |
| "reward_std": 0.8430032134056091, |
| "rewards/mrr_reward": 0.08872148208320141, |
| "rewards/rank_analyze_format_reward": 0.400491826236248, |
| "rewards/rank_answer_foramt_reward": 0.66796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9679276347160339, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.3125, |
| "epoch": 0.688, |
| "grad_norm": 0.029378948733210564, |
| "kl": 0.0034728050231933594, |
| "learning_rate": 1.9998270579332154e-05, |
| "loss": 0.0053, |
| "reward": 5.317029237747192, |
| "reward_std": 1.1373002529144287, |
| "rewards/mrr_reward": 0.2627728134393692, |
| "rewards/rank_analyze_format_reward": 0.5021510571241379, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 579.984375, |
| "epoch": 0.696, |
| "grad_norm": 0.027385709807276726, |
| "kl": 0.0028448104858398438, |
| "learning_rate": 1.9998223523808092e-05, |
| "loss": -0.0373, |
| "reward": 5.256059646606445, |
| "reward_std": 0.8636089265346527, |
| "rewards/mrr_reward": 0.21436012163758278, |
| "rewards/rank_analyze_format_reward": 0.5581663772463799, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.013700738549232483, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.390625, |
| "epoch": 0.704, |
| "grad_norm": 0.02710539661347866, |
| "kl": 0.0027756690979003906, |
| "learning_rate": 1.9998175836741564e-05, |
| "loss": -0.0384, |
| "reward": 4.694380164146423, |
| "reward_std": 0.9186579138040543, |
| "rewards/mrr_reward": 0.10895957797765732, |
| "rewards/rank_analyze_format_reward": 0.5749479159712791, |
| "rewards/rank_answer_foramt_reward": 0.74609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 579.078125, |
| "epoch": 0.712, |
| "grad_norm": 0.027813483029603958, |
| "kl": 0.0035161972045898438, |
| "learning_rate": 1.999812751813558e-05, |
| "loss": -0.0233, |
| "reward": 4.924348711967468, |
| "reward_std": 1.096758782863617, |
| "rewards/mrr_reward": 0.17403894662857056, |
| "rewards/rank_analyze_format_reward": 0.5381231904029846, |
| "rewards/rank_answer_foramt_reward": 0.779296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 597.78125, |
| "epoch": 0.72, |
| "grad_norm": 0.026623884215950966, |
| "kl": 0.003002643585205078, |
| "learning_rate": 1.9998078567993197e-05, |
| "loss": -0.0256, |
| "reward": 5.420621871948242, |
| "reward_std": 1.2407971769571304, |
| "rewards/mrr_reward": 0.2805493548512459, |
| "rewards/rank_analyze_format_reward": 0.5728173106908798, |
| "rewards/rank_answer_foramt_reward": 0.76171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9936629235744476, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9936629235744476, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.40625, |
| "epoch": 0.728, |
| "grad_norm": 0.027379710227251053, |
| "kl": 0.0037784576416015625, |
| "learning_rate": 1.9998028986317504e-05, |
| "loss": -0.0222, |
| "reward": 5.071754574775696, |
| "reward_std": 1.0615117102861404, |
| "rewards/mrr_reward": 0.21678448282182217, |
| "rewards/rank_analyze_format_reward": 0.39732154086232185, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9954443722963333, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9798193722963333, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.796875, |
| "epoch": 0.736, |
| "grad_norm": 0.028991688042879105, |
| "kl": 0.0032110214233398438, |
| "learning_rate": 1.999797877311163e-05, |
| "loss": -0.0177, |
| "reward": 5.152738690376282, |
| "reward_std": 1.0150837451219559, |
| "rewards/mrr_reward": 0.1862909272313118, |
| "rewards/rank_analyze_format_reward": 0.6470415517687798, |
| "rewards/rank_answer_foramt_reward": 0.78515625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9955011606216431, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9955011606216431, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 554.484375, |
| "epoch": 0.744, |
| "grad_norm": 0.028571411967277527, |
| "kl": 0.004134178161621094, |
| "learning_rate": 1.9997927928378753e-05, |
| "loss": -0.0372, |
| "reward": 4.580929517745972, |
| "reward_std": 0.7480403929948807, |
| "rewards/mrr_reward": 0.11786334402859211, |
| "rewards/rank_analyze_format_reward": 0.36851008981466293, |
| "rewards/rank_answer_foramt_reward": 0.76171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.625, |
| "epoch": 0.752, |
| "grad_norm": 0.02991093136370182, |
| "kl": 0.0034475326538085938, |
| "learning_rate": 1.999787645212208e-05, |
| "loss": -0.0337, |
| "reward": 4.8702027797698975, |
| "reward_std": 0.8430802449584007, |
| "rewards/mrr_reward": 0.13307911716401577, |
| "rewards/rank_analyze_format_reward": 0.6129686385393143, |
| "rewards/rank_answer_foramt_reward": 0.734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.765625, |
| "epoch": 0.76, |
| "grad_norm": 0.03038324974477291, |
| "kl": 0.00400543212890625, |
| "learning_rate": 1.999782434434486e-05, |
| "loss": -0.0202, |
| "reward": 5.1996424198150635, |
| "reward_std": 0.8570089638233185, |
| "rewards/mrr_reward": 0.20330480858683586, |
| "rewards/rank_analyze_format_reward": 0.5464644953608513, |
| "rewards/rank_answer_foramt_reward": 0.857421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.40625, |
| "epoch": 0.768, |
| "grad_norm": 0.028416253626346588, |
| "kl": 0.004006385803222656, |
| "learning_rate": 1.999777160505039e-05, |
| "loss": -0.0094, |
| "reward": 5.031667947769165, |
| "reward_std": 0.6344560533761978, |
| "rewards/mrr_reward": 0.14536210522055626, |
| "rewards/rank_analyze_format_reward": 0.6181881725788116, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.09375, |
| "epoch": 0.776, |
| "grad_norm": 0.0321107916533947, |
| "kl": 0.0050525665283203125, |
| "learning_rate": 1.9997718234242e-05, |
| "loss": -0.0221, |
| "reward": 5.048359394073486, |
| "reward_std": 1.0561828166246414, |
| "rewards/mrr_reward": 0.19980159029364586, |
| "rewards/rank_analyze_format_reward": 0.47314807027578354, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9915180057287216, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9758930057287216, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.359375, |
| "epoch": 0.784, |
| "grad_norm": 0.03290560096502304, |
| "kl": 0.004299163818359375, |
| "learning_rate": 1.999766423192306e-05, |
| "loss": -0.0549, |
| "reward": 5.590569615364075, |
| "reward_std": 0.9068724364042282, |
| "rewards/mrr_reward": 0.2803075537085533, |
| "rewards/rank_analyze_format_reward": 0.6597586870193481, |
| "rewards/rank_answer_foramt_reward": 0.798828125, |
| "rewards/rank_contrast_format_reward": 0.012397300451993942, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.375, |
| "epoch": 0.792, |
| "grad_norm": 0.028949948027729988, |
| "kl": 0.0046710968017578125, |
| "learning_rate": 1.9997609598096982e-05, |
| "loss": -0.017, |
| "reward": 4.953081727027893, |
| "reward_std": 0.8293813019990921, |
| "rewards/mrr_reward": 0.14769965037703514, |
| "rewards/rank_analyze_format_reward": 0.6502636969089508, |
| "rewards/rank_answer_foramt_reward": 0.763671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9976112246513367, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9976112246513367, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.71875, |
| "epoch": 0.8, |
| "grad_norm": 0.028032353147864342, |
| "kl": 0.004805564880371094, |
| "learning_rate": 1.9997554332767214e-05, |
| "loss": -0.0352, |
| "reward": 5.1254483461380005, |
| "reward_std": 0.728003740310669, |
| "rewards/mrr_reward": 0.14519469253718853, |
| "rewards/rank_analyze_format_reward": 0.7232527583837509, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985989332199097, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9985989332199097, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.296875, |
| "epoch": 0.808, |
| "grad_norm": 0.028648706153035164, |
| "kl": 0.004345893859863281, |
| "learning_rate": 1.9997498435937254e-05, |
| "loss": 0.0106, |
| "reward": 5.0007301568984985, |
| "reward_std": 0.9343436509370804, |
| "rewards/mrr_reward": 0.17731894738972187, |
| "rewards/rank_analyze_format_reward": 0.5347120687365532, |
| "rewards/rank_answer_foramt_reward": 0.791015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9945820420980453, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9945820420980453, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.828125, |
| "epoch": 0.816, |
| "grad_norm": 0.02675345167517662, |
| "kl": 0.004836559295654297, |
| "learning_rate": 1.9997441907610624e-05, |
| "loss": -0.0189, |
| "reward": 5.016782879829407, |
| "reward_std": 0.9857280552387238, |
| "rewards/mrr_reward": 0.17276166006922722, |
| "rewards/rank_analyze_format_reward": 0.5417740494012833, |
| "rewards/rank_answer_foramt_reward": 0.791015625, |
| "rewards/rank_contrast_format_reward": 0.010216346010565758, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.0625, |
| "epoch": 0.824, |
| "grad_norm": 0.027744626626372337, |
| "kl": 0.004809379577636719, |
| "learning_rate": 1.9997384747790903e-05, |
| "loss": -0.0061, |
| "reward": 5.06945013999939, |
| "reward_std": 1.0500756949186325, |
| "rewards/mrr_reward": 0.19982018508017063, |
| "rewards/rank_analyze_format_reward": 0.5732885971665382, |
| "rewards/rank_answer_foramt_reward": 0.76171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9792998284101486, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9792998284101486, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.90625, |
| "epoch": 0.832, |
| "grad_norm": 0.03045082278549671, |
| "kl": 0.00579833984375, |
| "learning_rate": 1.9997326956481693e-05, |
| "loss": -0.0106, |
| "reward": 4.810132622718811, |
| "reward_std": 0.8808675408363342, |
| "rewards/mrr_reward": 0.14889632910490036, |
| "rewards/rank_analyze_format_reward": 0.48346175998449326, |
| "rewards/rank_answer_foramt_reward": 0.7578125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.234375, |
| "epoch": 0.84, |
| "grad_norm": 0.030195703729987144, |
| "kl": 0.00594329833984375, |
| "learning_rate": 1.999726853368665e-05, |
| "loss": -0.0475, |
| "reward": 5.089649319648743, |
| "reward_std": 0.8489270955324173, |
| "rewards/mrr_reward": 0.1736421212553978, |
| "rewards/rank_analyze_format_reward": 0.5801311060786247, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9963420033454895, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9963420033454895, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.328125, |
| "epoch": 0.848, |
| "grad_norm": 0.03000379353761673, |
| "kl": 0.005078315734863281, |
| "learning_rate": 1.9997209479409464e-05, |
| "loss": 0.0322, |
| "reward": 5.540584683418274, |
| "reward_std": 0.9829646348953247, |
| "rewards/mrr_reward": 0.2296379003673792, |
| "rewards/rank_analyze_format_reward": 0.794389545917511, |
| "rewards/rank_answer_foramt_reward": 0.828125, |
| "rewards/rank_contrast_format_reward": 0.015083539299666882, |
| "rewards/rank_initial_format_reward": 0.9961237162351608, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9961237162351608, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.65625, |
| "epoch": 0.856, |
| "grad_norm": 0.028458530083298683, |
| "kl": 0.005078315734863281, |
| "learning_rate": 1.9997149793653862e-05, |
| "loss": -0.0095, |
| "reward": 4.97307014465332, |
| "reward_std": 0.578310415148735, |
| "rewards/mrr_reward": 0.1025235615670681, |
| "rewards/rank_analyze_format_reward": 0.713058277964592, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.03125, |
| "epoch": 0.864, |
| "grad_norm": 0.02653643861413002, |
| "kl": 0.006175994873046875, |
| "learning_rate": 1.9997089476423617e-05, |
| "loss": -0.0059, |
| "reward": 4.949914813041687, |
| "reward_std": 0.7462972551584244, |
| "rewards/mrr_reward": 0.11983507312834263, |
| "rewards/rank_analyze_format_reward": 0.7087408900260925, |
| "rewards/rank_answer_foramt_reward": 0.818359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.09375, |
| "epoch": 0.872, |
| "grad_norm": 0.028843272477388382, |
| "kl": 0.0054416656494140625, |
| "learning_rate": 1.999702852772254e-05, |
| "loss": -0.0267, |
| "reward": 5.561126232147217, |
| "reward_std": 0.9516362547874451, |
| "rewards/mrr_reward": 0.2666604742407799, |
| "rewards/rank_analyze_format_reward": 0.7099277526140213, |
| "rewards/rank_answer_foramt_reward": 0.806640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967704266309738, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9967704266309738, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.828125, |
| "epoch": 0.88, |
| "grad_norm": 0.02726060152053833, |
| "kl": 0.005953788757324219, |
| "learning_rate": 1.9996966947554476e-05, |
| "loss": -0.0389, |
| "reward": 5.334239721298218, |
| "reward_std": 1.061211720108986, |
| "rewards/mrr_reward": 0.22806919366121292, |
| "rewards/rank_analyze_format_reward": 0.7086126804351807, |
| "rewards/rank_answer_foramt_reward": 0.798828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9806985259056091, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9806985259056091, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.421875, |
| "epoch": 0.888, |
| "grad_norm": 0.02812885120511055, |
| "kl": 0.005530357360839844, |
| "learning_rate": 1.9996904735923325e-05, |
| "loss": -0.0271, |
| "reward": 5.212171792984009, |
| "reward_std": 1.2261153161525726, |
| "rewards/mrr_reward": 0.19998760521411896, |
| "rewards/rank_analyze_format_reward": 0.7305806577205658, |
| "rewards/rank_answer_foramt_reward": 0.798828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.953125, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.640625, |
| "epoch": 0.896, |
| "grad_norm": 0.030628954991698265, |
| "kl": 0.005995750427246094, |
| "learning_rate": 1.9996841892833e-05, |
| "loss": -0.0294, |
| "reward": 4.731189131736755, |
| "reward_std": 0.8310668021440506, |
| "rewards/mrr_reward": 0.10221974551677704, |
| "rewards/rank_analyze_format_reward": 0.6598925739526749, |
| "rewards/rank_answer_foramt_reward": 0.75, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9296875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.71875, |
| "epoch": 0.904, |
| "grad_norm": 0.02799079939723015, |
| "kl": 0.006671905517578125, |
| "learning_rate": 1.9996778418287486e-05, |
| "loss": -0.0042, |
| "reward": 4.8320887088775635, |
| "reward_std": 0.781333327293396, |
| "rewards/mrr_reward": 0.12659970112144947, |
| "rewards/rank_analyze_format_reward": 0.6272406578063965, |
| "rewards/rank_answer_foramt_reward": 0.771484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9791073650121689, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9791073650121689, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.5625, |
| "epoch": 0.912, |
| "grad_norm": 0.028244731947779655, |
| "kl": 0.0065898895263671875, |
| "learning_rate": 1.9996714312290784e-05, |
| "loss": -0.0069, |
| "reward": 5.762642502784729, |
| "reward_std": 1.0326203405857086, |
| "rewards/mrr_reward": 0.292788939550519, |
| "rewards/rank_analyze_format_reward": 0.7934152334928513, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.996692106127739, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.996692106127739, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.53125, |
| "epoch": 0.92, |
| "grad_norm": 0.03135911375284195, |
| "kl": 0.006131172180175781, |
| "learning_rate": 1.9996649574846948e-05, |
| "loss": 0.0157, |
| "reward": 5.772169351577759, |
| "reward_std": 0.9267353266477585, |
| "rewards/mrr_reward": 0.30804190039634705, |
| "rewards/rank_analyze_format_reward": 0.7891548573970795, |
| "rewards/rank_answer_foramt_reward": 0.84375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965170323848724, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9652670323848724, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.453125, |
| "epoch": 0.928, |
| "grad_norm": 0.028077326714992523, |
| "kl": 0.005988121032714844, |
| "learning_rate": 1.9996584205960063e-05, |
| "loss": -0.0113, |
| "reward": 5.589708924293518, |
| "reward_std": 1.1055090427398682, |
| "rewards/mrr_reward": 0.2775483652949333, |
| "rewards/rank_analyze_format_reward": 0.6549882963299751, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.6875, |
| "epoch": 0.936, |
| "grad_norm": 0.028432337567210197, |
| "kl": 0.006014823913574219, |
| "learning_rate": 1.999651820563426e-05, |
| "loss": -0.0167, |
| "reward": 5.5568296909332275, |
| "reward_std": 1.1293076276779175, |
| "rewards/mrr_reward": 0.25022321194410324, |
| "rewards/rank_analyze_format_reward": 0.7510844320058823, |
| "rewards/rank_answer_foramt_reward": 0.84375, |
| "rewards/rank_contrast_format_reward": 0.009815705008804798, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.359375, |
| "epoch": 0.944, |
| "grad_norm": 0.02918749675154686, |
| "kl": 0.007138252258300781, |
| "learning_rate": 1.999645157387371e-05, |
| "loss": -0.0352, |
| "reward": 5.3101993799209595, |
| "reward_std": 0.9095352292060852, |
| "rewards/mrr_reward": 0.18025793880224228, |
| "rewards/rank_analyze_format_reward": 0.7623788416385651, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.078125, |
| "epoch": 0.952, |
| "grad_norm": 0.029070457443594933, |
| "kl": 0.00724029541015625, |
| "learning_rate": 1.9996384310682615e-05, |
| "loss": -0.0279, |
| "reward": 5.085901737213135, |
| "reward_std": 0.9678252041339874, |
| "rewards/mrr_reward": 0.18282490503042936, |
| "rewards/rank_analyze_format_reward": 0.6765593886375427, |
| "rewards/rank_answer_foramt_reward": 0.75, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.546875, |
| "epoch": 0.96, |
| "grad_norm": 0.030889704823493958, |
| "kl": 0.0070590972900390625, |
| "learning_rate": 1.999631641606523e-05, |
| "loss": 0.0058, |
| "reward": 5.475605249404907, |
| "reward_std": 1.2632475644350052, |
| "rewards/mrr_reward": 0.2467137910425663, |
| "rewards/rank_analyze_format_reward": 0.7685735672712326, |
| "rewards/rank_answer_foramt_reward": 0.77734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9948538690805435, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9792288690805435, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.0625, |
| "epoch": 0.968, |
| "grad_norm": 0.0283079631626606, |
| "kl": 0.0067138671875, |
| "learning_rate": 1.9996247890025845e-05, |
| "loss": 0.0112, |
| "reward": 5.4859858751297, |
| "reward_std": 0.9399373084306717, |
| "rewards/mrr_reward": 0.22347470000386238, |
| "rewards/rank_analyze_format_reward": 0.7617006599903107, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.34375, |
| "epoch": 0.976, |
| "grad_norm": 0.028440656140446663, |
| "kl": 0.006336212158203125, |
| "learning_rate": 1.9996178732568784e-05, |
| "loss": -0.0263, |
| "reward": 5.595025658607483, |
| "reward_std": 1.2384063154459, |
| "rewards/mrr_reward": 0.2738715261220932, |
| "rewards/rank_analyze_format_reward": 0.7087783962488174, |
| "rewards/rank_answer_foramt_reward": 0.814453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959664940834045, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9959664940834045, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.578125, |
| "epoch": 0.984, |
| "grad_norm": 0.031218891963362694, |
| "kl": 0.006499290466308594, |
| "learning_rate": 1.9996108943698412e-05, |
| "loss": -0.0205, |
| "reward": 5.309447526931763, |
| "reward_std": 0.8753966242074966, |
| "rewards/mrr_reward": 0.22593005746603012, |
| "rewards/rank_analyze_format_reward": 0.5845683068037033, |
| "rewards/rank_answer_foramt_reward": 0.8359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965170323848724, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9965170323848724, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.9375, |
| "epoch": 0.992, |
| "grad_norm": 0.025655120611190796, |
| "kl": 0.006500244140625, |
| "learning_rate": 1.9996038523419148e-05, |
| "loss": 0.0137, |
| "reward": 5.34429144859314, |
| "reward_std": 0.6560061201453209, |
| "rewards/mrr_reward": 0.16470114514231682, |
| "rewards/rank_analyze_format_reward": 0.8025594502687454, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.8125, |
| "epoch": 1.0, |
| "grad_norm": 0.02733149379491806, |
| "kl": 0.006840705871582031, |
| "learning_rate": 1.9995967471735433e-05, |
| "loss": -0.0051, |
| "reward": 5.065644264221191, |
| "reward_std": 0.8055929243564606, |
| "rewards/mrr_reward": 0.14942336827516556, |
| "rewards/rank_analyze_format_reward": 0.6398258581757545, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.5, |
| "epoch": 1.008, |
| "grad_norm": 0.028533369302749634, |
| "kl": 0.0065708160400390625, |
| "learning_rate": 1.9995895788651753e-05, |
| "loss": -0.0093, |
| "reward": 5.7667927742004395, |
| "reward_std": 1.0514316856861115, |
| "rewards/mrr_reward": 0.3164868615567684, |
| "rewards/rank_analyze_format_reward": 0.6641343683004379, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.609375, |
| "epoch": 1.016, |
| "grad_norm": 0.029412733390927315, |
| "kl": 0.0068683624267578125, |
| "learning_rate": 1.9995823474172644e-05, |
| "loss": -0.0102, |
| "reward": 5.423908352851868, |
| "reward_std": 0.9281527996063232, |
| "rewards/mrr_reward": 0.20751488581299782, |
| "rewards/rank_analyze_format_reward": 0.7598643451929092, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.015625, |
| "epoch": 1.024, |
| "grad_norm": 0.030668683350086212, |
| "kl": 0.008924484252929688, |
| "learning_rate": 1.9995750528302668e-05, |
| "loss": -0.0259, |
| "reward": 5.635333180427551, |
| "reward_std": 0.6666858419775963, |
| "rewards/mrr_reward": 0.2542472630739212, |
| "rewards/rank_analyze_format_reward": 0.7405444979667664, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965170323848724, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9965170323848724, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.09375, |
| "epoch": 1.032, |
| "grad_norm": 0.029225200414657593, |
| "kl": 0.0071849822998046875, |
| "learning_rate": 1.999567695104643e-05, |
| "loss": 0.002, |
| "reward": 5.3161762952804565, |
| "reward_std": 0.9796330630779266, |
| "rewards/mrr_reward": 0.21669147536158562, |
| "rewards/rank_analyze_format_reward": 0.7243717163801193, |
| "rewards/rank_answer_foramt_reward": 0.7890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9948723018169403, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9801664054393768, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.6875, |
| "epoch": 1.04, |
| "grad_norm": 0.03244978189468384, |
| "kl": 0.006844520568847656, |
| "learning_rate": 1.9995602742408584e-05, |
| "loss": -0.0215, |
| "reward": 5.631897449493408, |
| "reward_std": 1.2725486308336258, |
| "rewards/mrr_reward": 0.2755270190536976, |
| "rewards/rank_analyze_format_reward": 0.7383408695459366, |
| "rewards/rank_answer_foramt_reward": 0.80078125, |
| "rewards/rank_contrast_format_reward": 0.01306460052728653, |
| "rewards/rank_initial_format_reward": 0.9966137707233429, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9966137707233429, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.21875, |
| "epoch": 1.048, |
| "grad_norm": 0.02654576487839222, |
| "kl": 0.007208824157714844, |
| "learning_rate": 1.9995527902393814e-05, |
| "loss": -0.0166, |
| "reward": 5.244002819061279, |
| "reward_std": 0.7309348955750465, |
| "rewards/mrr_reward": 0.16037946939468384, |
| "rewards/rank_analyze_format_reward": 0.7431098967790604, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.578125, |
| "epoch": 1.056, |
| "grad_norm": 0.029960671439766884, |
| "kl": 0.0071048736572265625, |
| "learning_rate": 1.9995452431006844e-05, |
| "loss": -0.0318, |
| "reward": 5.312576532363892, |
| "reward_std": 1.0043585747480392, |
| "rewards/mrr_reward": 0.1904141791164875, |
| "rewards/rank_analyze_format_reward": 0.7370295971632004, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967888593673706, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9967888593673706, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 602.296875, |
| "epoch": 1.064, |
| "grad_norm": 0.034355491399765015, |
| "kl": 0.008157730102539062, |
| "learning_rate": 1.999537632825245e-05, |
| "loss": -0.0341, |
| "reward": 5.146085500717163, |
| "reward_std": 0.7237976565957069, |
| "rewards/mrr_reward": 0.15810392051935196, |
| "rewards/rank_analyze_format_reward": 0.6832832396030426, |
| "rewards/rank_answer_foramt_reward": 0.84765625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.625, |
| "epoch": 1.072, |
| "grad_norm": 0.02834927663207054, |
| "kl": 0.006766319274902344, |
| "learning_rate": 1.9995299594135434e-05, |
| "loss": -0.0002, |
| "reward": 5.701120138168335, |
| "reward_std": 0.5981364026665688, |
| "rewards/mrr_reward": 0.24760044924914837, |
| "rewards/rank_analyze_format_reward": 0.7354921996593475, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.953125, |
| "epoch": 1.08, |
| "grad_norm": 0.030475998297333717, |
| "kl": 0.009418487548828125, |
| "learning_rate": 1.999522222866064e-05, |
| "loss": -0.0179, |
| "reward": 5.579203009605408, |
| "reward_std": 1.1411446928977966, |
| "rewards/mrr_reward": 0.24535591155290604, |
| "rewards/rank_analyze_format_reward": 0.8200124651193619, |
| "rewards/rank_answer_foramt_reward": 0.8046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.5625, |
| "epoch": 1.088, |
| "grad_norm": 0.02837698720395565, |
| "kl": 0.007771492004394531, |
| "learning_rate": 1.999514423183296e-05, |
| "loss": -0.022, |
| "reward": 5.316519737243652, |
| "reward_std": 0.48504022508859634, |
| "rewards/mrr_reward": 0.15651662088930607, |
| "rewards/rank_analyze_format_reward": 0.7586470544338226, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.006024893838912249, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.40625, |
| "epoch": 1.096, |
| "grad_norm": 0.03035576269030571, |
| "kl": 0.009632110595703125, |
| "learning_rate": 1.9995065603657317e-05, |
| "loss": -0.0195, |
| "reward": 5.066559195518494, |
| "reward_std": 0.7069189697504044, |
| "rewards/mrr_reward": 0.1323784776031971, |
| "rewards/rank_analyze_format_reward": 0.7358406782150269, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.09375, |
| "epoch": 1.104, |
| "grad_norm": 0.02903159335255623, |
| "kl": 0.007931709289550781, |
| "learning_rate": 1.999498634413868e-05, |
| "loss": -0.0238, |
| "reward": 5.2656556367874146, |
| "reward_std": 0.7046982049942017, |
| "rewards/mrr_reward": 0.17012028582394123, |
| "rewards/rank_analyze_format_reward": 0.7452157586812973, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.5625, |
| "epoch": 1.112, |
| "grad_norm": 0.029243705794215202, |
| "kl": 0.007679939270019531, |
| "learning_rate": 1.9994906453282055e-05, |
| "loss": -0.0086, |
| "reward": 5.43647313117981, |
| "reward_std": 0.6669348478317261, |
| "rewards/mrr_reward": 0.19714161939918995, |
| "rewards/rank_analyze_format_reward": 0.7475159168243408, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.046875, |
| "epoch": 1.12, |
| "grad_norm": 0.030478352680802345, |
| "kl": 0.008630752563476562, |
| "learning_rate": 1.9994825931092486e-05, |
| "loss": -0.034, |
| "reward": 5.387316823005676, |
| "reward_std": 0.8800464794039726, |
| "rewards/mrr_reward": 0.20342883095145226, |
| "rewards/rank_analyze_format_reward": 0.6943867355585098, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.4375, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.029138056561350822, |
| "kl": 0.00916290283203125, |
| "learning_rate": 1.9994744777575064e-05, |
| "loss": 0.0121, |
| "reward": 5.340026617050171, |
| "reward_std": 1.1701116859912872, |
| "rewards/mrr_reward": 0.21713170036673546, |
| "rewards/rank_analyze_format_reward": 0.7348632365465164, |
| "rewards/rank_answer_foramt_reward": 0.81640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.765625, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.027982011437416077, |
| "kl": 0.007659912109375, |
| "learning_rate": 1.999466299273491e-05, |
| "loss": -0.0359, |
| "reward": 5.487318634986877, |
| "reward_std": 0.7155840322375298, |
| "rewards/mrr_reward": 0.19176588580012321, |
| "rewards/rank_analyze_format_reward": 0.780378520488739, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.28125, |
| "epoch": 1.144, |
| "grad_norm": 0.030734730884432793, |
| "kl": 0.007786750793457031, |
| "learning_rate": 1.9994580576577193e-05, |
| "loss": 0.0205, |
| "reward": 5.202921390533447, |
| "reward_std": 1.0715601295232773, |
| "rewards/mrr_reward": 0.17847222834825516, |
| "rewards/rank_analyze_format_reward": 0.7116889655590057, |
| "rewards/rank_answer_foramt_reward": 0.87109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.96875, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.546875, |
| "epoch": 1.152, |
| "grad_norm": 0.031996339559555054, |
| "kl": 0.008441925048828125, |
| "learning_rate": 1.9994497529107118e-05, |
| "loss": 0.0216, |
| "reward": 5.737574934959412, |
| "reward_std": 1.1086364686489105, |
| "rewards/mrr_reward": 0.30818453058600426, |
| "rewards/rank_analyze_format_reward": 0.7176125943660736, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.328125, |
| "epoch": 1.16, |
| "grad_norm": 0.03075815737247467, |
| "kl": 0.008372306823730469, |
| "learning_rate": 1.999441385032993e-05, |
| "loss": 0.0058, |
| "reward": 5.309332966804504, |
| "reward_std": 1.2192674428224564, |
| "rewards/mrr_reward": 0.19598214887082577, |
| "rewards/rank_analyze_format_reward": 0.820211187005043, |
| "rewards/rank_answer_foramt_reward": 0.81640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9522058814764023, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.265625, |
| "epoch": 1.168, |
| "grad_norm": 0.029912114143371582, |
| "kl": 0.008427619934082031, |
| "learning_rate": 1.9994329540250918e-05, |
| "loss": -0.0094, |
| "reward": 5.250480055809021, |
| "reward_std": 0.6816554740071297, |
| "rewards/mrr_reward": 0.17541543021798134, |
| "rewards/rank_analyze_format_reward": 0.6894431859254837, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.734375, |
| "epoch": 1.176, |
| "grad_norm": 0.03161380812525749, |
| "kl": 0.008414268493652344, |
| "learning_rate": 1.99942445988754e-05, |
| "loss": -0.0024, |
| "reward": 6.2228370904922485, |
| "reward_std": 0.809022843837738, |
| "rewards/mrr_reward": 0.3891865164041519, |
| "rewards/rank_analyze_format_reward": 0.7911781966686249, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998003289103508, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.998003289103508, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.125, |
| "epoch": 1.184, |
| "grad_norm": 0.029842013493180275, |
| "kl": 0.009199142456054688, |
| "learning_rate": 1.999415902620875e-05, |
| "loss": -0.0172, |
| "reward": 5.441818833351135, |
| "reward_std": 0.9356655329465866, |
| "rewards/mrr_reward": 0.21156993880867958, |
| "rewards/rank_analyze_format_reward": 0.7728083282709122, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.71875, |
| "epoch": 1.192, |
| "grad_norm": 0.029202759265899658, |
| "kl": 0.007966995239257812, |
| "learning_rate": 1.999407282225637e-05, |
| "loss": 0.0142, |
| "reward": 6.0522788763046265, |
| "reward_std": 0.693210706114769, |
| "rewards/mrr_reward": 0.3665984570980072, |
| "rewards/rank_analyze_format_reward": 0.7421348392963409, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 679.75, |
| "epoch": 1.2, |
| "grad_norm": 0.030646566301584244, |
| "kl": 0.00763702392578125, |
| "learning_rate": 1.9993985987023703e-05, |
| "loss": 0.025, |
| "reward": 5.18368136882782, |
| "reward_std": 0.5240126103162766, |
| "rewards/mrr_reward": 0.12678571417927742, |
| "rewards/rank_analyze_format_reward": 0.7496970891952515, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9995535761117935, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9995535761117935, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.375, |
| "epoch": 1.208, |
| "grad_norm": 0.029021795839071274, |
| "kl": 0.0076656341552734375, |
| "learning_rate": 1.9993898520516233e-05, |
| "loss": -0.0027, |
| "reward": 5.496846318244934, |
| "reward_std": 0.7196042984724045, |
| "rewards/mrr_reward": 0.20720486715435982, |
| "rewards/rank_analyze_format_reward": 0.7718491405248642, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.00924379751086235, |
| "rewards/rank_initial_format_reward": 0.9962014406919479, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9962014406919479, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.1875, |
| "epoch": 1.216, |
| "grad_norm": 0.030017321929335594, |
| "kl": 0.008272171020507812, |
| "learning_rate": 1.9993810422739496e-05, |
| "loss": 0.0039, |
| "reward": 5.460033655166626, |
| "reward_std": 1.0394816249608994, |
| "rewards/mrr_reward": 0.24322297610342503, |
| "rewards/rank_analyze_format_reward": 0.7152340114116669, |
| "rewards/rank_answer_foramt_reward": 0.830078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.9375, |
| "epoch": 1.224, |
| "grad_norm": 0.03180088475346565, |
| "kl": 0.00795745849609375, |
| "learning_rate": 1.999372169369904e-05, |
| "loss": -0.0025, |
| "reward": 5.007642865180969, |
| "reward_std": 0.6734954938292503, |
| "rewards/mrr_reward": 0.11055307649075985, |
| "rewards/rank_analyze_format_reward": 0.7876633703708649, |
| "rewards/rank_answer_foramt_reward": 0.828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.8125, |
| "epoch": 1.232, |
| "grad_norm": 0.030025500804185867, |
| "kl": 0.00997161865234375, |
| "learning_rate": 1.999363233340048e-05, |
| "loss": -0.0225, |
| "reward": 5.430343270301819, |
| "reward_std": 0.8861743956804276, |
| "rewards/mrr_reward": 0.21569321304559708, |
| "rewards/rank_analyze_format_reward": 0.7130149006843567, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9966137856245041, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9966137856245041, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 676.65625, |
| "epoch": 1.24, |
| "grad_norm": 0.028847267851233482, |
| "kl": 0.008131027221679688, |
| "learning_rate": 1.9993542341849462e-05, |
| "loss": 0.0216, |
| "reward": 5.840638756752014, |
| "reward_std": 0.8792509809136391, |
| "rewards/mrr_reward": 0.3192274421453476, |
| "rewards/rank_analyze_format_reward": 0.809514582157135, |
| "rewards/rank_answer_foramt_reward": 0.802734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.4375, |
| "epoch": 1.248, |
| "grad_norm": 0.02900339849293232, |
| "kl": 0.0074920654296875, |
| "learning_rate": 1.9993451719051663e-05, |
| "loss": 0.0185, |
| "reward": 5.44918155670166, |
| "reward_std": 1.0219481438398361, |
| "rewards/mrr_reward": 0.2371651791036129, |
| "rewards/rank_analyze_format_reward": 0.8032551407814026, |
| "rewards/rank_answer_foramt_reward": 0.751953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.5625, |
| "epoch": 1.256, |
| "grad_norm": 0.030704988166689873, |
| "kl": 0.008420944213867188, |
| "learning_rate": 1.999336046501281e-05, |
| "loss": 0.0103, |
| "reward": 5.615517616271973, |
| "reward_std": 0.6072335783392191, |
| "rewards/mrr_reward": 0.21866939775645733, |
| "rewards/rank_analyze_format_reward": 0.8225629776716232, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.53125, |
| "epoch": 1.264, |
| "grad_norm": 0.03495262190699577, |
| "kl": 0.010213851928710938, |
| "learning_rate": 1.999326857973867e-05, |
| "loss": 0.0143, |
| "reward": 5.4609445333480835, |
| "reward_std": 1.0773909091949463, |
| "rewards/mrr_reward": 0.24725942313671112, |
| "rewards/rank_analyze_format_reward": 0.7313768267631531, |
| "rewards/rank_answer_foramt_reward": 0.80078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9972181469202042, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9815931469202042, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.21875, |
| "epoch": 1.272, |
| "grad_norm": 0.030684208497405052, |
| "kl": 0.008328437805175781, |
| "learning_rate": 1.9993176063235046e-05, |
| "loss": -0.0004, |
| "reward": 5.58943784236908, |
| "reward_std": 0.7682318538427353, |
| "rewards/mrr_reward": 0.23807664960622787, |
| "rewards/rank_analyze_format_reward": 0.7750714123249054, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.078125, |
| "epoch": 1.28, |
| "grad_norm": 0.03072858415544033, |
| "kl": 0.008135795593261719, |
| "learning_rate": 1.9993082915507776e-05, |
| "loss": -0.0049, |
| "reward": 5.318588376045227, |
| "reward_std": 0.7114385366439819, |
| "rewards/mrr_reward": 0.1840277872979641, |
| "rewards/rank_analyze_format_reward": 0.7484926581382751, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.203125, |
| "epoch": 1.288, |
| "grad_norm": 0.029995588585734367, |
| "kl": 0.0068206787109375, |
| "learning_rate": 1.999298913656275e-05, |
| "loss": 0.0295, |
| "reward": 5.286089658737183, |
| "reward_std": 0.985167570412159, |
| "rewards/mrr_reward": 0.20013641566038132, |
| "rewards/rank_analyze_format_reward": 0.7478553950786591, |
| "rewards/rank_answer_foramt_reward": 0.828125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9899380803108215, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9743130803108215, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.953125, |
| "epoch": 1.296, |
| "grad_norm": 0.030260177329182625, |
| "kl": 0.0070362091064453125, |
| "learning_rate": 1.9992894726405894e-05, |
| "loss": -0.0186, |
| "reward": 5.705159902572632, |
| "reward_std": 0.8278112560510635, |
| "rewards/mrr_reward": 0.26141493394970894, |
| "rewards/rank_analyze_format_reward": 0.7882915586233139, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.359375, |
| "epoch": 1.304, |
| "grad_norm": 0.029093291610479355, |
| "kl": 0.007663726806640625, |
| "learning_rate": 1.9992799685043165e-05, |
| "loss": -0.0021, |
| "reward": 5.266281008720398, |
| "reward_std": 0.7597630694508553, |
| "rewards/mrr_reward": 0.15634300746023655, |
| "rewards/rank_analyze_format_reward": 0.8102140724658966, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.34375, |
| "epoch": 1.312, |
| "grad_norm": 0.030477292835712433, |
| "kl": 0.009012222290039062, |
| "learning_rate": 1.999270401248057e-05, |
| "loss": 0.005, |
| "reward": 5.561887741088867, |
| "reward_std": 0.9077768623828888, |
| "rewards/mrr_reward": 0.24484127387404442, |
| "rewards/rank_analyze_format_reward": 0.7151724994182587, |
| "rewards/rank_answer_foramt_reward": 0.859375, |
| "rewards/rank_contrast_format_reward": 0.013020833022892475, |
| "rewards/rank_initial_format_reward": 0.997477263212204, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.997477263212204, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.546875, |
| "epoch": 1.32, |
| "grad_norm": 0.030477292835712433, |
| "kl": 0.007292747497558594, |
| "learning_rate": 1.999270401248057e-05, |
| "loss": -0.0001, |
| "reward": 5.293234348297119, |
| "reward_std": 0.6412914916872978, |
| "rewards/mrr_reward": 0.16529638320207596, |
| "rewards/rank_analyze_format_reward": 0.7485833615064621, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.6875, |
| "epoch": 1.328, |
| "grad_norm": 0.031677015125751495, |
| "kl": 0.008253097534179688, |
| "learning_rate": 1.999260770872415e-05, |
| "loss": -0.0139, |
| "reward": 5.761396527290344, |
| "reward_std": 0.675188884139061, |
| "rewards/mrr_reward": 0.29050719179213047, |
| "rewards/rank_analyze_format_reward": 0.7996452897787094, |
| "rewards/rank_answer_foramt_reward": 0.80859375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9955643564462662, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9955643564462662, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.5625, |
| "epoch": 1.336, |
| "grad_norm": 0.02820313349366188, |
| "kl": 0.00614166259765625, |
| "learning_rate": 1.999251077377999e-05, |
| "loss": 0.0049, |
| "reward": 5.31193470954895, |
| "reward_std": 0.7302871681749821, |
| "rewards/mrr_reward": 0.1731150783598423, |
| "rewards/rank_analyze_format_reward": 0.7047950625419617, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.875, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.02908385545015335, |
| "kl": 0.007312774658203125, |
| "learning_rate": 1.999241320765421e-05, |
| "loss": 0.005, |
| "reward": 5.744455337524414, |
| "reward_std": 0.6932341083884239, |
| "rewards/mrr_reward": 0.25336061976850033, |
| "rewards/rank_analyze_format_reward": 0.8142653256654739, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.890625, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.03778872266411781, |
| "kl": 0.01438140869140625, |
| "learning_rate": 1.9992315010352978e-05, |
| "loss": -0.05, |
| "reward": 5.296718597412109, |
| "reward_std": 1.1021133363246918, |
| "rewards/mrr_reward": 0.20252975821495056, |
| "rewards/rank_analyze_format_reward": 0.7184047400951385, |
| "rewards/rank_answer_foramt_reward": 0.779296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.734375, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.032183870673179626, |
| "kl": 0.0066356658935546875, |
| "learning_rate": 1.9992216181882492e-05, |
| "loss": 0.014, |
| "reward": 5.106685400009155, |
| "reward_std": 0.636107549071312, |
| "rewards/mrr_reward": 0.12468378245830536, |
| "rewards/rank_analyze_format_reward": 0.7776422798633575, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.328125, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.02835104614496231, |
| "kl": 0.007082939147949219, |
| "learning_rate": 1.9992116722248997e-05, |
| "loss": -0.0227, |
| "reward": 5.181930780410767, |
| "reward_std": 0.7388085126876831, |
| "rewards/mrr_reward": 0.1869481634348631, |
| "rewards/rank_analyze_format_reward": 0.7075169235467911, |
| "rewards/rank_answer_foramt_reward": 0.75, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9961231350898743, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9961231350898743, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.078125, |
| "epoch": 1.376, |
| "grad_norm": 0.02974073402583599, |
| "kl": 0.006779670715332031, |
| "learning_rate": 1.9992016631458774e-05, |
| "loss": -0.0135, |
| "reward": 5.27878475189209, |
| "reward_std": 0.6198801323771477, |
| "rewards/mrr_reward": 0.15861235558986664, |
| "rewards/rank_analyze_format_reward": 0.7224602103233337, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.046875, |
| "epoch": 1.384, |
| "grad_norm": 0.031048448756337166, |
| "kl": 0.006651878356933594, |
| "learning_rate": 1.9991915909518146e-05, |
| "loss": 0.0086, |
| "reward": 5.175734996795654, |
| "reward_std": 0.6161081194877625, |
| "rewards/mrr_reward": 0.14768726006150246, |
| "rewards/rank_analyze_format_reward": 0.788111001253128, |
| "rewards/rank_answer_foramt_reward": 0.84375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.421875, |
| "epoch": 1.392, |
| "grad_norm": 0.032564930617809296, |
| "kl": 0.006524085998535156, |
| "learning_rate": 1.9991814556433475e-05, |
| "loss": 0.0131, |
| "reward": 5.380582928657532, |
| "reward_std": 0.7940613478422165, |
| "rewards/mrr_reward": 0.1871589906513691, |
| "rewards/rank_analyze_format_reward": 0.7780078798532486, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.640625, |
| "epoch": 1.4, |
| "grad_norm": 0.027925679460167885, |
| "kl": 0.008939743041992188, |
| "learning_rate": 1.9991712572211163e-05, |
| "loss": 0.0071, |
| "reward": 4.99386203289032, |
| "reward_std": 0.8465652763843536, |
| "rewards/mrr_reward": 0.13313491828739643, |
| "rewards/rank_analyze_format_reward": 0.6585879027843475, |
| "rewards/rank_answer_foramt_reward": 0.857421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.265625, |
| "epoch": 1.408, |
| "grad_norm": 0.02996164932847023, |
| "kl": 0.0084228515625, |
| "learning_rate": 1.999160995685765e-05, |
| "loss": 0.004, |
| "reward": 5.5598554611206055, |
| "reward_std": 0.8114011436700821, |
| "rewards/mrr_reward": 0.22632689774036407, |
| "rewards/rank_analyze_format_reward": 0.7654528021812439, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.15625, |
| "epoch": 1.416, |
| "grad_norm": 0.03037683106958866, |
| "kl": 0.006896018981933594, |
| "learning_rate": 1.9991506710379424e-05, |
| "loss": 0.0169, |
| "reward": 5.7041707038879395, |
| "reward_std": 1.061623454093933, |
| "rewards/mrr_reward": 0.28074776753783226, |
| "rewards/rank_analyze_format_reward": 0.7275251597166061, |
| "rewards/rank_answer_foramt_reward": 0.87109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9912803918123245, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9912803918123245, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.078125, |
| "epoch": 1.424, |
| "grad_norm": 0.02834421582520008, |
| "kl": 0.006161689758300781, |
| "learning_rate": 1.9991402832783e-05, |
| "loss": -0.0165, |
| "reward": 5.452821254730225, |
| "reward_std": 0.7667314857244492, |
| "rewards/mrr_reward": 0.22672370821237564, |
| "rewards/rank_analyze_format_reward": 0.6621106863021851, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9965953528881073, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9809703528881073, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.515625, |
| "epoch": 1.432, |
| "grad_norm": 0.03392954543232918, |
| "kl": 0.007987022399902344, |
| "learning_rate": 1.9991298324074942e-05, |
| "loss": 0.0131, |
| "reward": 5.4409414529800415, |
| "reward_std": 0.9278188347816467, |
| "rewards/mrr_reward": 0.22803819552063942, |
| "rewards/rank_analyze_format_reward": 0.7239862233400345, |
| "rewards/rank_answer_foramt_reward": 0.830078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.4375, |
| "epoch": 1.44, |
| "grad_norm": 0.02850930020213127, |
| "kl": 0.006137847900390625, |
| "learning_rate": 1.999119318426185e-05, |
| "loss": 0.0109, |
| "reward": 5.493017673492432, |
| "reward_std": 0.5988549739122391, |
| "rewards/mrr_reward": 0.19139384850859642, |
| "rewards/rank_analyze_format_reward": 0.8054522722959518, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.328125, |
| "epoch": 1.448, |
| "grad_norm": 0.03058907762169838, |
| "kl": 0.006911277770996094, |
| "learning_rate": 1.9991087413350367e-05, |
| "loss": -0.0328, |
| "reward": 5.574859023094177, |
| "reward_std": 1.08903668820858, |
| "rewards/mrr_reward": 0.26563740335404873, |
| "rewards/rank_analyze_format_reward": 0.7526647448539734, |
| "rewards/rank_answer_foramt_reward": 0.8359375, |
| "rewards/rank_contrast_format_reward": 0.013681219890713692, |
| "rewards/rank_initial_format_reward": 0.99407559633255, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.96282559633255, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.359375, |
| "epoch": 1.456, |
| "grad_norm": 0.032083019614219666, |
| "kl": 0.007262229919433594, |
| "learning_rate": 1.9990981011347172e-05, |
| "loss": -0.0432, |
| "reward": 5.140234351158142, |
| "reward_std": 0.8412456661462784, |
| "rewards/mrr_reward": 0.14855531603097916, |
| "rewards/rank_analyze_format_reward": 0.7074387818574905, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.014168431982398033, |
| "rewards/rank_initial_format_reward": 0.9961873590946198, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9805623590946198, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 605.046875, |
| "epoch": 1.464, |
| "grad_norm": 0.03227703273296356, |
| "kl": 0.007939338684082031, |
| "learning_rate": 1.999087397825899e-05, |
| "loss": -0.0575, |
| "reward": 4.993819952011108, |
| "reward_std": 0.7700471132993698, |
| "rewards/mrr_reward": 0.13467882573604584, |
| "rewards/rank_analyze_format_reward": 0.70456662774086, |
| "rewards/rank_answer_foramt_reward": 0.763671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9973393976688385, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.796875, |
| "epoch": 1.472, |
| "grad_norm": 0.02891026809811592, |
| "kl": 0.00670623779296875, |
| "learning_rate": 1.9990766314092575e-05, |
| "loss": -0.0023, |
| "reward": 5.713563561439514, |
| "reward_std": 0.8618924953043461, |
| "rewards/mrr_reward": 0.2501860074698925, |
| "rewards/rank_analyze_format_reward": 0.8630950748920441, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.734375, |
| "epoch": 1.48, |
| "grad_norm": 0.030336197465658188, |
| "kl": 0.0074062347412109375, |
| "learning_rate": 1.9990658018854737e-05, |
| "loss": -0.0133, |
| "reward": 5.221981048583984, |
| "reward_std": 0.584196537733078, |
| "rewards/mrr_reward": 0.1581659186631441, |
| "rewards/rank_analyze_format_reward": 0.7678205668926239, |
| "rewards/rank_answer_foramt_reward": 0.85546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9986388385295868, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9986388385295868, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.21875, |
| "epoch": 1.488, |
| "grad_norm": 0.03278738632798195, |
| "kl": 0.0077762603759765625, |
| "learning_rate": 1.9990549092552307e-05, |
| "loss": 0.0038, |
| "reward": 5.634615182876587, |
| "reward_std": 0.598552655428648, |
| "rewards/mrr_reward": 0.23745040595531464, |
| "rewards/rank_analyze_format_reward": 0.7684896737337112, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.484375, |
| "epoch": 1.496, |
| "grad_norm": 0.03179846331477165, |
| "kl": 0.007142066955566406, |
| "learning_rate": 1.999043953519217e-05, |
| "loss": 0.0049, |
| "reward": 5.652546405792236, |
| "reward_std": 0.9589240476489067, |
| "rewards/mrr_reward": 0.2782428301870823, |
| "rewards/rank_analyze_format_reward": 0.6794685870409012, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.75, |
| "epoch": 1.504, |
| "grad_norm": 0.029573241248726845, |
| "kl": 0.00788116455078125, |
| "learning_rate": 1.999032934678125e-05, |
| "loss": -0.018, |
| "reward": 5.10038423538208, |
| "reward_std": 0.5980538204312325, |
| "rewards/mrr_reward": 0.14192708767950535, |
| "rewards/rank_analyze_format_reward": 0.627951592206955, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.995330885052681, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.995330885052681, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.3125, |
| "epoch": 1.512, |
| "grad_norm": 0.03184956684708595, |
| "kl": 0.007494926452636719, |
| "learning_rate": 1.99902185273265e-05, |
| "loss": 0.0342, |
| "reward": 5.1105430126190186, |
| "reward_std": 0.4383184686303139, |
| "rewards/mrr_reward": 0.10293898917734623, |
| "rewards/rank_analyze_format_reward": 0.8510159552097321, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.046875, |
| "epoch": 1.52, |
| "grad_norm": 0.03131790831685066, |
| "kl": 0.006718635559082031, |
| "learning_rate": 1.999010707683492e-05, |
| "loss": -0.0412, |
| "reward": 5.197941780090332, |
| "reward_std": 0.6700362041592598, |
| "rewards/mrr_reward": 0.16383928433060646, |
| "rewards/rank_analyze_format_reward": 0.6899384260177612, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9917527735233307, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9917527735233307, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.578125, |
| "epoch": 1.528, |
| "grad_norm": 0.031343378126621246, |
| "kl": 0.0063381195068359375, |
| "learning_rate": 1.998999499531356e-05, |
| "loss": -0.0108, |
| "reward": 6.47118866443634, |
| "reward_std": 1.109221488237381, |
| "rewards/mrr_reward": 0.4581039249897003, |
| "rewards/rank_analyze_format_reward": 0.6661168932914734, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.703125, |
| "epoch": 1.536, |
| "grad_norm": 0.03459320589900017, |
| "kl": 0.007761955261230469, |
| "learning_rate": 1.9989882282769485e-05, |
| "loss": -0.0315, |
| "reward": 5.452162504196167, |
| "reward_std": 0.8313007205724716, |
| "rewards/mrr_reward": 0.20660342648625374, |
| "rewards/rank_analyze_format_reward": 0.7848766297101974, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.010794081725180149, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 589.8125, |
| "epoch": 1.544, |
| "grad_norm": 0.034033406525850296, |
| "kl": 0.007235527038574219, |
| "learning_rate": 1.9989768939209826e-05, |
| "loss": -0.0153, |
| "reward": 5.301190137863159, |
| "reward_std": 0.7263774573802948, |
| "rewards/mrr_reward": 0.22665550373494625, |
| "rewards/rank_analyze_format_reward": 0.5403951182961464, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.012031249701976776, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9981800019741058, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.953125, |
| "epoch": 1.552, |
| "grad_norm": 0.03618309274315834, |
| "kl": 0.0074520111083984375, |
| "learning_rate": 1.9989654964641737e-05, |
| "loss": -0.0302, |
| "reward": 5.4693708419799805, |
| "reward_std": 1.3693420886993408, |
| "rewards/mrr_reward": 0.2628844305872917, |
| "rewards/rank_analyze_format_reward": 0.7108018025755882, |
| "rewards/rank_answer_foramt_reward": 0.77734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.015625, |
| "epoch": 1.56, |
| "grad_norm": 0.0314011313021183, |
| "kl": 0.008198738098144531, |
| "learning_rate": 1.998954035907242e-05, |
| "loss": -0.0073, |
| "reward": 5.535840272903442, |
| "reward_std": 0.847998857498169, |
| "rewards/mrr_reward": 0.2551587335765362, |
| "rewards/rank_analyze_format_reward": 0.7528696805238724, |
| "rewards/rank_answer_foramt_reward": 0.8203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 687.484375, |
| "epoch": 1.568, |
| "grad_norm": 0.029867514967918396, |
| "kl": 0.0064868927001953125, |
| "learning_rate": 1.9989425122509113e-05, |
| "loss": -0.0081, |
| "reward": 5.249784588813782, |
| "reward_std": 0.7226946577429771, |
| "rewards/mrr_reward": 0.14637276344001293, |
| "rewards/rank_analyze_format_reward": 0.8121927380561829, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9817143976688385, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.015625, |
| "epoch": 1.576, |
| "grad_norm": 0.029847877100110054, |
| "kl": 0.0074748992919921875, |
| "learning_rate": 1.9989309254959096e-05, |
| "loss": -0.0112, |
| "reward": 5.975355625152588, |
| "reward_std": 1.1915720701217651, |
| "rewards/mrr_reward": 0.32516741193830967, |
| "rewards/rank_analyze_format_reward": 0.8367954790592194, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.421875, |
| "epoch": 1.584, |
| "grad_norm": 0.030575547367334366, |
| "kl": 0.00701141357421875, |
| "learning_rate": 1.998919275642968e-05, |
| "loss": -0.0259, |
| "reward": 5.382705450057983, |
| "reward_std": 0.5365985631942749, |
| "rewards/mrr_reward": 0.17048611491918564, |
| "rewards/rank_analyze_format_reward": 0.7825622856616974, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.40625, |
| "epoch": 1.592, |
| "grad_norm": 0.0321117527782917, |
| "kl": 0.006863594055175781, |
| "learning_rate": 1.9989075626928237e-05, |
| "loss": -0.0073, |
| "reward": 5.219160199165344, |
| "reward_std": 0.6420910395681858, |
| "rewards/mrr_reward": 0.16282862052321434, |
| "rewards/rank_analyze_format_reward": 0.6769122779369354, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.796875, |
| "epoch": 1.6, |
| "grad_norm": 0.03242700546979904, |
| "kl": 0.007241249084472656, |
| "learning_rate": 1.9988957866462155e-05, |
| "loss": -0.0011, |
| "reward": 5.902221083641052, |
| "reward_std": 0.630526065826416, |
| "rewards/mrr_reward": 0.2948412746191025, |
| "rewards/rank_analyze_format_reward": 0.8107465952634811, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.90625, |
| "epoch": 1.608, |
| "grad_norm": 0.031312599778175354, |
| "kl": 0.006711006164550781, |
| "learning_rate": 1.998883947503888e-05, |
| "loss": -0.0324, |
| "reward": 5.3306708335876465, |
| "reward_std": 0.6714678555727005, |
| "rewards/mrr_reward": 0.18062996119260788, |
| "rewards/rank_analyze_format_reward": 0.7797176241874695, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.296875, |
| "epoch": 1.616, |
| "grad_norm": 0.03099161572754383, |
| "kl": 0.008016586303710938, |
| "learning_rate": 1.9988720452665885e-05, |
| "loss": -0.0357, |
| "reward": 5.8526880741119385, |
| "reward_std": 0.7714151293039322, |
| "rewards/mrr_reward": 0.2807725705206394, |
| "rewards/rank_analyze_format_reward": 0.7999103516340256, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.78125, |
| "epoch": 1.624, |
| "grad_norm": 0.031240420415997505, |
| "kl": 0.007191658020019531, |
| "learning_rate": 1.9988600799350685e-05, |
| "loss": -0.0077, |
| "reward": 5.453703999519348, |
| "reward_std": 0.7480319663882256, |
| "rewards/mrr_reward": 0.2080853171646595, |
| "rewards/rank_analyze_format_reward": 0.7671940922737122, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.09375, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.033636048436164856, |
| "kl": 0.008371353149414062, |
| "learning_rate": 1.998848051510085e-05, |
| "loss": 0.0101, |
| "reward": 5.431292653083801, |
| "reward_std": 0.7651955038309097, |
| "rewards/mrr_reward": 0.20585318095982075, |
| "rewards/rank_analyze_format_reward": 0.7279091775417328, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9956494122743607, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9800244122743607, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.421875, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.03052719309926033, |
| "kl": 0.008235931396484375, |
| "learning_rate": 1.9988359599923964e-05, |
| "loss": -0.0077, |
| "reward": 5.360659718513489, |
| "reward_std": 0.564825750887394, |
| "rewards/mrr_reward": 0.1643043179064989, |
| "rewards/rank_analyze_format_reward": 0.8024349361658096, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.65625, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.030079178512096405, |
| "kl": 0.0066680908203125, |
| "learning_rate": 1.9988238053827677e-05, |
| "loss": -0.0209, |
| "reward": 5.6915318965911865, |
| "reward_std": 0.9940572530031204, |
| "rewards/mrr_reward": 0.24549851939082146, |
| "rewards/rank_analyze_format_reward": 0.8010262995958328, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.15625, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 0.03138303756713867, |
| "kl": 0.008875846862792969, |
| "learning_rate": 1.9988115876819654e-05, |
| "loss": -0.0312, |
| "reward": 5.914277911186218, |
| "reward_std": 0.8364528864622116, |
| "rewards/mrr_reward": 0.3293774798512459, |
| "rewards/rank_analyze_format_reward": 0.7483799606561661, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9661861509084702, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.828125, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.031958747655153275, |
| "kl": 0.009166717529296875, |
| "learning_rate": 1.9987993068907624e-05, |
| "loss": -0.0156, |
| "reward": 5.197036981582642, |
| "reward_std": 0.5283743739128113, |
| "rewards/mrr_reward": 0.1414062473922968, |
| "rewards/rank_analyze_format_reward": 0.7246968895196915, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.703125, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 0.03019367717206478, |
| "kl": 0.007928848266601562, |
| "learning_rate": 1.9987869630099333e-05, |
| "loss": -0.0281, |
| "reward": 5.924240350723267, |
| "reward_std": 1.284628689289093, |
| "rewards/mrr_reward": 0.3399987667798996, |
| "rewards/rank_analyze_format_reward": 0.7304840087890625, |
| "rewards/rank_answer_foramt_reward": 0.849609375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9959821403026581, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9959821403026581, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.828125, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.03171711415052414, |
| "kl": 0.008306503295898438, |
| "learning_rate": 1.998774556040259e-05, |
| "loss": -0.0146, |
| "reward": 5.582144498825073, |
| "reward_std": 0.7671663761138916, |
| "rewards/mrr_reward": 0.23291171342134476, |
| "rewards/rank_analyze_format_reward": 0.7457775175571442, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.453125, |
| "epoch": 1.688, |
| "grad_norm": 0.03076333925127983, |
| "kl": 0.0064449310302734375, |
| "learning_rate": 1.9987620859825225e-05, |
| "loss": -0.0194, |
| "reward": 5.3084797859191895, |
| "reward_std": 0.4021785408258438, |
| "rewards/mrr_reward": 0.13763020560145378, |
| "rewards/rank_analyze_format_reward": 0.8322708457708359, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.997023805975914, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.997023805975914, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.4375, |
| "epoch": 1.696, |
| "grad_norm": 0.03446948900818825, |
| "kl": 0.007775306701660156, |
| "learning_rate": 1.9987495528375115e-05, |
| "loss": 0.0253, |
| "reward": 5.778676629066467, |
| "reward_std": 0.4526245817542076, |
| "rewards/mrr_reward": 0.2587859593331814, |
| "rewards/rank_analyze_format_reward": 0.8156834691762924, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.171875, |
| "epoch": 1.704, |
| "grad_norm": 0.03106614388525486, |
| "kl": 0.0079193115234375, |
| "learning_rate": 1.998736956606018e-05, |
| "loss": -0.0063, |
| "reward": 5.9417431354522705, |
| "reward_std": 0.816119559109211, |
| "rewards/mrr_reward": 0.31315724551677704, |
| "rewards/rank_analyze_format_reward": 0.7785349041223526, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.65625, |
| "epoch": 1.712, |
| "grad_norm": 0.03145559877157211, |
| "kl": 0.0077838897705078125, |
| "learning_rate": 1.9987242972888368e-05, |
| "loss": 0.0266, |
| "reward": 5.924510478973389, |
| "reward_std": 1.2289659082889557, |
| "rewards/mrr_reward": 0.32067212648689747, |
| "rewards/rank_analyze_format_reward": 0.7929693013429642, |
| "rewards/rank_answer_foramt_reward": 0.875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9947387874126434, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9947387874126434, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.828125, |
| "epoch": 1.72, |
| "grad_norm": 0.030902279540896416, |
| "kl": 0.007624626159667969, |
| "learning_rate": 1.9987115748867685e-05, |
| "loss": 0.0034, |
| "reward": 5.730231523513794, |
| "reward_std": 0.720735490322113, |
| "rewards/mrr_reward": 0.24854290671646595, |
| "rewards/rank_analyze_format_reward": 0.8314545601606369, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.578125, |
| "epoch": 1.728, |
| "grad_norm": 0.03282896429300308, |
| "kl": 0.0073909759521484375, |
| "learning_rate": 1.9986987894006164e-05, |
| "loss": -0.0497, |
| "reward": 5.874699831008911, |
| "reward_std": 0.8969438448548317, |
| "rewards/mrr_reward": 0.2993737608194351, |
| "rewards/rank_analyze_format_reward": 0.7611890435218811, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.046875, |
| "epoch": 1.736, |
| "grad_norm": 0.031988587230443954, |
| "kl": 0.009072303771972656, |
| "learning_rate": 1.9986859408311878e-05, |
| "loss": -0.0426, |
| "reward": 5.34618878364563, |
| "reward_std": 0.9178062975406647, |
| "rewards/mrr_reward": 0.21951264888048172, |
| "rewards/rank_analyze_format_reward": 0.6615510508418083, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9941138625144958, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9941138625144958, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.53125, |
| "epoch": 1.744, |
| "grad_norm": 0.0311751589179039, |
| "kl": 0.0080718994140625, |
| "learning_rate": 1.9986730291792945e-05, |
| "loss": -0.0354, |
| "reward": 5.135533690452576, |
| "reward_std": 0.5661691799759865, |
| "rewards/mrr_reward": 0.12121155858039856, |
| "rewards/rank_analyze_format_reward": 0.8167032152414322, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.828125, |
| "epoch": 1.752, |
| "grad_norm": 0.03084523230791092, |
| "kl": 0.007534980773925781, |
| "learning_rate": 1.9986600544457524e-05, |
| "loss": -0.0277, |
| "reward": 5.917717456817627, |
| "reward_std": 0.8369560539722443, |
| "rewards/mrr_reward": 0.3187128081917763, |
| "rewards/rank_analyze_format_reward": 0.7128703743219376, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.984375, |
| "epoch": 1.76, |
| "grad_norm": 0.033047858625650406, |
| "kl": 0.0073394775390625, |
| "learning_rate": 1.9986470166313805e-05, |
| "loss": 0.0205, |
| "reward": 5.320756673812866, |
| "reward_std": 0.4847453236579895, |
| "rewards/mrr_reward": 0.14311756193637848, |
| "rewards/rank_analyze_format_reward": 0.857081413269043, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.125, |
| "epoch": 1.768, |
| "grad_norm": 0.035302143543958664, |
| "kl": 0.00738525390625, |
| "learning_rate": 1.9986339157370026e-05, |
| "loss": -0.049, |
| "reward": 5.3989468812942505, |
| "reward_std": 0.8359893411397934, |
| "rewards/mrr_reward": 0.19479167088866234, |
| "rewards/rank_analyze_format_reward": 0.7908818274736404, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9984335899353027, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9984335899353027, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.96875, |
| "epoch": 1.776, |
| "grad_norm": 0.03167646750807762, |
| "kl": 0.008357048034667969, |
| "learning_rate": 1.9986207517634466e-05, |
| "loss": -0.0245, |
| "reward": 5.336655378341675, |
| "reward_std": 0.7819190472364426, |
| "rewards/mrr_reward": 0.17885665223002434, |
| "rewards/rank_analyze_format_reward": 0.7340867817401886, |
| "rewards/rank_answer_foramt_reward": 0.8984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.890625, |
| "epoch": 1.784, |
| "grad_norm": 0.03162270411849022, |
| "kl": 0.007929801940917969, |
| "learning_rate": 1.998607524711543e-05, |
| "loss": -0.0394, |
| "reward": 5.528796672821045, |
| "reward_std": 0.6584100723266602, |
| "rewards/mrr_reward": 0.2038008477538824, |
| "rewards/rank_analyze_format_reward": 0.7580919712781906, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 635.4375, |
| "epoch": 1.792, |
| "grad_norm": 0.0319884791970253, |
| "kl": 0.008496284484863281, |
| "learning_rate": 1.9985942345821285e-05, |
| "loss": -0.0326, |
| "reward": 5.295137047767639, |
| "reward_std": 0.6776984333992004, |
| "rewards/mrr_reward": 0.15868675522506237, |
| "rewards/rank_analyze_format_reward": 0.7750860899686813, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9973393976688385, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.25, |
| "epoch": 1.8, |
| "grad_norm": 0.032160449773073196, |
| "kl": 0.0078582763671875, |
| "learning_rate": 1.998580881376042e-05, |
| "loss": -0.0341, |
| "reward": 5.92940092086792, |
| "reward_std": 1.2256246581673622, |
| "rewards/mrr_reward": 0.33193204551935196, |
| "rewards/rank_analyze_format_reward": 0.772309273481369, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9976895451545715, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9820645451545715, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.046875, |
| "epoch": 1.808, |
| "grad_norm": 0.0322372205555439, |
| "kl": 0.0072021484375, |
| "learning_rate": 1.9985674650941265e-05, |
| "loss": -0.0123, |
| "reward": 6.088730692863464, |
| "reward_std": 1.1017219424247742, |
| "rewards/mrr_reward": 0.3432415649294853, |
| "rewards/rank_analyze_format_reward": 0.7877216339111328, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.0, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 0.03190344572067261, |
| "kl": 0.008536338806152344, |
| "learning_rate": 1.9985539857372303e-05, |
| "loss": -0.0181, |
| "reward": 6.0037089586257935, |
| "reward_std": 1.1030287593603134, |
| "rewards/mrr_reward": 0.31786955520510674, |
| "rewards/rank_analyze_format_reward": 0.7933959513902664, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9977376908063889, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9977376908063889, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.328125, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 0.035804688930511475, |
| "kl": 0.008830070495605469, |
| "learning_rate": 1.998540443306204e-05, |
| "loss": -0.0227, |
| "reward": 5.816386938095093, |
| "reward_std": 0.5872849300503731, |
| "rewards/mrr_reward": 0.2789682596921921, |
| "rewards/rank_analyze_format_reward": 0.7708265483379364, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.359375, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 0.030482899397611618, |
| "kl": 0.00791168212890625, |
| "learning_rate": 1.998526837801904e-05, |
| "loss": -0.0145, |
| "reward": 5.763516783714294, |
| "reward_std": 0.9057382866740227, |
| "rewards/mrr_reward": 0.2729290686547756, |
| "rewards/rank_analyze_format_reward": 0.7831287831068039, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.25, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.03433312475681305, |
| "kl": 0.009449005126953125, |
| "learning_rate": 1.9985131692251887e-05, |
| "loss": 0.0284, |
| "reward": 5.669819235801697, |
| "reward_std": 0.6757724024355412, |
| "rewards/mrr_reward": 0.21458952501416206, |
| "rewards/rank_analyze_format_reward": 0.8934923410415649, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.875, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 0.030251074582338333, |
| "kl": 0.008405685424804688, |
| "learning_rate": 1.9984994375769222e-05, |
| "loss": -0.0343, |
| "reward": 5.75708794593811, |
| "reward_std": 0.4988391697406769, |
| "rewards/mrr_reward": 0.2610367089509964, |
| "rewards/rank_analyze_format_reward": 0.7592362314462662, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.390625, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.03255580738186836, |
| "kl": 0.010509490966796875, |
| "learning_rate": 1.9984856428579717e-05, |
| "loss": -0.0253, |
| "reward": 5.649736762046814, |
| "reward_std": 0.8045858144760132, |
| "rewards/mrr_reward": 0.23132441379129887, |
| "rewards/rank_analyze_format_reward": 0.8100680112838745, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.625, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 0.03155457600951195, |
| "kl": 0.008493423461914062, |
| "learning_rate": 1.998471785069208e-05, |
| "loss": 0.0024, |
| "reward": 5.295361399650574, |
| "reward_std": 0.409699484705925, |
| "rewards/mrr_reward": 0.1384796667844057, |
| "rewards/rank_analyze_format_reward": 0.8116403520107269, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.734375, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.03446485847234726, |
| "kl": 0.008045196533203125, |
| "learning_rate": 1.9984578642115072e-05, |
| "loss": 0.0044, |
| "reward": 5.8424142599105835, |
| "reward_std": 0.7500941399484873, |
| "rewards/mrr_reward": 0.28348215110599995, |
| "rewards/rank_analyze_format_reward": 0.7956449091434479, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.15625, |
| "epoch": 1.88, |
| "grad_norm": 0.03296066075563431, |
| "kl": 0.007839202880859375, |
| "learning_rate": 1.998443880285748e-05, |
| "loss": 0.0001, |
| "reward": 6.259778738021851, |
| "reward_std": 1.2440795004367828, |
| "rewards/mrr_reward": 0.4108507037162781, |
| "rewards/rank_analyze_format_reward": 0.7565357685089111, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.890625, |
| "epoch": 1.888, |
| "grad_norm": 0.03121933341026306, |
| "kl": 0.009092330932617188, |
| "learning_rate": 1.9984298332928142e-05, |
| "loss": -0.02, |
| "reward": 6.254051685333252, |
| "reward_std": 1.0369019284844398, |
| "rewards/mrr_reward": 0.3944692611694336, |
| "rewards/rank_analyze_format_reward": 0.780887171626091, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.01442819181829691, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.0, |
| "epoch": 1.896, |
| "grad_norm": 0.03492776304483414, |
| "kl": 0.008260726928710938, |
| "learning_rate": 1.9984157232335926e-05, |
| "loss": -0.0176, |
| "reward": 5.5136624574661255, |
| "reward_std": 0.5710221119225025, |
| "rewards/mrr_reward": 0.20375123620033264, |
| "rewards/rank_analyze_format_reward": 0.7764740437269211, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.265625, |
| "epoch": 1.904, |
| "grad_norm": 0.03103082999587059, |
| "kl": 0.009332656860351562, |
| "learning_rate": 1.998401550108975e-05, |
| "loss": -0.0236, |
| "reward": 5.937364459037781, |
| "reward_std": 0.6469563692808151, |
| "rewards/mrr_reward": 0.28705357387661934, |
| "rewards/rank_analyze_format_reward": 0.820400133728981, |
| "rewards/rank_answer_foramt_reward": 0.96875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.921875, |
| "epoch": 1.912, |
| "grad_norm": 0.030047811567783356, |
| "kl": 0.008413314819335938, |
| "learning_rate": 1.9983873139198565e-05, |
| "loss": 0.016, |
| "reward": 5.476260185241699, |
| "reward_std": 0.6603549867868423, |
| "rewards/mrr_reward": 0.18887649476528168, |
| "rewards/rank_analyze_format_reward": 0.8105982840061188, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.5, |
| "epoch": 1.92, |
| "grad_norm": 0.03159939870238304, |
| "kl": 0.007993698120117188, |
| "learning_rate": 1.9983730146671363e-05, |
| "loss": 0.0115, |
| "reward": 5.526045322418213, |
| "reward_std": 0.9571312367916107, |
| "rewards/mrr_reward": 0.23610491305589676, |
| "rewards/rank_analyze_format_reward": 0.7670577019453049, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.828125, |
| "epoch": 1.928, |
| "grad_norm": 0.029709013178944588, |
| "kl": 0.008957862854003906, |
| "learning_rate": 1.9983586523517175e-05, |
| "loss": -0.0143, |
| "reward": 6.118180155754089, |
| "reward_std": 0.942285418510437, |
| "rewards/mrr_reward": 0.36377108097076416, |
| "rewards/rank_analyze_format_reward": 0.7112708389759064, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964202791452408, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9964202791452408, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.921875, |
| "epoch": 1.936, |
| "grad_norm": 0.032562170177698135, |
| "kl": 0.00933074951171875, |
| "learning_rate": 1.9983442269745073e-05, |
| "loss": -0.0013, |
| "reward": 5.5490440130233765, |
| "reward_std": 0.6350295543670654, |
| "rewards/mrr_reward": 0.20087425410747528, |
| "rewards/rank_analyze_format_reward": 0.833014503121376, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.125, |
| "epoch": 1.944, |
| "grad_norm": 0.034032415598630905, |
| "kl": 0.008647918701171875, |
| "learning_rate": 1.9983297385364166e-05, |
| "loss": 0.0212, |
| "reward": 5.475777506828308, |
| "reward_std": 0.5296052135527134, |
| "rewards/mrr_reward": 0.19027157872915268, |
| "rewards/rank_analyze_format_reward": 0.7654724419116974, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.125, |
| "epoch": 1.952, |
| "grad_norm": 0.03090350329875946, |
| "kl": 0.009029388427734375, |
| "learning_rate": 1.9983151870383614e-05, |
| "loss": -0.0167, |
| "reward": 5.95075786113739, |
| "reward_std": 0.9621450752019882, |
| "rewards/mrr_reward": 0.3183469697833061, |
| "rewards/rank_analyze_format_reward": 0.8020616918802261, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.34375, |
| "epoch": 1.96, |
| "grad_norm": 0.031676217913627625, |
| "kl": 0.009099960327148438, |
| "learning_rate": 1.99830057248126e-05, |
| "loss": -0.0003, |
| "reward": 5.529332995414734, |
| "reward_std": 0.665337011218071, |
| "rewards/mrr_reward": 0.19913194328546524, |
| "rewards/rank_analyze_format_reward": 0.84761643409729, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.1875, |
| "epoch": 1.968, |
| "grad_norm": 0.03511827066540718, |
| "kl": 0.00841522216796875, |
| "learning_rate": 1.9982858948660363e-05, |
| "loss": -0.0093, |
| "reward": 5.8812315464019775, |
| "reward_std": 0.9640037417411804, |
| "rewards/mrr_reward": 0.3079365137964487, |
| "rewards/rank_analyze_format_reward": 0.8350324183702469, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.09375, |
| "epoch": 1.976, |
| "grad_norm": 0.03402575105428696, |
| "kl": 0.008482933044433594, |
| "learning_rate": 1.9982711541936167e-05, |
| "loss": 0.0024, |
| "reward": 5.7571070194244385, |
| "reward_std": 1.066563904285431, |
| "rewards/mrr_reward": 0.3015996962785721, |
| "rewards/rank_analyze_format_reward": 0.7728263139724731, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9660893976688385, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.421875, |
| "epoch": 1.984, |
| "grad_norm": 0.0321134515106678, |
| "kl": 0.010736465454101562, |
| "learning_rate": 1.9982563504649327e-05, |
| "loss": -0.0097, |
| "reward": 5.307972192764282, |
| "reward_std": 0.8248837888240814, |
| "rewards/mrr_reward": 0.1758122555911541, |
| "rewards/rank_analyze_format_reward": 0.7355825752019882, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.71875, |
| "epoch": 1.992, |
| "grad_norm": 0.030712375417351723, |
| "kl": 0.009143829345703125, |
| "learning_rate": 1.998241483680919e-05, |
| "loss": -0.0232, |
| "reward": 5.721879601478577, |
| "reward_std": 0.6676881909370422, |
| "rewards/mrr_reward": 0.2327939011156559, |
| "rewards/rank_analyze_format_reward": 0.8345478177070618, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.014133165590465069, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.15625, |
| "epoch": 2.0, |
| "grad_norm": 0.03419603407382965, |
| "kl": 0.010951995849609375, |
| "learning_rate": 1.9982265538425157e-05, |
| "loss": 0.026, |
| "reward": 5.291154146194458, |
| "reward_std": 0.8919351100921631, |
| "rewards/mrr_reward": 0.19232391379773617, |
| "rewards/rank_analyze_format_reward": 0.7730166912078857, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9964912384748459, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9652412384748459, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.21875, |
| "epoch": 2.008, |
| "grad_norm": 0.033153921365737915, |
| "kl": 0.010362625122070312, |
| "learning_rate": 1.9982115609506648e-05, |
| "loss": -0.0114, |
| "reward": 5.587164759635925, |
| "reward_std": 0.6778712831437588, |
| "rewards/mrr_reward": 0.20784350484609604, |
| "rewards/rank_analyze_format_reward": 0.8671189993619919, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.625, |
| "epoch": 2.016, |
| "grad_norm": 0.03214149549603462, |
| "kl": 0.00939178466796875, |
| "learning_rate": 1.9981965050063134e-05, |
| "loss": 0.0123, |
| "reward": 5.736871004104614, |
| "reward_std": 0.46378058195114136, |
| "rewards/mrr_reward": 0.2288566492497921, |
| "rewards/rank_analyze_format_reward": 0.8448817729949951, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 686.265625, |
| "epoch": 2.024, |
| "grad_norm": 0.031552232801914215, |
| "kl": 0.008321762084960938, |
| "learning_rate": 1.998181386010413e-05, |
| "loss": -0.0037, |
| "reward": 5.441847443580627, |
| "reward_std": 0.4925037622451782, |
| "rewards/mrr_reward": 0.16958706080913544, |
| "rewards/rank_analyze_format_reward": 0.83710116147995, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.078125, |
| "epoch": 2.032, |
| "grad_norm": 0.030457468703389168, |
| "kl": 0.0079193115234375, |
| "learning_rate": 1.9981662039639182e-05, |
| "loss": -0.0145, |
| "reward": 5.886712431907654, |
| "reward_std": 0.6978030279278755, |
| "rewards/mrr_reward": 0.29466763883829117, |
| "rewards/rank_analyze_format_reward": 0.8364828526973724, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.484375, |
| "epoch": 2.04, |
| "grad_norm": 0.033485788851976395, |
| "kl": 0.010133743286132812, |
| "learning_rate": 1.9981509588677883e-05, |
| "loss": -0.0244, |
| "reward": 5.449910879135132, |
| "reward_std": 0.9446172118186951, |
| "rewards/mrr_reward": 0.21412449702620506, |
| "rewards/rank_analyze_format_reward": 0.7960726916790009, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.15625, |
| "epoch": 2.048, |
| "grad_norm": 0.03260404244065285, |
| "kl": 0.011867523193359375, |
| "learning_rate": 1.9981356507229862e-05, |
| "loss": -0.0329, |
| "reward": 5.332253098487854, |
| "reward_std": 0.8807602822780609, |
| "rewards/mrr_reward": 0.2012710850685835, |
| "rewards/rank_analyze_format_reward": 0.7252326309680939, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9956946671009064, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9956946671009064, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.078125, |
| "epoch": 2.056, |
| "grad_norm": 0.03169155865907669, |
| "kl": 0.008187294006347656, |
| "learning_rate": 1.9981202795304787e-05, |
| "loss": -0.0265, |
| "reward": 5.487691640853882, |
| "reward_std": 0.5739990789443254, |
| "rewards/mrr_reward": 0.19051960110664368, |
| "rewards/rank_analyze_format_reward": 0.7804432064294815, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.009314903989434242, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.890625, |
| "epoch": 2.064, |
| "grad_norm": 0.03203202411532402, |
| "kl": 0.010366439819335938, |
| "learning_rate": 1.9981048452912364e-05, |
| "loss": -0.036, |
| "reward": 5.627110004425049, |
| "reward_std": 0.7383934706449509, |
| "rewards/mrr_reward": 0.21320684999227524, |
| "rewards/rank_analyze_format_reward": 0.8495243489742279, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.34375, |
| "epoch": 2.072, |
| "grad_norm": 0.031892187893390656, |
| "kl": 0.008783340454101562, |
| "learning_rate": 1.998089348006235e-05, |
| "loss": -0.0254, |
| "reward": 5.347836494445801, |
| "reward_std": 0.6561598926782608, |
| "rewards/mrr_reward": 0.16795635037124157, |
| "rewards/rank_analyze_format_reward": 0.8007026761770248, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.203125, |
| "epoch": 2.08, |
| "grad_norm": 0.02936052717268467, |
| "kl": 0.008337020874023438, |
| "learning_rate": 1.998073787676453e-05, |
| "loss": -0.0013, |
| "reward": 5.713126063346863, |
| "reward_std": 0.6685744076967239, |
| "rewards/mrr_reward": 0.2293526791036129, |
| "rewards/rank_analyze_format_reward": 0.8230589926242828, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 678.53125, |
| "epoch": 2.088, |
| "grad_norm": 0.03272569924592972, |
| "kl": 0.008749008178710938, |
| "learning_rate": 1.9980581643028732e-05, |
| "loss": -0.0006, |
| "reward": 5.963893890380859, |
| "reward_std": 0.8036399632692337, |
| "rewards/mrr_reward": 0.29525669291615486, |
| "rewards/rank_analyze_format_reward": 0.8180233091115952, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.125, |
| "epoch": 2.096, |
| "grad_norm": 0.032431650906801224, |
| "kl": 0.009164810180664062, |
| "learning_rate": 1.9980424778864825e-05, |
| "loss": 0.0079, |
| "reward": 5.4519102573394775, |
| "reward_std": 0.6584747061133385, |
| "rewards/mrr_reward": 0.17532242834568024, |
| "rewards/rank_analyze_format_reward": 0.8502301275730133, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.734375, |
| "epoch": 2.104, |
| "grad_norm": 0.033844608813524246, |
| "kl": 0.008821487426757812, |
| "learning_rate": 1.9980267284282718e-05, |
| "loss": -0.0156, |
| "reward": 5.51776909828186, |
| "reward_std": 0.661251924932003, |
| "rewards/mrr_reward": 0.21566840261220932, |
| "rewards/rank_analyze_format_reward": 0.7573655396699905, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.515625, |
| "epoch": 2.112, |
| "grad_norm": 0.03329053893685341, |
| "kl": 0.009187698364257812, |
| "learning_rate": 1.998010915929236e-05, |
| "loss": 0.0021, |
| "reward": 5.585361957550049, |
| "reward_std": 0.6540718153119087, |
| "rewards/mrr_reward": 0.20502851717174053, |
| "rewards/rank_analyze_format_reward": 0.8237265795469284, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.28125, |
| "epoch": 2.12, |
| "grad_norm": 0.03286939114332199, |
| "kl": 0.008611679077148438, |
| "learning_rate": 1.9979950403903732e-05, |
| "loss": -0.0075, |
| "reward": 5.393820524215698, |
| "reward_std": 0.587890163064003, |
| "rewards/mrr_reward": 0.1768353171646595, |
| "rewards/rank_analyze_format_reward": 0.795545905828476, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.484375, |
| "epoch": 2.128, |
| "grad_norm": 0.03381568565964699, |
| "kl": 0.009261131286621094, |
| "learning_rate": 1.9979791018126874e-05, |
| "loss": -0.0234, |
| "reward": 5.706428170204163, |
| "reward_std": 0.9137073345482349, |
| "rewards/mrr_reward": 0.25823412649333477, |
| "rewards/rank_analyze_format_reward": 0.8322576582431793, |
| "rewards/rank_answer_foramt_reward": 0.85546875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9967888593673706, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9967888593673706, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.21875, |
| "epoch": 2.136, |
| "grad_norm": 0.03102073445916176, |
| "kl": 0.0093231201171875, |
| "learning_rate": 1.9979631001971848e-05, |
| "loss": -0.0296, |
| "reward": 5.800906300544739, |
| "reward_std": 1.087342880666256, |
| "rewards/mrr_reward": 0.30744667910039425, |
| "rewards/rank_analyze_format_reward": 0.7718681544065475, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9826335161924362, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.421875, |
| "epoch": 2.144, |
| "grad_norm": 0.031345415860414505, |
| "kl": 0.008287429809570312, |
| "learning_rate": 1.9979470355448756e-05, |
| "loss": -0.0282, |
| "reward": 5.249317646026611, |
| "reward_std": 0.5138699784874916, |
| "rewards/mrr_reward": 0.144283227622509, |
| "rewards/rank_analyze_format_reward": 0.7632499039173126, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9818111509084702, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.96875, |
| "epoch": 2.152, |
| "grad_norm": 0.03057401068508625, |
| "kl": 0.00954437255859375, |
| "learning_rate": 1.9979309078567756e-05, |
| "loss": 0.0248, |
| "reward": 5.779935836791992, |
| "reward_std": 0.8248593732714653, |
| "rewards/mrr_reward": 0.28400298207998276, |
| "rewards/rank_analyze_format_reward": 0.7388757467269897, |
| "rewards/rank_answer_foramt_reward": 0.8984375, |
| "rewards/rank_contrast_format_reward": 0.014423076994717121, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.84375, |
| "epoch": 2.16, |
| "grad_norm": 0.02877761982381344, |
| "kl": 0.008660316467285156, |
| "learning_rate": 1.9979147171339022e-05, |
| "loss": 0.0112, |
| "reward": 5.953364610671997, |
| "reward_std": 1.0477607250213623, |
| "rewards/mrr_reward": 0.30703745037317276, |
| "rewards/rank_analyze_format_reward": 0.8443555235862732, |
| "rewards/rank_answer_foramt_reward": 0.896484375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.125, |
| "epoch": 2.168, |
| "grad_norm": 0.03201289847493172, |
| "kl": 0.011377334594726562, |
| "learning_rate": 1.9978984633772795e-05, |
| "loss": -0.0223, |
| "reward": 5.34253454208374, |
| "reward_std": 1.0002544522285461, |
| "rewards/mrr_reward": 0.2072482742369175, |
| "rewards/rank_analyze_format_reward": 0.7518228143453598, |
| "rewards/rank_answer_foramt_reward": 0.81640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.765625, |
| "epoch": 2.176, |
| "grad_norm": 0.033372726291418076, |
| "kl": 0.008876800537109375, |
| "learning_rate": 1.9978821465879332e-05, |
| "loss": 0.0035, |
| "reward": 5.907817602157593, |
| "reward_std": 0.8581305295228958, |
| "rewards/mrr_reward": 0.3209015391767025, |
| "rewards/rank_analyze_format_reward": 0.7235125303268433, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.609375, |
| "epoch": 2.184, |
| "grad_norm": 0.03363556042313576, |
| "kl": 0.009851455688476562, |
| "learning_rate": 1.9978657667668945e-05, |
| "loss": 0.0006, |
| "reward": 5.586699724197388, |
| "reward_std": 0.679840974509716, |
| "rewards/mrr_reward": 0.22039930522441864, |
| "rewards/rank_analyze_format_reward": 0.7949463129043579, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.359375, |
| "epoch": 2.192, |
| "grad_norm": 0.030939241871237755, |
| "kl": 0.008955001831054688, |
| "learning_rate": 1.9978493239151976e-05, |
| "loss": 0.0165, |
| "reward": 5.984506607055664, |
| "reward_std": 1.1070766001939774, |
| "rewards/mrr_reward": 0.35327382013201714, |
| "rewards/rank_analyze_format_reward": 0.7415075749158859, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9794049561023712, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9794049561023712, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.21875, |
| "epoch": 2.2, |
| "grad_norm": 0.03531678020954132, |
| "kl": 0.008274078369140625, |
| "learning_rate": 1.997832818033881e-05, |
| "loss": -0.0297, |
| "reward": 5.348074316978455, |
| "reward_std": 0.7185068726539612, |
| "rewards/mrr_reward": 0.1775855701416731, |
| "rewards/rank_analyze_format_reward": 0.7607788443565369, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.109375, |
| "epoch": 2.208, |
| "grad_norm": 0.03195611387491226, |
| "kl": 0.00879669189453125, |
| "learning_rate": 1.9978162491239882e-05, |
| "loss": -0.0057, |
| "reward": 5.110758066177368, |
| "reward_std": 0.6071035340428352, |
| "rewards/mrr_reward": 0.13315972313284874, |
| "rewards/rank_analyze_format_reward": 0.7337524592876434, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.46875, |
| "epoch": 2.216, |
| "grad_norm": 0.03165001794695854, |
| "kl": 0.009108543395996094, |
| "learning_rate": 1.997799617186565e-05, |
| "loss": 0.0002, |
| "reward": 5.748872637748718, |
| "reward_std": 0.6040460020303726, |
| "rewards/mrr_reward": 0.23888888955116272, |
| "rewards/rank_analyze_format_reward": 0.8343324214220047, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.9375, |
| "epoch": 2.224, |
| "grad_norm": 0.03219401836395264, |
| "kl": 0.008958816528320312, |
| "learning_rate": 1.9977829222226622e-05, |
| "loss": -0.0102, |
| "reward": 6.228976130485535, |
| "reward_std": 1.0323386192321777, |
| "rewards/mrr_reward": 0.3954737111926079, |
| "rewards/rank_analyze_format_reward": 0.7388781309127808, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.625, |
| "epoch": 2.232, |
| "grad_norm": 0.03455818444490433, |
| "kl": 0.008190155029296875, |
| "learning_rate": 1.9977661642333344e-05, |
| "loss": -0.0149, |
| "reward": 5.373299837112427, |
| "reward_std": 0.8532019183039665, |
| "rewards/mrr_reward": 0.17197420820593834, |
| "rewards/rank_analyze_format_reward": 0.8064966201782227, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.390625, |
| "epoch": 2.24, |
| "grad_norm": 0.029518628492951393, |
| "kl": 0.008675575256347656, |
| "learning_rate": 1.99774934321964e-05, |
| "loss": 0.0097, |
| "reward": 5.885105848312378, |
| "reward_std": 0.8752723336219788, |
| "rewards/mrr_reward": 0.29821430146694183, |
| "rewards/rank_analyze_format_reward": 0.8387329578399658, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 695.234375, |
| "epoch": 2.248, |
| "grad_norm": 0.0311732180416584, |
| "kl": 0.00806427001953125, |
| "learning_rate": 1.9977324591826415e-05, |
| "loss": 0.0088, |
| "reward": 5.816622257232666, |
| "reward_std": 0.8411147147417068, |
| "rewards/mrr_reward": 0.2692398317158222, |
| "rewards/rank_analyze_format_reward": 0.8213856071233749, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.3125, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 0.032965514808893204, |
| "kl": 0.008653640747070312, |
| "learning_rate": 1.9977155121234056e-05, |
| "loss": -0.0151, |
| "reward": 5.504308104515076, |
| "reward_std": 0.7789564803242683, |
| "rewards/mrr_reward": 0.2098772320896387, |
| "rewards/rank_analyze_format_reward": 0.7714656293392181, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9954948574304581, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9954948574304581, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.296875, |
| "epoch": 2.2640000000000002, |
| "grad_norm": 0.033497847616672516, |
| "kl": 0.009673118591308594, |
| "learning_rate": 1.9976985020430022e-05, |
| "loss": -0.0045, |
| "reward": 6.024387836456299, |
| "reward_std": 0.6360235512256622, |
| "rewards/mrr_reward": 0.32351189479231834, |
| "rewards/rank_analyze_format_reward": 0.8103033602237701, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.0, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 0.03325760364532471, |
| "kl": 0.008056640625, |
| "learning_rate": 1.9976814289425066e-05, |
| "loss": 0.0057, |
| "reward": 5.442414402961731, |
| "reward_std": 0.6741410419344902, |
| "rewards/mrr_reward": 0.16527777537703514, |
| "rewards/rank_analyze_format_reward": 0.8516157567501068, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.953125, |
| "epoch": 2.2800000000000002, |
| "grad_norm": 0.035167232155799866, |
| "kl": 0.009555816650390625, |
| "learning_rate": 1.9976642928229965e-05, |
| "loss": 0.0079, |
| "reward": 5.635341167449951, |
| "reward_std": 0.738533541560173, |
| "rewards/mrr_reward": 0.22927208244800568, |
| "rewards/rank_analyze_format_reward": 0.8002839833498001, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 689.6875, |
| "epoch": 2.288, |
| "grad_norm": 0.031026914715766907, |
| "kl": 0.009326934814453125, |
| "learning_rate": 1.997647093685555e-05, |
| "loss": -0.0035, |
| "reward": 6.1463258266448975, |
| "reward_std": 0.7740766424685717, |
| "rewards/mrr_reward": 0.34895833767950535, |
| "rewards/rank_analyze_format_reward": 0.8179265707731247, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975328892469406, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9975328892469406, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.875, |
| "epoch": 2.296, |
| "grad_norm": 0.03047414869070053, |
| "kl": 0.0081024169921875, |
| "learning_rate": 1.9976298315312675e-05, |
| "loss": -0.0073, |
| "reward": 5.633584260940552, |
| "reward_std": 0.7135986983776093, |
| "rewards/mrr_reward": 0.22126116044819355, |
| "rewards/rank_analyze_format_reward": 0.816898986697197, |
| "rewards/rank_answer_foramt_reward": 0.939453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.734375, |
| "epoch": 2.304, |
| "grad_norm": 0.033806800842285156, |
| "kl": 0.008437156677246094, |
| "learning_rate": 1.9976125063612254e-05, |
| "loss": -0.0368, |
| "reward": 5.522616624832153, |
| "reward_std": 1.1119669452309608, |
| "rewards/mrr_reward": 0.2499070018529892, |
| "rewards/rank_analyze_format_reward": 0.6830301284790039, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.484375, |
| "epoch": 2.312, |
| "grad_norm": 0.031600214540958405, |
| "kl": 0.008672714233398438, |
| "learning_rate": 1.9975951181765226e-05, |
| "loss": -0.0142, |
| "reward": 5.305689573287964, |
| "reward_std": 0.5672206580638885, |
| "rewards/mrr_reward": 0.1442832387983799, |
| "rewards/rank_analyze_format_reward": 0.7910565435886383, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.421875, |
| "epoch": 2.32, |
| "grad_norm": 0.08158010989427567, |
| "kl": 0.02387237548828125, |
| "learning_rate": 1.9975776669782572e-05, |
| "loss": -0.0073, |
| "reward": 5.95476496219635, |
| "reward_std": 1.296246200799942, |
| "rewards/mrr_reward": 0.35463789105415344, |
| "rewards/rank_analyze_format_reward": 0.7332646250724792, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9962009787559509, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9962009787559509, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.859375, |
| "epoch": 2.328, |
| "grad_norm": 0.031481146812438965, |
| "kl": 0.008309364318847656, |
| "learning_rate": 1.997560152767532e-05, |
| "loss": -0.0067, |
| "reward": 5.631876707077026, |
| "reward_std": 0.6546346843242645, |
| "rewards/mrr_reward": 0.23791542649269104, |
| "rewards/rank_analyze_format_reward": 0.7969877421855927, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.013663419522345066, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9670085161924362, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.84375, |
| "epoch": 2.336, |
| "grad_norm": 0.037191130220890045, |
| "kl": 0.007427215576171875, |
| "learning_rate": 1.997542575545453e-05, |
| "loss": -0.0003, |
| "reward": 5.354902744293213, |
| "reward_std": 0.7260187715291977, |
| "rewards/mrr_reward": 0.19120163097977638, |
| "rewards/rank_analyze_format_reward": 0.7232708260416985, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.994936153292656, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.994936153292656, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.140625, |
| "epoch": 2.344, |
| "grad_norm": 0.030975518748164177, |
| "kl": 0.006802558898925781, |
| "learning_rate": 1.9975249353131304e-05, |
| "loss": 0.0211, |
| "reward": 6.162993669509888, |
| "reward_std": 0.28368850238621235, |
| "rewards/mrr_reward": 0.3227430731058121, |
| "rewards/rank_analyze_format_reward": 0.8875313103199005, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 687.78125, |
| "epoch": 2.352, |
| "grad_norm": 0.03459201753139496, |
| "kl": 0.011088371276855469, |
| "learning_rate": 1.9975072320716785e-05, |
| "loss": 0.0123, |
| "reward": 5.5102492570877075, |
| "reward_std": 1.0369550734758377, |
| "rewards/mrr_reward": 0.25905878841876984, |
| "rewards/rank_analyze_format_reward": 0.7301295399665833, |
| "rewards/rank_answer_foramt_reward": 0.826171875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9979188442230225, |
| "rewards/rank_overall_format_reward_more": 0.9375, |
| "rewards/rank_verify_format_reward": 0.9822938442230225, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 627.734375, |
| "epoch": 2.36, |
| "grad_norm": 0.031542547047138214, |
| "kl": 0.008196830749511719, |
| "learning_rate": 1.997489465822216e-05, |
| "loss": 0.0003, |
| "reward": 6.191069960594177, |
| "reward_std": 0.9258007109165192, |
| "rewards/mrr_reward": 0.3952629007399082, |
| "rewards/rank_analyze_format_reward": 0.7525966763496399, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.3125, |
| "epoch": 2.368, |
| "grad_norm": 0.03178432211279869, |
| "kl": 0.008670806884765625, |
| "learning_rate": 1.9974716365658646e-05, |
| "loss": -0.0171, |
| "reward": 5.212967276573181, |
| "reward_std": 0.41466130316257477, |
| "rewards/mrr_reward": 0.1444692499935627, |
| "rewards/rank_analyze_format_reward": 0.7190747410058975, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.453125, |
| "epoch": 2.376, |
| "grad_norm": 0.03690677136182785, |
| "kl": 0.008281707763671875, |
| "learning_rate": 1.9974537443037504e-05, |
| "loss": -0.0293, |
| "reward": 5.413174152374268, |
| "reward_std": 0.7004074454307556, |
| "rewards/mrr_reward": 0.18989335373044014, |
| "rewards/rank_analyze_format_reward": 0.798017293214798, |
| "rewards/rank_answer_foramt_reward": 0.857421875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.140625, |
| "epoch": 2.384, |
| "grad_norm": 0.03291052579879761, |
| "kl": 0.008054733276367188, |
| "learning_rate": 1.9974357890370038e-05, |
| "loss": -0.0051, |
| "reward": 5.8681100606918335, |
| "reward_std": 0.7517407834529877, |
| "rewards/mrr_reward": 0.2819134518504143, |
| "rewards/rank_analyze_format_reward": 0.8143666237592697, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.84375, |
| "epoch": 2.392, |
| "grad_norm": 0.03135626018047333, |
| "kl": 0.00891876220703125, |
| "learning_rate": 1.9974177707667594e-05, |
| "loss": -0.0061, |
| "reward": 5.645070552825928, |
| "reward_std": 0.483820416033268, |
| "rewards/mrr_reward": 0.22468998655676842, |
| "rewards/rank_analyze_format_reward": 0.7951385527849197, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.03125, |
| "epoch": 2.4, |
| "grad_norm": 0.03234223648905754, |
| "kl": 0.008310317993164062, |
| "learning_rate": 1.9973996894941545e-05, |
| "loss": -0.0066, |
| "reward": 6.414054274559021, |
| "reward_std": 0.9432376772165298, |
| "rewards/mrr_reward": 0.40456970781087875, |
| "rewards/rank_analyze_format_reward": 0.8504632115364075, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.9375, |
| "epoch": 2.408, |
| "grad_norm": 0.03262539207935333, |
| "kl": 0.0074558258056640625, |
| "learning_rate": 1.9973815452203314e-05, |
| "loss": -0.0375, |
| "reward": 5.410915374755859, |
| "reward_std": 0.6377813890576363, |
| "rewards/mrr_reward": 0.2176587451249361, |
| "rewards/rank_analyze_format_reward": 0.7108280807733536, |
| "rewards/rank_answer_foramt_reward": 0.845703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9957809001207352, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9957809001207352, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.0, |
| "epoch": 2.416, |
| "grad_norm": 0.03086378425359726, |
| "kl": 0.008134841918945312, |
| "learning_rate": 1.997363337946437e-05, |
| "loss": -0.0071, |
| "reward": 5.332266926765442, |
| "reward_std": 0.7226725369691849, |
| "rewards/mrr_reward": 0.16749132610857487, |
| "rewards/rank_analyze_format_reward": 0.7792825102806091, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9961971044540405, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9961971044540405, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.296875, |
| "epoch": 2.424, |
| "grad_norm": 0.03537590056657791, |
| "kl": 0.008157730102539062, |
| "learning_rate": 1.9973450676736205e-05, |
| "loss": -0.0204, |
| "reward": 5.607993245124817, |
| "reward_std": 0.5448657497763634, |
| "rewards/mrr_reward": 0.2038070484995842, |
| "rewards/rank_analyze_format_reward": 0.8434313237667084, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.46875, |
| "epoch": 2.432, |
| "grad_norm": 0.030700810253620148, |
| "kl": 0.008941650390625, |
| "learning_rate": 1.997326734403036e-05, |
| "loss": -0.0217, |
| "reward": 5.499999642372131, |
| "reward_std": 0.7909096032381058, |
| "rewards/mrr_reward": 0.20515872910618782, |
| "rewards/rank_analyze_format_reward": 0.8101091831922531, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.234375, |
| "epoch": 2.44, |
| "grad_norm": 0.031224045902490616, |
| "kl": 0.007833480834960938, |
| "learning_rate": 1.997308338135842e-05, |
| "loss": 0.0108, |
| "reward": 5.637946367263794, |
| "reward_std": 0.6872468590736389, |
| "rewards/mrr_reward": 0.2270585335791111, |
| "rewards/rank_analyze_format_reward": 0.8155348151922226, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.90625, |
| "epoch": 2.448, |
| "grad_norm": 0.034319475293159485, |
| "kl": 0.009645462036132812, |
| "learning_rate": 1.9972898788732e-05, |
| "loss": -0.0273, |
| "reward": 5.709458708763123, |
| "reward_std": 0.8589234948158264, |
| "rewards/mrr_reward": 0.2706225086003542, |
| "rewards/rank_analyze_format_reward": 0.7733380496501923, |
| "rewards/rank_answer_foramt_reward": 0.86328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.84375, |
| "epoch": 2.456, |
| "grad_norm": 0.03254551440477371, |
| "kl": 0.00882720947265625, |
| "learning_rate": 1.9972713566162763e-05, |
| "loss": -0.0144, |
| "reward": 5.992625951766968, |
| "reward_std": 0.5823017284274101, |
| "rewards/mrr_reward": 0.2946366611868143, |
| "rewards/rank_analyze_format_reward": 0.8765792399644852, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.28125, |
| "epoch": 2.464, |
| "grad_norm": 0.03191380575299263, |
| "kl": 0.008279800415039062, |
| "learning_rate": 1.997252771366241e-05, |
| "loss": -0.0266, |
| "reward": 5.676502346992493, |
| "reward_std": 0.8015426993370056, |
| "rewards/mrr_reward": 0.2450086809694767, |
| "rewards/rank_analyze_format_reward": 0.7492017894983292, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.546875, |
| "epoch": 2.472, |
| "grad_norm": 0.03314831480383873, |
| "kl": 0.008116722106933594, |
| "learning_rate": 1.9972341231242675e-05, |
| "loss": -0.0008, |
| "reward": 5.710240483283997, |
| "reward_std": 0.6632269471883774, |
| "rewards/mrr_reward": 0.2543836794793606, |
| "rewards/rank_analyze_format_reward": 0.7747371196746826, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.03125, |
| "epoch": 2.48, |
| "grad_norm": 0.03377654030919075, |
| "kl": 0.009763717651367188, |
| "learning_rate": 1.9972154118915344e-05, |
| "loss": -0.0154, |
| "reward": 5.755140542984009, |
| "reward_std": 0.6953508257865906, |
| "rewards/mrr_reward": 0.27708953991532326, |
| "rewards/rank_analyze_format_reward": 0.699516773223877, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.53125, |
| "epoch": 2.488, |
| "grad_norm": 0.03434586524963379, |
| "kl": 0.009517669677734375, |
| "learning_rate": 1.997196637669223e-05, |
| "loss": -0.0056, |
| "reward": 5.399365782737732, |
| "reward_std": 0.6744739785790443, |
| "rewards/mrr_reward": 0.1757440436631441, |
| "rewards/rank_analyze_format_reward": 0.8096708953380585, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.125, |
| "epoch": 2.496, |
| "grad_norm": 0.03466728329658508, |
| "kl": 0.010303497314453125, |
| "learning_rate": 1.99717780045852e-05, |
| "loss": -0.0225, |
| "reward": 5.5405789613723755, |
| "reward_std": 0.7150269001722336, |
| "rewards/mrr_reward": 0.21937625110149384, |
| "rewards/rank_analyze_format_reward": 0.7294801473617554, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.421875, |
| "epoch": 2.504, |
| "grad_norm": 0.030946679413318634, |
| "kl": 0.007822036743164062, |
| "learning_rate": 1.997158900260614e-05, |
| "loss": -0.0271, |
| "reward": 5.6557512283325195, |
| "reward_std": 0.5378784239292145, |
| "rewards/mrr_reward": 0.21952505223453045, |
| "rewards/rank_analyze_format_reward": 0.799135148525238, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.078125, |
| "epoch": 2.512, |
| "grad_norm": 0.03168868273496628, |
| "kl": 0.007956504821777344, |
| "learning_rate": 1.9971399370767e-05, |
| "loss": -0.0138, |
| "reward": 5.643744587898254, |
| "reward_std": 0.6119559705257416, |
| "rewards/mrr_reward": 0.21652406081557274, |
| "rewards/rank_analyze_format_reward": 0.8040668964385986, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975329041481018, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9975329041481018, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.828125, |
| "epoch": 2.52, |
| "grad_norm": 0.03241531923413277, |
| "kl": 0.009516716003417969, |
| "learning_rate": 1.9971209109079752e-05, |
| "loss": -0.0025, |
| "reward": 5.7852044105529785, |
| "reward_std": 0.7106733173131943, |
| "rewards/mrr_reward": 0.2660466283559799, |
| "rewards/rank_analyze_format_reward": 0.7932835072278976, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.78125, |
| "epoch": 2.528, |
| "grad_norm": 0.032261837273836136, |
| "kl": 0.009158134460449219, |
| "learning_rate": 1.9971018217556416e-05, |
| "loss": -0.0131, |
| "reward": 5.741572380065918, |
| "reward_std": 0.9340634196996689, |
| "rewards/mrr_reward": 0.252250749617815, |
| "rewards/rank_analyze_format_reward": 0.8470719158649445, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.46875, |
| "epoch": 2.536, |
| "grad_norm": 0.03570343554019928, |
| "kl": 0.008861541748046875, |
| "learning_rate": 1.997082669620905e-05, |
| "loss": -0.0283, |
| "reward": 5.575627684593201, |
| "reward_std": 0.59528449177742, |
| "rewards/mrr_reward": 0.20295760035514832, |
| "rewards/rank_analyze_format_reward": 0.814463660120964, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.09375, |
| "epoch": 2.544, |
| "grad_norm": 0.031063677743077278, |
| "kl": 0.007955551147460938, |
| "learning_rate": 1.997063454504975e-05, |
| "loss": -0.0086, |
| "reward": 5.3937273025512695, |
| "reward_std": 0.3589708264917135, |
| "rewards/mrr_reward": 0.14628596417605877, |
| "rewards/rank_analyze_format_reward": 0.853196918964386, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.078125, |
| "epoch": 2.552, |
| "grad_norm": 0.03542445972561836, |
| "kl": 0.008488655090332031, |
| "learning_rate": 1.9970441764090654e-05, |
| "loss": 0.0057, |
| "reward": 5.481135725975037, |
| "reward_std": 0.5193400681018829, |
| "rewards/mrr_reward": 0.1921502985060215, |
| "rewards/rank_analyze_format_reward": 0.7550382316112518, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 625.046875, |
| "epoch": 2.56, |
| "grad_norm": 0.033270470798015594, |
| "kl": 0.010507583618164062, |
| "learning_rate": 1.9970248353343943e-05, |
| "loss": -0.0402, |
| "reward": 5.634747266769409, |
| "reward_std": 0.6059275269508362, |
| "rewards/mrr_reward": 0.22289186716079712, |
| "rewards/rank_analyze_format_reward": 0.8007473796606064, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985600560903549, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9985600560903549, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.25, |
| "epoch": 2.568, |
| "grad_norm": 0.03648350015282631, |
| "kl": 0.009977340698242188, |
| "learning_rate": 1.997005431282183e-05, |
| "loss": -0.0263, |
| "reward": 5.567351460456848, |
| "reward_std": 0.9927941262722015, |
| "rewards/mrr_reward": 0.21014384925365448, |
| "rewards/rank_analyze_format_reward": 0.795135423541069, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 694.09375, |
| "epoch": 2.576, |
| "grad_norm": 0.030762799084186554, |
| "kl": 0.009020805358886719, |
| "learning_rate": 1.996985964253657e-05, |
| "loss": -0.0093, |
| "reward": 5.783640742301941, |
| "reward_std": 0.7187513560056686, |
| "rewards/mrr_reward": 0.2635354623198509, |
| "rewards/rank_analyze_format_reward": 0.8368059247732162, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.828125, |
| "epoch": 2.584, |
| "grad_norm": 0.034205637872219086, |
| "kl": 0.008504867553710938, |
| "learning_rate": 1.996966434250046e-05, |
| "loss": 0.0016, |
| "reward": 5.436808228492737, |
| "reward_std": 0.7342798858880997, |
| "rewards/mrr_reward": 0.18624752573668957, |
| "rewards/rank_analyze_format_reward": 0.8596720993518829, |
| "rewards/rank_answer_foramt_reward": 0.833984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.828125, |
| "epoch": 2.592, |
| "grad_norm": 0.03484297916293144, |
| "kl": 0.011442184448242188, |
| "learning_rate": 1.996946841272584e-05, |
| "loss": -0.0141, |
| "reward": 5.498148679733276, |
| "reward_std": 0.6755934655666351, |
| "rewards/mrr_reward": 0.1958395354449749, |
| "rewards/rank_analyze_format_reward": 0.8358840942382812, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.140625, |
| "epoch": 2.6, |
| "grad_norm": 0.032932061702013016, |
| "kl": 0.00991058349609375, |
| "learning_rate": 1.9969271853225083e-05, |
| "loss": -0.0066, |
| "reward": 5.963220715522766, |
| "reward_std": 0.5944485515356064, |
| "rewards/mrr_reward": 0.29027777537703514, |
| "rewards/rank_analyze_format_reward": 0.8310981541872025, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.640625, |
| "epoch": 2.608, |
| "grad_norm": 0.032074637711048126, |
| "kl": 0.008647918701171875, |
| "learning_rate": 1.9969074664010605e-05, |
| "loss": 0.0031, |
| "reward": 5.655932188034058, |
| "reward_std": 0.41921700816601515, |
| "rewards/mrr_reward": 0.20592138171195984, |
| "rewards/rank_analyze_format_reward": 0.8595906496047974, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.78125, |
| "epoch": 2.616, |
| "grad_norm": 0.033006053417921066, |
| "kl": 0.00919342041015625, |
| "learning_rate": 1.9968876845094864e-05, |
| "loss": 0.0, |
| "reward": 5.528472542762756, |
| "reward_std": 0.41449059918522835, |
| "rewards/mrr_reward": 0.17598586156964302, |
| "rewards/rank_analyze_format_reward": 0.8398456275463104, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.734375, |
| "epoch": 2.624, |
| "grad_norm": 0.034119799733161926, |
| "kl": 0.009557723999023438, |
| "learning_rate": 1.996867839649035e-05, |
| "loss": -0.0152, |
| "reward": 5.444994330406189, |
| "reward_std": 0.7444438338279724, |
| "rewards/mrr_reward": 0.18702257797122002, |
| "rewards/rank_analyze_format_reward": 0.7671014666557312, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.71875, |
| "epoch": 2.632, |
| "grad_norm": 0.034841958433389664, |
| "kl": 0.009944915771484375, |
| "learning_rate": 1.9968479318209603e-05, |
| "loss": 0.0103, |
| "reward": 6.070975661277771, |
| "reward_std": 1.140429526567459, |
| "rewards/mrr_reward": 0.3466765880584717, |
| "rewards/rank_analyze_format_reward": 0.8070079386234283, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.140625, |
| "epoch": 2.64, |
| "grad_norm": 0.03080068528652191, |
| "kl": 0.009492874145507812, |
| "learning_rate": 1.9968279610265194e-05, |
| "loss": 0.0229, |
| "reward": 5.357762455940247, |
| "reward_std": 0.6760208085179329, |
| "rewards/mrr_reward": 0.16169394925236702, |
| "rewards/rank_analyze_format_reward": 0.8436842709779739, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.25, |
| "epoch": 2.648, |
| "grad_norm": 0.03430721163749695, |
| "kl": 0.008755683898925781, |
| "learning_rate": 1.9968079272669744e-05, |
| "loss": 0.0123, |
| "reward": 5.500509142875671, |
| "reward_std": 0.5423839017748833, |
| "rewards/mrr_reward": 0.19489708170294762, |
| "rewards/rank_analyze_format_reward": 0.8066286146640778, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.5625, |
| "epoch": 2.656, |
| "grad_norm": 0.03663242235779762, |
| "kl": 0.011625289916992188, |
| "learning_rate": 1.9967878305435902e-05, |
| "loss": -0.0071, |
| "reward": 5.21767783164978, |
| "reward_std": 0.6230617165565491, |
| "rewards/mrr_reward": 0.15186012163758278, |
| "rewards/rank_analyze_format_reward": 0.7214507311582565, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.84375, |
| "epoch": 2.664, |
| "grad_norm": 0.032446544617414474, |
| "kl": 0.010744094848632812, |
| "learning_rate": 1.9967676708576362e-05, |
| "loss": -0.0252, |
| "reward": 5.431139588356018, |
| "reward_std": 0.38856903836131096, |
| "rewards/mrr_reward": 0.1630394347012043, |
| "rewards/rank_analyze_format_reward": 0.820786789059639, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.014835858717560768, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.96875, |
| "epoch": 2.672, |
| "grad_norm": 0.03309661149978638, |
| "kl": 0.00853729248046875, |
| "learning_rate": 1.9967474482103863e-05, |
| "loss": 0.0207, |
| "reward": 5.812266111373901, |
| "reward_std": 0.6310017332434654, |
| "rewards/mrr_reward": 0.2564794160425663, |
| "rewards/rank_analyze_format_reward": 0.8384659141302109, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.546875, |
| "epoch": 2.68, |
| "grad_norm": 0.032147496938705444, |
| "kl": 0.009319305419921875, |
| "learning_rate": 1.996727162603117e-05, |
| "loss": -0.0164, |
| "reward": 5.495377421379089, |
| "reward_std": 0.5436971858143806, |
| "rewards/mrr_reward": 0.22277406230568886, |
| "rewards/rank_analyze_format_reward": 0.6531094089150429, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.296875, |
| "epoch": 2.6879999999999997, |
| "grad_norm": 0.03020451031625271, |
| "kl": 0.008257865905761719, |
| "learning_rate": 1.9967068140371103e-05, |
| "loss": -0.023, |
| "reward": 6.15105414390564, |
| "reward_std": 0.7604061029851437, |
| "rewards/mrr_reward": 0.35879215970635414, |
| "rewards/rank_analyze_format_reward": 0.7471354156732559, |
| "rewards/rank_answer_foramt_reward": 0.984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.15625, |
| "epoch": 2.6959999999999997, |
| "grad_norm": 0.03703196346759796, |
| "kl": 0.009157180786132812, |
| "learning_rate": 1.9966864025136518e-05, |
| "loss": 0.0097, |
| "reward": 5.549162268638611, |
| "reward_std": 0.5662261173129082, |
| "rewards/mrr_reward": 0.1858568899333477, |
| "rewards/rank_analyze_format_reward": 0.8500397950410843, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.375, |
| "epoch": 2.7039999999999997, |
| "grad_norm": 0.03430277109146118, |
| "kl": 0.009218215942382812, |
| "learning_rate": 1.99666592803403e-05, |
| "loss": 0.0008, |
| "reward": 6.126144886016846, |
| "reward_std": 0.626649871468544, |
| "rewards/mrr_reward": 0.3454737141728401, |
| "rewards/rank_analyze_format_reward": 0.7743111848831177, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998641312122345, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.998641312122345, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 688.765625, |
| "epoch": 2.7119999999999997, |
| "grad_norm": 0.03879823163151741, |
| "kl": 0.008647918701171875, |
| "learning_rate": 1.9966453905995386e-05, |
| "loss": 0.0293, |
| "reward": 5.698531866073608, |
| "reward_std": 0.5574172139167786, |
| "rewards/mrr_reward": 0.22999751940369606, |
| "rewards/rank_analyze_format_reward": 0.8449480086565018, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.390625, |
| "epoch": 2.7199999999999998, |
| "grad_norm": 0.034662626683712006, |
| "kl": 0.009153366088867188, |
| "learning_rate": 1.996624790211475e-05, |
| "loss": -0.0002, |
| "reward": 6.773137092590332, |
| "reward_std": 0.912096843123436, |
| "rewards/mrr_reward": 0.500465027987957, |
| "rewards/rank_analyze_format_reward": 0.7869019955396652, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.796875, |
| "epoch": 2.7279999999999998, |
| "grad_norm": 0.035187844187021255, |
| "kl": 0.009860992431640625, |
| "learning_rate": 1.9966041268711404e-05, |
| "loss": -0.001, |
| "reward": 6.187240362167358, |
| "reward_std": 1.2494878768920898, |
| "rewards/mrr_reward": 0.3659474216401577, |
| "rewards/rank_analyze_format_reward": 0.8201817274093628, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975329041481018, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9975329041481018, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.015625, |
| "epoch": 2.7359999999999998, |
| "grad_norm": 0.03603256121277809, |
| "kl": 0.009283065795898438, |
| "learning_rate": 1.9965834005798395e-05, |
| "loss": -0.0136, |
| "reward": 5.895509123802185, |
| "reward_std": 0.9243122488260269, |
| "rewards/mrr_reward": 0.2991505488753319, |
| "rewards/rank_analyze_format_reward": 0.7767235189676285, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.671875, |
| "epoch": 2.7439999999999998, |
| "grad_norm": 0.032031431794166565, |
| "kl": 0.007723808288574219, |
| "learning_rate": 1.9965626113388823e-05, |
| "loss": 0.004, |
| "reward": 5.493781566619873, |
| "reward_std": 0.8891884908080101, |
| "rewards/mrr_reward": 0.2125496082007885, |
| "rewards/rank_analyze_format_reward": 0.7606558352708817, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.375, |
| "epoch": 2.752, |
| "grad_norm": 0.03751445189118385, |
| "kl": 0.009563446044921875, |
| "learning_rate": 1.9965417591495813e-05, |
| "loss": -0.0398, |
| "reward": 5.696443438529968, |
| "reward_std": 0.8934725448489189, |
| "rewards/mrr_reward": 0.2595982141792774, |
| "rewards/rank_analyze_format_reward": 0.8279723674058914, |
| "rewards/rank_answer_foramt_reward": 0.876953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.75, |
| "epoch": 2.76, |
| "grad_norm": 0.03237143158912659, |
| "kl": 0.0087738037109375, |
| "learning_rate": 1.9965208440132538e-05, |
| "loss": 0.0013, |
| "reward": 5.7427204847335815, |
| "reward_std": 0.41047997772693634, |
| "rewards/mrr_reward": 0.23925472237169743, |
| "rewards/rank_analyze_format_reward": 0.8612567484378815, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.296875, |
| "epoch": 2.768, |
| "grad_norm": 0.03239751234650612, |
| "kl": 0.00971221923828125, |
| "learning_rate": 1.9964998659312212e-05, |
| "loss": -0.0036, |
| "reward": 6.0654884576797485, |
| "reward_std": 1.099491998553276, |
| "rewards/mrr_reward": 0.3638826832175255, |
| "rewards/rank_analyze_format_reward": 0.7552082240581512, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9810855388641357, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9810855388641357, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.46875, |
| "epoch": 2.776, |
| "grad_norm": 0.032796673476696014, |
| "kl": 0.008184432983398438, |
| "learning_rate": 1.996478824904808e-05, |
| "loss": -0.0104, |
| "reward": 5.7753273248672485, |
| "reward_std": 0.6123448684811592, |
| "rewards/mrr_reward": 0.2482638955116272, |
| "rewards/rank_analyze_format_reward": 0.7992332726716995, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.734375, |
| "epoch": 2.784, |
| "grad_norm": 0.03421541303396225, |
| "kl": 0.009485244750976562, |
| "learning_rate": 1.9964577209353438e-05, |
| "loss": -0.0268, |
| "reward": 5.9183748960494995, |
| "reward_std": 0.7210484445095062, |
| "rewards/mrr_reward": 0.29007937386631966, |
| "rewards/rank_analyze_format_reward": 0.8342294245958328, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.3125, |
| "epoch": 2.792, |
| "grad_norm": 0.037076905369758606, |
| "kl": 0.008688926696777344, |
| "learning_rate": 1.9964365540241614e-05, |
| "loss": -0.0433, |
| "reward": 5.8531190156936646, |
| "reward_std": 0.9352162629365921, |
| "rewards/mrr_reward": 0.3134424611926079, |
| "rewards/rank_analyze_format_reward": 0.6715000495314598, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 628.421875, |
| "epoch": 2.8, |
| "grad_norm": 0.035160817205905914, |
| "kl": 0.009990692138671875, |
| "learning_rate": 1.9964153241725984e-05, |
| "loss": -0.0108, |
| "reward": 6.071021556854248, |
| "reward_std": 0.7491893395781517, |
| "rewards/mrr_reward": 0.34606895968317986, |
| "rewards/rank_analyze_format_reward": 0.7609644383192062, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.671875, |
| "epoch": 2.808, |
| "grad_norm": 0.032391324639320374, |
| "kl": 0.008762359619140625, |
| "learning_rate": 1.996394031381995e-05, |
| "loss": -0.0273, |
| "reward": 5.768019914627075, |
| "reward_std": 0.6652617454528809, |
| "rewards/mrr_reward": 0.24676959216594696, |
| "rewards/rank_analyze_format_reward": 0.8082853406667709, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 667.265625, |
| "epoch": 2.816, |
| "grad_norm": 0.030825432389974594, |
| "kl": 0.008817672729492188, |
| "learning_rate": 1.996372675653696e-05, |
| "loss": -0.0243, |
| "reward": 5.394962310791016, |
| "reward_std": 0.48561568558216095, |
| "rewards/mrr_reward": 0.14479166641831398, |
| "rewards/rank_analyze_format_reward": 0.8704832494258881, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.015625, |
| "epoch": 2.824, |
| "grad_norm": 0.035048164427280426, |
| "kl": 0.008731842041015625, |
| "learning_rate": 1.9963512569890512e-05, |
| "loss": -0.0201, |
| "reward": 5.514656662940979, |
| "reward_std": 0.5665445066988468, |
| "rewards/mrr_reward": 0.19130083918571472, |
| "rewards/rank_analyze_format_reward": 0.8020728975534439, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.859375, |
| "epoch": 2.832, |
| "grad_norm": 0.03681398183107376, |
| "kl": 0.009633064270019531, |
| "learning_rate": 1.9963297753894134e-05, |
| "loss": 0.0131, |
| "reward": 5.816080689430237, |
| "reward_std": 0.7230090275406837, |
| "rewards/mrr_reward": 0.26795635372400284, |
| "rewards/rank_analyze_format_reward": 0.8138361871242523, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974361509084702, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9974361509084702, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 673.0625, |
| "epoch": 2.84, |
| "grad_norm": 0.03574398159980774, |
| "kl": 0.009863853454589844, |
| "learning_rate": 1.9963082308561386e-05, |
| "loss": -0.0167, |
| "reward": 5.45677387714386, |
| "reward_std": 0.7667002454400063, |
| "rewards/mrr_reward": 0.19638517871499062, |
| "rewards/rank_analyze_format_reward": 0.8426849991083145, |
| "rewards/rank_answer_foramt_reward": 0.83203125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.8125, |
| "epoch": 2.848, |
| "grad_norm": 0.03704219311475754, |
| "kl": 0.009098052978515625, |
| "learning_rate": 1.9962866233905887e-05, |
| "loss": -0.0226, |
| "reward": 5.530822277069092, |
| "reward_std": 0.7445577755570412, |
| "rewards/mrr_reward": 0.19813368655741215, |
| "rewards/rank_analyze_format_reward": 0.837896928191185, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.765625, |
| "epoch": 2.856, |
| "grad_norm": 0.034613966941833496, |
| "kl": 0.00957489013671875, |
| "learning_rate": 1.9962649529941283e-05, |
| "loss": 0.0124, |
| "reward": 5.686863660812378, |
| "reward_std": 0.8740081563591957, |
| "rewards/mrr_reward": 0.23311011120676994, |
| "rewards/rank_analyze_format_reward": 0.862964078783989, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.014082618057727814, |
| "rewards/rank_initial_format_reward": 0.9826335161924362, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 648.453125, |
| "epoch": 2.864, |
| "grad_norm": 0.033967334777116776, |
| "kl": 0.00858306884765625, |
| "learning_rate": 1.996243219668126e-05, |
| "loss": -0.0098, |
| "reward": 5.471218466758728, |
| "reward_std": 0.6651558130979538, |
| "rewards/mrr_reward": 0.20168030634522438, |
| "rewards/rank_analyze_format_reward": 0.7719189673662186, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.296875, |
| "epoch": 2.872, |
| "grad_norm": 0.033462896943092346, |
| "kl": 0.0072383880615234375, |
| "learning_rate": 1.996221423413954e-05, |
| "loss": -0.0058, |
| "reward": 6.096756100654602, |
| "reward_std": 0.7008651196956635, |
| "rewards/mrr_reward": 0.3258432671427727, |
| "rewards/rank_analyze_format_reward": 0.8519767969846725, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.25, |
| "epoch": 2.88, |
| "grad_norm": 0.03083069995045662, |
| "kl": 0.008129119873046875, |
| "learning_rate": 1.9961995642329905e-05, |
| "loss": -0.0077, |
| "reward": 5.6521806716918945, |
| "reward_std": 0.3540456146001816, |
| "rewards/mrr_reward": 0.2099454402923584, |
| "rewards/rank_analyze_format_reward": 0.8338831663131714, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.015625, |
| "epoch": 2.888, |
| "grad_norm": 0.03494837507605553, |
| "kl": 0.0089569091796875, |
| "learning_rate": 1.996177642126615e-05, |
| "loss": -0.0372, |
| "reward": 6.048594832420349, |
| "reward_std": 0.5146159902215004, |
| "rewards/mrr_reward": 0.32186879962682724, |
| "rewards/rank_analyze_format_reward": 0.7611195892095566, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.5, |
| "epoch": 2.896, |
| "grad_norm": 0.03123210370540619, |
| "kl": 0.00858306884765625, |
| "learning_rate": 1.996155657096213e-05, |
| "loss": -0.0066, |
| "reward": 5.597678542137146, |
| "reward_std": 0.6386058628559113, |
| "rewards/mrr_reward": 0.20701265148818493, |
| "rewards/rank_analyze_format_reward": 0.8321279138326645, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.078125, |
| "epoch": 2.904, |
| "grad_norm": 0.03348281979560852, |
| "kl": 0.00963592529296875, |
| "learning_rate": 1.9961336091431728e-05, |
| "loss": -0.0201, |
| "reward": 5.798085331916809, |
| "reward_std": 0.7690745741128922, |
| "rewards/mrr_reward": 0.2617683596909046, |
| "rewards/rank_analyze_format_reward": 0.8075376749038696, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.109375, |
| "epoch": 2.912, |
| "grad_norm": 0.03250405564904213, |
| "kl": 0.0079803466796875, |
| "learning_rate": 1.9961114982688868e-05, |
| "loss": -0.0069, |
| "reward": 5.859158277511597, |
| "reward_std": 0.6455894485116005, |
| "rewards/mrr_reward": 0.2464347779750824, |
| "rewards/rank_analyze_format_reward": 0.8734191805124283, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.96875, |
| "epoch": 2.92, |
| "grad_norm": 0.03663971275091171, |
| "kl": 0.010545730590820312, |
| "learning_rate": 1.9960893244747525e-05, |
| "loss": -0.0396, |
| "reward": 5.503406643867493, |
| "reward_std": 0.8078130483627319, |
| "rewards/mrr_reward": 0.20906499400734901, |
| "rewards/rank_analyze_format_reward": 0.7881255447864532, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.03125, |
| "epoch": 2.928, |
| "grad_norm": 0.035040080547332764, |
| "kl": 0.009366989135742188, |
| "learning_rate": 1.9960670877621697e-05, |
| "loss": 0.0398, |
| "reward": 6.2447816133499146, |
| "reward_std": 0.8318488001823425, |
| "rewards/mrr_reward": 0.36858259700238705, |
| "rewards/rank_analyze_format_reward": 0.8759200721979141, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.484375, |
| "epoch": 2.936, |
| "grad_norm": 0.0322866216301918, |
| "kl": 0.009279251098632812, |
| "learning_rate": 1.9960447881325433e-05, |
| "loss": 0.0056, |
| "reward": 5.498760938644409, |
| "reward_std": 0.4926854334771633, |
| "rewards/mrr_reward": 0.18302952125668526, |
| "rewards/rank_analyze_format_reward": 0.8291427195072174, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.96875, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 693.359375, |
| "epoch": 2.944, |
| "grad_norm": 0.031908247619867325, |
| "kl": 0.007828712463378906, |
| "learning_rate": 1.996022425587282e-05, |
| "loss": 0.0028, |
| "reward": 5.857112407684326, |
| "reward_std": 0.5642570108175278, |
| "rewards/mrr_reward": 0.26006944477558136, |
| "rewards/rank_analyze_format_reward": 0.8460166752338409, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.9375, |
| "epoch": 2.952, |
| "grad_norm": 0.033450644463300705, |
| "kl": 0.008747100830078125, |
| "learning_rate": 1.9960000001277985e-05, |
| "loss": 0.0023, |
| "reward": 6.380629658699036, |
| "reward_std": 0.8351171687245369, |
| "rewards/mrr_reward": 0.4134734645485878, |
| "rewards/rank_analyze_format_reward": 0.7850210219621658, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.171875, |
| "epoch": 2.96, |
| "grad_norm": 0.03648662939667702, |
| "kl": 0.009329795837402344, |
| "learning_rate": 1.9959775117555085e-05, |
| "loss": 0.0345, |
| "reward": 6.414909482002258, |
| "reward_std": 0.45721762999892235, |
| "rewards/mrr_reward": 0.41873140074312687, |
| "rewards/rank_analyze_format_reward": 0.7966244220733643, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.609375, |
| "epoch": 2.968, |
| "grad_norm": 0.037536896765232086, |
| "kl": 0.010850906372070312, |
| "learning_rate": 1.995954960471833e-05, |
| "loss": -0.0382, |
| "reward": 5.33310854434967, |
| "reward_std": 0.5616030171513557, |
| "rewards/mrr_reward": 0.16109870746731758, |
| "rewards/rank_analyze_format_reward": 0.8039480000734329, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.15625, |
| "epoch": 2.976, |
| "grad_norm": 0.03529645875096321, |
| "kl": 0.009998321533203125, |
| "learning_rate": 1.995932346278197e-05, |
| "loss": -0.037, |
| "reward": 5.771807551383972, |
| "reward_std": 0.5844480693340302, |
| "rewards/mrr_reward": 0.2539186589419842, |
| "rewards/rank_analyze_format_reward": 0.8147266507148743, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.5625, |
| "epoch": 2.984, |
| "grad_norm": 0.034666452556848526, |
| "kl": 0.01047515869140625, |
| "learning_rate": 1.9959096691760284e-05, |
| "loss": -0.0132, |
| "reward": 6.080157160758972, |
| "reward_std": 0.6419508755207062, |
| "rewards/mrr_reward": 0.34099702909588814, |
| "rewards/rank_analyze_format_reward": 0.8141245543956757, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9978972524404526, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9978972524404526, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.375, |
| "epoch": 2.992, |
| "grad_norm": 0.03392947465181351, |
| "kl": 0.008699417114257812, |
| "learning_rate": 1.995886929166759e-05, |
| "loss": -0.0404, |
| "reward": 5.7819143533706665, |
| "reward_std": 0.47098034992814064, |
| "rewards/mrr_reward": 0.260416679084301, |
| "rewards/rank_analyze_format_reward": 0.8203257471323013, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.75, |
| "epoch": 3.0, |
| "grad_norm": 0.03569958359003067, |
| "kl": 0.009565353393554688, |
| "learning_rate": 1.9958641262518263e-05, |
| "loss": -0.0069, |
| "reward": 5.463603854179382, |
| "reward_std": 0.41404130309820175, |
| "rewards/mrr_reward": 0.17387152649462223, |
| "rewards/rank_analyze_format_reward": 0.832147404551506, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.671875, |
| "epoch": 3.008, |
| "grad_norm": 0.03453488275408745, |
| "kl": 0.008286476135253906, |
| "learning_rate": 3.4816627469912147e-06, |
| "loss": -0.019, |
| "reward": 5.870022416114807, |
| "reward_std": 0.5319755226373672, |
| "rewards/mrr_reward": 0.2689298205077648, |
| "rewards/rank_analyze_format_reward": 0.8235997408628464, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.734375, |
| "epoch": 3.016, |
| "grad_norm": 0.03683853894472122, |
| "kl": 0.009878158569335938, |
| "learning_rate": 3.4341424424704373e-06, |
| "loss": -0.0352, |
| "reward": 6.375778317451477, |
| "reward_std": 0.8507000654935837, |
| "rewards/mrr_reward": 0.4045138992369175, |
| "rewards/rank_analyze_format_reward": 0.8163162767887115, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 674.953125, |
| "epoch": 3.024, |
| "grad_norm": 0.03564409166574478, |
| "kl": 0.009000778198242188, |
| "learning_rate": 3.3868813467634833e-06, |
| "loss": 0.0035, |
| "reward": 6.099611282348633, |
| "reward_std": 0.6811064556241035, |
| "rewards/mrr_reward": 0.326202891767025, |
| "rewards/rank_analyze_format_reward": 0.8689035177230835, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.40625, |
| "epoch": 3.032, |
| "grad_norm": 0.033847253769636154, |
| "kl": 0.009696006774902344, |
| "learning_rate": 3.3398813256574847e-06, |
| "loss": -0.0138, |
| "reward": 5.70564591884613, |
| "reward_std": 0.7488968372344971, |
| "rewards/mrr_reward": 0.2354228664189577, |
| "rewards/rank_analyze_format_reward": 0.814735621213913, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.53125, |
| "epoch": 3.04, |
| "grad_norm": 0.03472684323787689, |
| "kl": 0.009616851806640625, |
| "learning_rate": 3.2931442346328e-06, |
| "loss": 0.005, |
| "reward": 5.8208394050598145, |
| "reward_std": 0.8216791450977325, |
| "rewards/mrr_reward": 0.2550409249961376, |
| "rewards/rank_analyze_format_reward": 0.8592693954706192, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.015625, |
| "epoch": 3.048, |
| "grad_norm": 0.03275707736611366, |
| "kl": 0.009731292724609375, |
| "learning_rate": 3.2466719187897555e-06, |
| "loss": 0.009, |
| "reward": 5.976478338241577, |
| "reward_std": 0.9694596379995346, |
| "rewards/mrr_reward": 0.3182477727532387, |
| "rewards/rank_analyze_format_reward": 0.8373227268457413, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9985119104385376, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9985119104385376, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.234375, |
| "epoch": 3.056, |
| "grad_norm": 0.03374367952346802, |
| "kl": 0.009586334228515625, |
| "learning_rate": 3.200466212775808e-06, |
| "loss": -0.0045, |
| "reward": 5.7755879163742065, |
| "reward_std": 0.5715985968708992, |
| "rewards/mrr_reward": 0.2742931507527828, |
| "rewards/rank_analyze_format_reward": 0.8286910802125931, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.234375, |
| "epoch": 3.064, |
| "grad_norm": 0.03337114676833153, |
| "kl": 0.00981903076171875, |
| "learning_rate": 3.1545289407131128e-06, |
| "loss": 0.0193, |
| "reward": 6.042637348175049, |
| "reward_std": 0.8147040233016014, |
| "rewards/mrr_reward": 0.3159040194004774, |
| "rewards/rank_analyze_format_reward": 0.7925428599119186, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.013822115026414394, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.3125, |
| "epoch": 3.072, |
| "grad_norm": 0.03374440222978592, |
| "kl": 0.0095977783203125, |
| "learning_rate": 3.108861916126518e-06, |
| "loss": -0.0052, |
| "reward": 5.525590181350708, |
| "reward_std": 0.4414802975952625, |
| "rewards/mrr_reward": 0.1869729682803154, |
| "rewards/rank_analyze_format_reward": 0.843795970082283, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.640625, |
| "epoch": 3.08, |
| "grad_norm": 0.03527143970131874, |
| "kl": 0.0100250244140625, |
| "learning_rate": 3.063466941871952e-06, |
| "loss": -0.0349, |
| "reward": 5.4740070104599, |
| "reward_std": 0.39598348736763, |
| "rewards/mrr_reward": 0.17225322499871254, |
| "rewards/rank_analyze_format_reward": 0.8160143941640854, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.03125, |
| "epoch": 3.088, |
| "grad_norm": 0.03699163347482681, |
| "kl": 0.01068115234375, |
| "learning_rate": 3.0183458100652752e-06, |
| "loss": -0.0098, |
| "reward": 5.632686495780945, |
| "reward_std": 0.8547279201447964, |
| "rewards/mrr_reward": 0.21130332723259926, |
| "rewards/rank_analyze_format_reward": 0.836301326751709, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 615.25, |
| "epoch": 3.096, |
| "grad_norm": 0.035420242697000504, |
| "kl": 0.013090133666992188, |
| "learning_rate": 2.9735003020115095e-06, |
| "loss": 0.0072, |
| "reward": 5.7270954847335815, |
| "reward_std": 0.8438360095024109, |
| "rewards/mrr_reward": 0.26177455112338066, |
| "rewards/rank_analyze_format_reward": 0.7755855619907379, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.71875, |
| "epoch": 3.104, |
| "grad_norm": 0.033768534660339355, |
| "kl": 0.008324623107910156, |
| "learning_rate": 2.9289321881345257e-06, |
| "loss": -0.0074, |
| "reward": 5.456307530403137, |
| "reward_std": 0.2819017954170704, |
| "rewards/mrr_reward": 0.1582651287317276, |
| "rewards/rank_analyze_format_reward": 0.8524289280176163, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.921875, |
| "epoch": 3.112, |
| "grad_norm": 0.03795436769723892, |
| "kl": 0.011144638061523438, |
| "learning_rate": 2.884643227907147e-06, |
| "loss": -0.02, |
| "reward": 6.103384494781494, |
| "reward_std": 0.7188520580530167, |
| "rewards/mrr_reward": 0.3517671152949333, |
| "rewards/rank_analyze_format_reward": 0.7562461942434311, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.984375, |
| "epoch": 3.12, |
| "grad_norm": 0.03798002377152443, |
| "kl": 0.009586334228515625, |
| "learning_rate": 2.840635169781688e-06, |
| "loss": 0.0009, |
| "reward": 5.6863319873809814, |
| "reward_std": 0.8139217495918274, |
| "rewards/mrr_reward": 0.24700520560145378, |
| "rewards/rank_analyze_format_reward": 0.7954290956258774, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9973393976688385, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.625, |
| "epoch": 3.128, |
| "grad_norm": 0.03476894274353981, |
| "kl": 0.009393692016601562, |
| "learning_rate": 2.796909751120931e-06, |
| "loss": -0.0152, |
| "reward": 5.828433513641357, |
| "reward_std": 0.6863922253251076, |
| "rewards/mrr_reward": 0.2774987518787384, |
| "rewards/rank_analyze_format_reward": 0.8160946071147919, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 672.328125, |
| "epoch": 3.136, |
| "grad_norm": 0.03496420755982399, |
| "kl": 0.01062774658203125, |
| "learning_rate": 2.7534686981295335e-06, |
| "loss": 0.0069, |
| "reward": 5.767983317375183, |
| "reward_std": 0.6909240707755089, |
| "rewards/mrr_reward": 0.24921875447034836, |
| "rewards/rank_analyze_format_reward": 0.8529097139835358, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9981617629528046, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.734375, |
| "epoch": 3.144, |
| "grad_norm": 0.03465255722403526, |
| "kl": 0.009387969970703125, |
| "learning_rate": 2.7103137257858867e-06, |
| "loss": -0.0177, |
| "reward": 5.525047659873962, |
| "reward_std": 0.7425801493227482, |
| "rewards/mrr_reward": 0.21982267126441002, |
| "rewards/rank_analyze_format_reward": 0.7590381950139999, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.5, |
| "epoch": 3.152, |
| "grad_norm": 0.03880779445171356, |
| "kl": 0.012248992919921875, |
| "learning_rate": 2.667446537774402e-06, |
| "loss": -0.0006, |
| "reward": 5.65397036075592, |
| "reward_std": 0.5698041021823883, |
| "rewards/mrr_reward": 0.22565104067325592, |
| "rewards/rank_analyze_format_reward": 0.786522388458252, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.9375, |
| "epoch": 3.16, |
| "grad_norm": 0.03340061381459236, |
| "kl": 0.009691238403320312, |
| "learning_rate": 2.624868826418262e-06, |
| "loss": 0.0072, |
| "reward": 6.097415328025818, |
| "reward_std": 0.7755601853132248, |
| "rewards/mrr_reward": 0.330853171646595, |
| "rewards/rank_analyze_format_reward": 0.8189245611429214, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.46875, |
| "epoch": 3.168, |
| "grad_norm": 0.037465766072273254, |
| "kl": 0.009607315063476562, |
| "learning_rate": 2.5825822726126095e-06, |
| "loss": -0.0024, |
| "reward": 5.320816397666931, |
| "reward_std": 0.679179236292839, |
| "rewards/mrr_reward": 0.17338790372014046, |
| "rewards/rank_analyze_format_reward": 0.7400810569524765, |
| "rewards/rank_answer_foramt_reward": 0.912109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9992559552192688, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.21875, |
| "epoch": 3.176, |
| "grad_norm": 0.033823512494564056, |
| "kl": 0.008966445922851562, |
| "learning_rate": 2.5405885457581793e-06, |
| "loss": -0.0125, |
| "reward": 6.29050076007843, |
| "reward_std": 0.8139433264732361, |
| "rewards/mrr_reward": 0.3840401843190193, |
| "rewards/rank_analyze_format_reward": 0.806766077876091, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.34375, |
| "epoch": 3.184, |
| "grad_norm": 0.034099679440259933, |
| "kl": 0.009246826171875, |
| "learning_rate": 2.4988893036954045e-06, |
| "loss": -0.0128, |
| "reward": 5.263689160346985, |
| "reward_std": 0.5176863595843315, |
| "rewards/mrr_reward": 0.13877107948064804, |
| "rewards/rank_analyze_format_reward": 0.8295836299657822, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9678308814764023, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 622.40625, |
| "epoch": 3.192, |
| "grad_norm": 0.03965727239847183, |
| "kl": 0.00982666015625, |
| "learning_rate": 2.4574861926389615e-06, |
| "loss": 0.0202, |
| "reward": 5.415614366531372, |
| "reward_std": 0.5104451552033424, |
| "rewards/mrr_reward": 0.1703559048473835, |
| "rewards/rank_analyze_format_reward": 0.8391944617033005, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 675.515625, |
| "epoch": 3.2, |
| "grad_norm": 0.03522520139813423, |
| "kl": 0.009145736694335938, |
| "learning_rate": 2.4163808471127815e-06, |
| "loss": 0.0108, |
| "reward": 6.274830937385559, |
| "reward_std": 0.9628144055604935, |
| "rewards/mrr_reward": 0.3546936884522438, |
| "rewards/rank_analyze_format_reward": 0.8702747970819473, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.013124999590218067, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.734375, |
| "epoch": 3.208, |
| "grad_norm": 0.03297824412584305, |
| "kl": 0.010050773620605469, |
| "learning_rate": 2.37557488988552e-06, |
| "loss": -0.0181, |
| "reward": 5.695779204368591, |
| "reward_std": 0.6984521001577377, |
| "rewards/mrr_reward": 0.23389137163758278, |
| "rewards/rank_analyze_format_reward": 0.8285732418298721, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 611.90625, |
| "epoch": 3.216, |
| "grad_norm": 0.03468155115842819, |
| "kl": 0.011409759521484375, |
| "learning_rate": 2.335069931906503e-06, |
| "loss": 0.016, |
| "reward": 5.605130910873413, |
| "reward_std": 0.5643002241849899, |
| "rewards/mrr_reward": 0.23312251828610897, |
| "rewards/rank_analyze_format_reward": 0.7599145472049713, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 652.015625, |
| "epoch": 3.224, |
| "grad_norm": 0.03274522349238396, |
| "kl": 0.0077266693115234375, |
| "learning_rate": 2.2948675722421086e-06, |
| "loss": -0.0008, |
| "reward": 5.618125796318054, |
| "reward_std": 0.4926639534533024, |
| "rewards/mrr_reward": 0.2079303190112114, |
| "rewards/rank_analyze_format_reward": 0.8157014697790146, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.703125, |
| "epoch": 3.232, |
| "grad_norm": 0.032266825437545776, |
| "kl": 0.008646011352539062, |
| "learning_rate": 2.254969398012663e-06, |
| "loss": 0.0062, |
| "reward": 5.917828798294067, |
| "reward_std": 1.038107082247734, |
| "rewards/mrr_reward": 0.31374628096818924, |
| "rewards/rank_analyze_format_reward": 0.723275676369667, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 685.0, |
| "epoch": 3.24, |
| "grad_norm": 0.033160846680402756, |
| "kl": 0.007889747619628906, |
| "learning_rate": 2.215376984329767e-06, |
| "loss": 0.0108, |
| "reward": 5.611984491348267, |
| "reward_std": 0.5307941734790802, |
| "rewards/mrr_reward": 0.20152530074119568, |
| "rewards/rank_analyze_format_reward": 0.8371334373950958, |
| "rewards/rank_answer_foramt_reward": 0.96875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.421875, |
| "epoch": 3.248, |
| "grad_norm": 0.032735127955675125, |
| "kl": 0.009138107299804688, |
| "learning_rate": 2.1760918942341193e-06, |
| "loss": -0.018, |
| "reward": 5.920067191123962, |
| "reward_std": 0.4110058397054672, |
| "rewards/mrr_reward": 0.28723959624767303, |
| "rewards/rank_analyze_format_reward": 0.8022439330816269, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.703125, |
| "epoch": 3.2560000000000002, |
| "grad_norm": 0.0345609113574028, |
| "kl": 0.008047103881835938, |
| "learning_rate": 2.1371156786338108e-06, |
| "loss": -0.0177, |
| "reward": 6.009241461753845, |
| "reward_std": 0.7121211290359497, |
| "rewards/mrr_reward": 0.3027033731341362, |
| "rewards/rank_analyze_format_reward": 0.8433498591184616, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.953125, |
| "epoch": 3.2640000000000002, |
| "grad_norm": 0.034407492727041245, |
| "kl": 0.009765625, |
| "learning_rate": 2.098449876243096e-06, |
| "loss": -0.0046, |
| "reward": 5.621447324752808, |
| "reward_std": 0.7794221378862858, |
| "rewards/mrr_reward": 0.22875124216079712, |
| "rewards/rank_analyze_format_reward": 0.8310191482305527, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 588.671875, |
| "epoch": 3.2720000000000002, |
| "grad_norm": 0.03622359409928322, |
| "kl": 0.008343696594238281, |
| "learning_rate": 2.0600960135216463e-06, |
| "loss": -0.0396, |
| "reward": 5.996233105659485, |
| "reward_std": 0.6990708820521832, |
| "rewards/mrr_reward": 0.3586743548512459, |
| "rewards/rank_analyze_format_reward": 0.678608126938343, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.015625, |
| "epoch": 3.2800000000000002, |
| "grad_norm": 0.03268953040242195, |
| "kl": 0.009521484375, |
| "learning_rate": 2.022055604614289e-06, |
| "loss": -0.0041, |
| "reward": 5.610453367233276, |
| "reward_std": 0.608606144785881, |
| "rewards/mrr_reward": 0.21897321939468384, |
| "rewards/rank_analyze_format_reward": 0.7794823199510574, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.40625, |
| "epoch": 3.288, |
| "grad_norm": 0.03196902945637703, |
| "kl": 0.009124755859375, |
| "learning_rate": 1.984330151291233e-06, |
| "loss": -0.0202, |
| "reward": 5.701419472694397, |
| "reward_std": 0.3155892379581928, |
| "rewards/mrr_reward": 0.2233320865780115, |
| "rewards/rank_analyze_format_reward": 0.8373879790306091, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 612.28125, |
| "epoch": 3.296, |
| "grad_norm": 0.03455911576747894, |
| "kl": 0.010396957397460938, |
| "learning_rate": 1.9469211428887813e-06, |
| "loss": -0.0327, |
| "reward": 5.262094497680664, |
| "reward_std": 0.5627113878726959, |
| "rewards/mrr_reward": 0.15016741678118706, |
| "rewards/rank_analyze_format_reward": 0.7412733286619186, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9981617629528046, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9825367629528046, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 668.09375, |
| "epoch": 3.304, |
| "grad_norm": 0.0337919220328331, |
| "kl": 0.00904083251953125, |
| "learning_rate": 1.9098300562505266e-06, |
| "loss": -0.0192, |
| "reward": 5.6808494329452515, |
| "reward_std": 0.5475155636668205, |
| "rewards/mrr_reward": 0.21426091715693474, |
| "rewards/rank_analyze_format_reward": 0.8667744994163513, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 603.0625, |
| "epoch": 3.312, |
| "grad_norm": 0.039892613887786865, |
| "kl": 0.009840965270996094, |
| "learning_rate": 1.8730583556690607e-06, |
| "loss": 0.002, |
| "reward": 5.8083416223526, |
| "reward_std": 1.1347919255495071, |
| "rewards/mrr_reward": 0.2994481772184372, |
| "rewards/rank_analyze_format_reward": 0.7936044484376907, |
| "rewards/rank_answer_foramt_reward": 0.861328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9973393976688385, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9817143976688385, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.953125, |
| "epoch": 3.32, |
| "grad_norm": 0.03493841364979744, |
| "kl": 0.010486602783203125, |
| "learning_rate": 1.8366074928281608e-06, |
| "loss": -0.0211, |
| "reward": 6.134096622467041, |
| "reward_std": 1.0277494341135025, |
| "rewards/mrr_reward": 0.3545076847076416, |
| "rewards/rank_analyze_format_reward": 0.8130272477865219, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9974177181720734, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9974177181720734, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.890625, |
| "epoch": 3.328, |
| "grad_norm": 0.034219007939100266, |
| "kl": 0.008632659912109375, |
| "learning_rate": 1.8004789067454763e-06, |
| "loss": -0.0222, |
| "reward": 6.18049168586731, |
| "reward_std": 0.5877049472182989, |
| "rewards/mrr_reward": 0.36963665671646595, |
| "rewards/rank_analyze_format_reward": 0.7174552381038666, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.578125, |
| "epoch": 3.336, |
| "grad_norm": 0.03582116961479187, |
| "kl": 0.010225296020507812, |
| "learning_rate": 1.7646740237157256e-06, |
| "loss": -0.017, |
| "reward": 5.773987531661987, |
| "reward_std": 0.6770635172724724, |
| "rewards/mrr_reward": 0.25188492238521576, |
| "rewards/rank_analyze_format_reward": 0.8191821128129959, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 617.609375, |
| "epoch": 3.344, |
| "grad_norm": 0.03502979502081871, |
| "kl": 0.010501861572265625, |
| "learning_rate": 1.7291942572543806e-06, |
| "loss": -0.0141, |
| "reward": 6.250616192817688, |
| "reward_std": 0.9646867886185646, |
| "rewards/mrr_reward": 0.3685888033360243, |
| "rewards/rank_analyze_format_reward": 0.8325932174921036, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.5, |
| "epoch": 3.352, |
| "grad_norm": 0.03461702913045883, |
| "kl": 0.009029388427734375, |
| "learning_rate": 1.6940410080418723e-06, |
| "loss": 0.002, |
| "reward": 5.604981422424316, |
| "reward_std": 0.6194628737866879, |
| "rewards/mrr_reward": 0.21886160969734192, |
| "rewards/rank_analyze_format_reward": 0.7708402574062347, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.013382176868617535, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.5625, |
| "epoch": 3.36, |
| "grad_norm": 0.14745499193668365, |
| "kl": 0.036579132080078125, |
| "learning_rate": 1.6592156638682887e-06, |
| "loss": -0.011, |
| "reward": 5.543843626976013, |
| "reward_std": 0.6722467541694641, |
| "rewards/mrr_reward": 0.22320189327001572, |
| "rewards/rank_analyze_format_reward": 0.7316861301660538, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.013099747709929943, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.125, |
| "epoch": 3.368, |
| "grad_norm": 0.03604007884860039, |
| "kl": 0.009619712829589844, |
| "learning_rate": 1.6247195995785836e-06, |
| "loss": -0.0095, |
| "reward": 5.799249887466431, |
| "reward_std": 0.9532105177640915, |
| "rewards/mrr_reward": 0.2738591395318508, |
| "rewards/rank_analyze_format_reward": 0.8365109711885452, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.09375, |
| "epoch": 3.376, |
| "grad_norm": 0.035499464720487595, |
| "kl": 0.012187957763671875, |
| "learning_rate": 1.5905541770183096e-06, |
| "loss": 0.007, |
| "reward": 5.608644723892212, |
| "reward_std": 0.8124164063483477, |
| "rewards/mrr_reward": 0.23054935038089752, |
| "rewards/rank_analyze_format_reward": 0.8543011993169785, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9453125, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.90625, |
| "epoch": 3.384, |
| "grad_norm": 0.031818851828575134, |
| "kl": 0.008767127990722656, |
| "learning_rate": 1.5567207449798517e-06, |
| "loss": -0.0073, |
| "reward": 5.9224079847335815, |
| "reward_std": 1.0137402415275574, |
| "rewards/mrr_reward": 0.3127914294600487, |
| "rewards/rank_analyze_format_reward": 0.8292285054922104, |
| "rewards/rank_answer_foramt_reward": 0.953125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9678819477558136, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9678819477558136, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.125, |
| "epoch": 3.392, |
| "grad_norm": 0.032386232167482376, |
| "kl": 0.009622573852539062, |
| "learning_rate": 1.52322063914917e-06, |
| "loss": -0.0083, |
| "reward": 5.772087574005127, |
| "reward_std": 1.0963159650564194, |
| "rewards/mrr_reward": 0.2973834425210953, |
| "rewards/rank_analyze_format_reward": 0.7504077702760696, |
| "rewards/rank_answer_foramt_reward": 0.873046875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.953125, |
| "epoch": 3.4, |
| "grad_norm": 0.03259604051709175, |
| "kl": 0.008920669555664062, |
| "learning_rate": 1.490055182053083e-06, |
| "loss": -0.0153, |
| "reward": 6.132978916168213, |
| "reward_std": 0.8273980095982552, |
| "rewards/mrr_reward": 0.349144347012043, |
| "rewards/rank_analyze_format_reward": 0.7989014983177185, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.078125, |
| "epoch": 3.408, |
| "grad_norm": 0.034432653337717056, |
| "kl": 0.008844375610351562, |
| "learning_rate": 1.4572256830070497e-06, |
| "loss": 0.0307, |
| "reward": 5.3963083028793335, |
| "reward_std": 0.7059964388608932, |
| "rewards/mrr_reward": 0.17215402238070965, |
| "rewards/rank_analyze_format_reward": 0.8245711624622345, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.421875, |
| "epoch": 3.416, |
| "grad_norm": 0.03640694543719292, |
| "kl": 0.009423255920410156, |
| "learning_rate": 1.4247334380634792e-06, |
| "loss": -0.0005, |
| "reward": 5.745667219161987, |
| "reward_std": 1.3311158269643784, |
| "rewards/mrr_reward": 0.26646826043725014, |
| "rewards/rank_analyze_format_reward": 0.875150740146637, |
| "rewards/rank_answer_foramt_reward": 0.794921875, |
| "rewards/rank_contrast_format_reward": 0.015236318111419678, |
| "rewards/rank_initial_format_reward": 0.9972426444292068, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9972426444292068, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.109375, |
| "epoch": 3.424, |
| "grad_norm": 0.034119006246328354, |
| "kl": 0.008829116821289062, |
| "learning_rate": 1.3925797299605649e-06, |
| "loss": -0.0087, |
| "reward": 5.416736364364624, |
| "reward_std": 0.5500404462218285, |
| "rewards/mrr_reward": 0.16524678096175194, |
| "rewards/rank_analyze_format_reward": 0.8143431395292282, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.46875, |
| "epoch": 3.432, |
| "grad_norm": 0.03242919594049454, |
| "kl": 0.008722305297851562, |
| "learning_rate": 1.3607658280716474e-06, |
| "loss": -0.0019, |
| "reward": 5.716560482978821, |
| "reward_std": 0.5728246569633484, |
| "rewards/mrr_reward": 0.23904389142990112, |
| "rewards/rank_analyze_format_reward": 0.8736661523580551, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 626.25, |
| "epoch": 3.44, |
| "grad_norm": 0.03218907490372658, |
| "kl": 0.009031295776367188, |
| "learning_rate": 1.3292929883550998e-06, |
| "loss": -0.006, |
| "reward": 6.318280220031738, |
| "reward_std": 0.6036234200000763, |
| "rewards/mrr_reward": 0.38396577537059784, |
| "rewards/rank_analyze_format_reward": 0.8253858089447021, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 670.171875, |
| "epoch": 3.448, |
| "grad_norm": 0.03488701581954956, |
| "kl": 0.009675979614257812, |
| "learning_rate": 1.2981624533047432e-06, |
| "loss": -0.0275, |
| "reward": 5.669755578041077, |
| "reward_std": 1.1037492379546165, |
| "rewards/mrr_reward": 0.25412946194410324, |
| "rewards/rank_analyze_format_reward": 0.7779058814048767, |
| "rewards/rank_answer_foramt_reward": 0.88671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982128292322159, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982128292322159, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 649.71875, |
| "epoch": 3.456, |
| "grad_norm": 0.03478289395570755, |
| "kl": 0.009863853454589844, |
| "learning_rate": 1.2673754519008008e-06, |
| "loss": -0.0365, |
| "reward": 5.450997948646545, |
| "reward_std": 0.6340261902660131, |
| "rewards/mrr_reward": 0.1804935522377491, |
| "rewards/rank_analyze_format_reward": 0.8247267752885818, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.875, |
| "epoch": 3.464, |
| "grad_norm": 0.03279885649681091, |
| "kl": 0.0091705322265625, |
| "learning_rate": 1.2369331995613664e-06, |
| "loss": 0.0036, |
| "reward": 5.708705902099609, |
| "reward_std": 0.6798514872789383, |
| "rewards/mrr_reward": 0.24531250447034836, |
| "rewards/rank_analyze_format_reward": 0.7816782742738724, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9992559552192688, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9836309552192688, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.09375, |
| "epoch": 3.472, |
| "grad_norm": 0.03269350156188011, |
| "kl": 0.00931549072265625, |
| "learning_rate": 1.206836898094439e-06, |
| "loss": 0.0103, |
| "reward": 6.359462022781372, |
| "reward_std": 0.8851971626281738, |
| "rewards/mrr_reward": 0.3808903694152832, |
| "rewards/rank_analyze_format_reward": 0.8730098009109497, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.71875, |
| "epoch": 3.48, |
| "grad_norm": 0.03501614183187485, |
| "kl": 0.010263442993164062, |
| "learning_rate": 1.1770877356504684e-06, |
| "loss": 0.0062, |
| "reward": 6.128593564033508, |
| "reward_std": 0.9365501217544079, |
| "rewards/mrr_reward": 0.361879987642169, |
| "rewards/rank_analyze_format_reward": 0.7884955406188965, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.265625, |
| "epoch": 3.488, |
| "grad_norm": 0.033260658383369446, |
| "kl": 0.008501052856445312, |
| "learning_rate": 1.1476868866754488e-06, |
| "loss": -0.0187, |
| "reward": 5.864734411239624, |
| "reward_std": 0.4621109887957573, |
| "rewards/mrr_reward": 0.25895338132977486, |
| "rewards/rank_analyze_format_reward": 0.8562646806240082, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.625, |
| "epoch": 3.496, |
| "grad_norm": 0.03378913179039955, |
| "kl": 0.009352684020996094, |
| "learning_rate": 1.1186355118645552e-06, |
| "loss": -0.0349, |
| "reward": 5.749386191368103, |
| "reward_std": 0.3994421735405922, |
| "rewards/mrr_reward": 0.2401475664228201, |
| "rewards/rank_analyze_format_reward": 0.8180928528308868, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 682.265625, |
| "epoch": 3.504, |
| "grad_norm": 0.03421083465218544, |
| "kl": 0.009157180786132812, |
| "learning_rate": 1.0899347581163222e-06, |
| "loss": -0.005, |
| "reward": 5.491220116615295, |
| "reward_std": 0.40433138608932495, |
| "rewards/mrr_reward": 0.15951761417090893, |
| "rewards/rank_analyze_format_reward": 0.8843995481729507, |
| "rewards/rank_answer_foramt_reward": 0.984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.046875, |
| "epoch": 3.512, |
| "grad_norm": 0.03325556218624115, |
| "kl": 0.008181571960449219, |
| "learning_rate": 1.0615857584873624e-06, |
| "loss": 0.0038, |
| "reward": 5.652897953987122, |
| "reward_std": 0.42538975179195404, |
| "rewards/mrr_reward": 0.2011718824505806, |
| "rewards/rank_analyze_format_reward": 0.8716480582952499, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 664.34375, |
| "epoch": 3.52, |
| "grad_norm": 0.03412213921546936, |
| "kl": 0.009923934936523438, |
| "learning_rate": 1.0335896321476413e-06, |
| "loss": 0.0084, |
| "reward": 5.463203430175781, |
| "reward_std": 0.47837162390351295, |
| "rewards/mrr_reward": 0.17101315408945084, |
| "rewards/rank_analyze_format_reward": 0.8377447873353958, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 665.5625, |
| "epoch": 3.528, |
| "grad_norm": 0.033871665596961975, |
| "kl": 0.008985519409179688, |
| "learning_rate": 1.0059474843362893e-06, |
| "loss": -0.0253, |
| "reward": 5.863033652305603, |
| "reward_std": 0.856530025601387, |
| "rewards/mrr_reward": 0.2802455462515354, |
| "rewards/rank_analyze_format_reward": 0.8234658539295197, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9983552694320679, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.828125, |
| "epoch": 3.536, |
| "grad_norm": 0.03464524820446968, |
| "kl": 0.010242462158203125, |
| "learning_rate": 9.786604063179728e-07, |
| "loss": -0.0112, |
| "reward": 6.178846478462219, |
| "reward_std": 0.7333296239376068, |
| "rewards/mrr_reward": 0.35404886677861214, |
| "rewards/rank_analyze_format_reward": 0.8036665320396423, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.484375, |
| "epoch": 3.544, |
| "grad_norm": 0.039022162556648254, |
| "kl": 0.010075569152832031, |
| "learning_rate": 9.517294753398066e-07, |
| "loss": -0.0103, |
| "reward": 6.17569887638092, |
| "reward_std": 0.805585939437151, |
| "rewards/mrr_reward": 0.3522135466337204, |
| "rewards/rank_analyze_format_reward": 0.8039542138576508, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.3125, |
| "epoch": 3.552, |
| "grad_norm": 0.03630434721708298, |
| "kl": 0.011911392211914062, |
| "learning_rate": 9.251557545888312e-07, |
| "loss": 0.0073, |
| "reward": 5.2746394872665405, |
| "reward_std": 0.6974633485078812, |
| "rewards/mrr_reward": 0.142398314550519, |
| "rewards/rank_analyze_format_reward": 0.8372418582439423, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983552694320679, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9827302694320679, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 614.171875, |
| "epoch": 3.56, |
| "grad_norm": 0.03797624632716179, |
| "kl": 0.009185791015625, |
| "learning_rate": 8.989402931500434e-07, |
| "loss": -0.0257, |
| "reward": 5.71190345287323, |
| "reward_std": 0.7688554152846336, |
| "rewards/mrr_reward": 0.2587921619415283, |
| "rewards/rank_analyze_format_reward": 0.7587659955024719, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.84375, |
| "epoch": 3.568, |
| "grad_norm": 0.036750566214323044, |
| "kl": 0.009357452392578125, |
| "learning_rate": 8.730841259649725e-07, |
| "loss": 0.0165, |
| "reward": 6.172403573989868, |
| "reward_std": 0.5994044467806816, |
| "rewards/mrr_reward": 0.3739459365606308, |
| "rewards/rank_analyze_format_reward": 0.7410728335380554, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 661.359375, |
| "epoch": 3.576, |
| "grad_norm": 0.030956851318478584, |
| "kl": 0.009416580200195312, |
| "learning_rate": 8.475882737908248e-07, |
| "loss": -0.0069, |
| "reward": 5.37591028213501, |
| "reward_std": 0.33451657742261887, |
| "rewards/mrr_reward": 0.1444692499935627, |
| "rewards/rank_analyze_format_reward": 0.8814376294612885, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9827118366956711, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.265625, |
| "epoch": 3.584, |
| "grad_norm": 0.03611503168940544, |
| "kl": 0.010535240173339844, |
| "learning_rate": 8.224537431601886e-07, |
| "loss": 0.0162, |
| "reward": 5.798620223999023, |
| "reward_std": 0.4908381961286068, |
| "rewards/mrr_reward": 0.23276909813284874, |
| "rewards/rank_analyze_format_reward": 0.8812157958745956, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.671875, |
| "epoch": 3.592, |
| "grad_norm": 0.03399351239204407, |
| "kl": 0.009492874145507812, |
| "learning_rate": 7.976815263412963e-07, |
| "loss": -0.0118, |
| "reward": 6.341515421867371, |
| "reward_std": 0.6863338127732277, |
| "rewards/mrr_reward": 0.3844680190086365, |
| "rewards/rank_analyze_format_reward": 0.8559543788433075, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 618.03125, |
| "epoch": 3.6, |
| "grad_norm": 0.039749711751937866, |
| "kl": 0.009737014770507812, |
| "learning_rate": 7.732726012988512e-07, |
| "loss": -0.0146, |
| "reward": 5.313909411430359, |
| "reward_std": 0.4731576666235924, |
| "rewards/mrr_reward": 0.13981274887919426, |
| "rewards/rank_analyze_format_reward": 0.8499381393194199, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9982585161924362, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9982585161924362, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 629.46875, |
| "epoch": 3.608, |
| "grad_norm": 0.03814476728439331, |
| "kl": 0.009777069091796875, |
| "learning_rate": 7.492279316554207e-07, |
| "loss": 0.002, |
| "reward": 5.441387295722961, |
| "reward_std": 0.964412122964859, |
| "rewards/mrr_reward": 0.2072792761027813, |
| "rewards/rank_analyze_format_reward": 0.7820773273706436, |
| "rewards/rank_answer_foramt_reward": 0.90234375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 621.78125, |
| "epoch": 3.616, |
| "grad_norm": 0.03884231299161911, |
| "kl": 0.010179519653320312, |
| "learning_rate": 7.255484666533874e-07, |
| "loss": -0.0293, |
| "reward": 6.066041827201843, |
| "reward_std": 1.1558443158864975, |
| "rewards/mrr_reward": 0.35128968954086304, |
| "rewards/rank_analyze_format_reward": 0.7974868565797806, |
| "rewards/rank_answer_foramt_reward": 0.888671875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.5625, |
| "epoch": 3.624, |
| "grad_norm": 0.035573799163103104, |
| "kl": 0.009339332580566406, |
| "learning_rate": 7.022351411174866e-07, |
| "loss": 0.0195, |
| "reward": 5.562433242797852, |
| "reward_std": 0.8050966486334801, |
| "rewards/mrr_reward": 0.21009425073862076, |
| "rewards/rank_analyze_format_reward": 0.816357433795929, |
| "rewards/rank_answer_foramt_reward": 0.916015625, |
| "rewards/rank_contrast_format_reward": 0.010822821408510208, |
| "rewards/rank_initial_format_reward": 0.9972426444292068, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9972426444292068, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 606.40625, |
| "epoch": 3.632, |
| "grad_norm": 0.03481682762503624, |
| "kl": 0.008967399597167969, |
| "learning_rate": 6.792888754178906e-07, |
| "loss": 0.0103, |
| "reward": 5.574246048927307, |
| "reward_std": 0.6400253660976887, |
| "rewards/mrr_reward": 0.20182291604578495, |
| "rewards/rank_analyze_format_reward": 0.8294544816017151, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.28125, |
| "epoch": 3.64, |
| "grad_norm": 0.031867899000644684, |
| "kl": 0.009153366088867188, |
| "learning_rate": 6.567105754338798e-07, |
| "loss": -0.0139, |
| "reward": 5.777210593223572, |
| "reward_std": 0.6435952112078667, |
| "rewards/mrr_reward": 0.2407862152904272, |
| "rewards/rank_analyze_format_reward": 0.8998882919549942, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.46875, |
| "epoch": 3.648, |
| "grad_norm": 0.03578875586390495, |
| "kl": 0.009320259094238281, |
| "learning_rate": 6.345011325180772e-07, |
| "loss": 0.0063, |
| "reward": 5.383034586906433, |
| "reward_std": 0.5498589277267456, |
| "rewards/mrr_reward": 0.15843254141509533, |
| "rewards/rank_analyze_format_reward": 0.882116824388504, |
| "rewards/rank_answer_foramt_reward": 0.890625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 663.125, |
| "epoch": 3.656, |
| "grad_norm": 0.03667246922850609, |
| "kl": 0.009763717651367188, |
| "learning_rate": 6.126614234612593e-07, |
| "loss": -0.0018, |
| "reward": 5.633982062339783, |
| "reward_std": 0.5885076597332954, |
| "rewards/mrr_reward": 0.20900297909975052, |
| "rewards/rank_analyze_format_reward": 0.8507044613361359, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.265625, |
| "epoch": 3.664, |
| "grad_norm": 0.03183294087648392, |
| "kl": 0.009653091430664062, |
| "learning_rate": 5.911923104577455e-07, |
| "loss": -0.03, |
| "reward": 6.144891262054443, |
| "reward_std": 0.7887123003602028, |
| "rewards/mrr_reward": 0.35075025632977486, |
| "rewards/rank_analyze_format_reward": 0.784550666809082, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.640625, |
| "epoch": 3.672, |
| "grad_norm": 0.03396729752421379, |
| "kl": 0.009286880493164062, |
| "learning_rate": 5.700946410713548e-07, |
| "loss": -0.0222, |
| "reward": 5.685562252998352, |
| "reward_std": 0.6213907264173031, |
| "rewards/mrr_reward": 0.24081102386116982, |
| "rewards/rank_analyze_format_reward": 0.7749374657869339, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 633.984375, |
| "epoch": 3.68, |
| "grad_norm": 0.03488519787788391, |
| "kl": 0.008077621459960938, |
| "learning_rate": 5.49369248201953e-07, |
| "loss": -0.0312, |
| "reward": 5.230130910873413, |
| "reward_std": 0.3854878172278404, |
| "rewards/mrr_reward": 0.1375558041036129, |
| "rewards/rank_analyze_format_reward": 0.7379215955734253, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9983368366956711, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9983368366956711, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.0625, |
| "epoch": 3.6879999999999997, |
| "grad_norm": 0.035259000957012177, |
| "kl": 0.009763717651367188, |
| "learning_rate": 5.290169500525577e-07, |
| "loss": -0.0113, |
| "reward": 5.952216863632202, |
| "reward_std": 0.7991086803376675, |
| "rewards/mrr_reward": 0.3176587447524071, |
| "rewards/rank_analyze_format_reward": 0.7538475692272186, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.921875, |
| "epoch": 3.6959999999999997, |
| "grad_norm": 0.03507812321186066, |
| "kl": 0.008602142333984375, |
| "learning_rate": 5.090385500970551e-07, |
| "loss": -0.0063, |
| "reward": 5.87649405002594, |
| "reward_std": 0.4639568105340004, |
| "rewards/mrr_reward": 0.2718812022358179, |
| "rewards/rank_analyze_format_reward": 0.8397503942251205, |
| "rewards/rank_answer_foramt_reward": 0.95703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 643.6875, |
| "epoch": 3.7039999999999997, |
| "grad_norm": 0.033048536628484726, |
| "kl": 0.008604049682617188, |
| "learning_rate": 4.894348370484648e-07, |
| "loss": 0.0067, |
| "reward": 5.392473220825195, |
| "reward_std": 0.3769769836217165, |
| "rewards/mrr_reward": 0.15872395411133766, |
| "rewards/rank_analyze_format_reward": 0.8239836394786835, |
| "rewards/rank_answer_foramt_reward": 0.94140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.796875, |
| "epoch": 3.7119999999999997, |
| "grad_norm": 0.0345633290708065, |
| "kl": 0.008055686950683594, |
| "learning_rate": 4.702065848278126e-07, |
| "loss": -0.0091, |
| "reward": 5.816651225090027, |
| "reward_std": 0.8016383498907089, |
| "rewards/mrr_reward": 0.27346230298280716, |
| "rewards/rank_analyze_format_reward": 0.822411373257637, |
| "rewards/rank_answer_foramt_reward": 0.900390625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.140625, |
| "epoch": 3.7199999999999998, |
| "grad_norm": 0.03394823893904686, |
| "kl": 0.0093536376953125, |
| "learning_rate": 4.5135455253357053e-07, |
| "loss": 0.0025, |
| "reward": 5.927626967430115, |
| "reward_std": 0.6637560278177261, |
| "rewards/mrr_reward": 0.29126983508467674, |
| "rewards/rank_analyze_format_reward": 0.7999017089605331, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.009406094439327717, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.921875, |
| "epoch": 3.7279999999999998, |
| "grad_norm": 0.03364517539739609, |
| "kl": 0.009876251220703125, |
| "learning_rate": 4.3287948441169457e-07, |
| "loss": -0.0222, |
| "reward": 5.698633670806885, |
| "reward_std": 0.6399536728858948, |
| "rewards/mrr_reward": 0.2584015466272831, |
| "rewards/rank_analyze_format_reward": 0.760730504989624, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 677.609375, |
| "epoch": 3.7359999999999998, |
| "grad_norm": 0.033980004489421844, |
| "kl": 0.009052276611328125, |
| "learning_rate": 4.1478210982624055e-07, |
| "loss": -0.0266, |
| "reward": 5.83451247215271, |
| "reward_std": 0.7994016855955124, |
| "rewards/mrr_reward": 0.2674727253615856, |
| "rewards/rank_analyze_format_reward": 0.8564186096191406, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 632.203125, |
| "epoch": 3.7439999999999998, |
| "grad_norm": 0.03298342972993851, |
| "kl": 0.010210037231445312, |
| "learning_rate": 3.9706314323056936e-07, |
| "loss": -0.0358, |
| "reward": 5.20824921131134, |
| "reward_std": 0.5714588239789009, |
| "rewards/mrr_reward": 0.13792782835662365, |
| "rewards/rank_analyze_format_reward": 0.7483347654342651, |
| "rewards/rank_answer_foramt_reward": 0.955078125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.984375, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.03125, |
| "epoch": 3.752, |
| "grad_norm": 0.03382823243737221, |
| "kl": 0.008636474609375, |
| "learning_rate": 3.7972328413914074e-07, |
| "loss": 0.0041, |
| "reward": 5.7235270738601685, |
| "reward_std": 0.36125198751688004, |
| "rewards/mrr_reward": 0.220572916790843, |
| "rewards/rank_analyze_format_reward": 0.8685792237520218, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.21875, |
| "epoch": 3.76, |
| "grad_norm": 0.03393082320690155, |
| "kl": 0.0091400146484375, |
| "learning_rate": 3.627632170999029e-07, |
| "loss": -0.0087, |
| "reward": 5.6611692905426025, |
| "reward_std": 0.46365745551884174, |
| "rewards/mrr_reward": 0.20757068321108818, |
| "rewards/rank_analyze_format_reward": 0.8445583134889603, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.671875, |
| "epoch": 3.768, |
| "grad_norm": 0.03466450795531273, |
| "kl": 0.011438369750976562, |
| "learning_rate": 3.4618361166726123e-07, |
| "loss": -0.0145, |
| "reward": 5.37939190864563, |
| "reward_std": 0.3622877076268196, |
| "rewards/mrr_reward": 0.15358383394777775, |
| "rewards/rank_analyze_format_reward": 0.8029301166534424, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.998641312122345, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.998641312122345, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.171875, |
| "epoch": 3.776, |
| "grad_norm": 0.03238410875201225, |
| "kl": 0.009354591369628906, |
| "learning_rate": 3.2998512237565005e-07, |
| "loss": -0.0078, |
| "reward": 5.8616310358047485, |
| "reward_std": 0.5439105778932571, |
| "rewards/mrr_reward": 0.2814174108207226, |
| "rewards/rank_analyze_format_reward": 0.810180202126503, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 636.328125, |
| "epoch": 3.784, |
| "grad_norm": 0.033094972372055054, |
| "kl": 0.008489608764648438, |
| "learning_rate": 3.1416838871368925e-07, |
| "loss": -0.0358, |
| "reward": 5.760077238082886, |
| "reward_std": 0.7393556013703346, |
| "rewards/mrr_reward": 0.26919643953442574, |
| "rewards/rank_analyze_format_reward": 0.7711820602416992, |
| "rewards/rank_answer_foramt_reward": 0.912109375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.078125, |
| "epoch": 3.792, |
| "grad_norm": 0.07576505839824677, |
| "kl": 0.020990371704101562, |
| "learning_rate": 2.987340350989421e-07, |
| "loss": -0.0251, |
| "reward": 5.671926021575928, |
| "reward_std": 0.7974754720926285, |
| "rewards/mrr_reward": 0.23645834252238274, |
| "rewards/rank_analyze_format_reward": 0.8246570378541946, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975927919149399, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9975927919149399, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 647.640625, |
| "epoch": 3.8, |
| "grad_norm": 0.03585861995816231, |
| "kl": 0.009164810180664062, |
| "learning_rate": 2.836826708532603e-07, |
| "loss": 0.0167, |
| "reward": 5.570275187492371, |
| "reward_std": 0.47667882964015007, |
| "rewards/mrr_reward": 0.18828125298023224, |
| "rewards/rank_analyze_format_reward": 0.8715294301509857, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.09375, |
| "epoch": 3.808, |
| "grad_norm": 0.032184626907110214, |
| "kl": 0.008043289184570312, |
| "learning_rate": 2.6901489017873375e-07, |
| "loss": -0.015, |
| "reward": 5.561799049377441, |
| "reward_std": 0.6698030084371567, |
| "rewards/mrr_reward": 0.21584821678698063, |
| "rewards/rank_analyze_format_reward": 0.7451662421226501, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.921875, |
| "epoch": 3.816, |
| "grad_norm": 0.0390472486615181, |
| "kl": 0.009185791015625, |
| "learning_rate": 2.547312721342277e-07, |
| "loss": 0.0123, |
| "reward": 5.636685132980347, |
| "reward_std": 0.7052134126424789, |
| "rewards/mrr_reward": 0.2304439563304186, |
| "rewards/rank_analyze_format_reward": 0.8242843002080917, |
| "rewards/rank_answer_foramt_reward": 0.9140625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.875, |
| "epoch": 3.824, |
| "grad_norm": 0.03435278683900833, |
| "kl": 0.008884429931640625, |
| "learning_rate": 2.4083238061252565e-07, |
| "loss": 0.0292, |
| "reward": 5.697912573814392, |
| "reward_std": 0.8254741281270981, |
| "rewards/mrr_reward": 0.25242435559630394, |
| "rewards/rank_analyze_format_reward": 0.7663401514291763, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 609.203125, |
| "epoch": 3.832, |
| "grad_norm": 0.03665322810411453, |
| "kl": 0.010316848754882812, |
| "learning_rate": 2.273187643180652e-07, |
| "loss": -0.0089, |
| "reward": 5.517371296882629, |
| "reward_std": 0.5238508731126785, |
| "rewards/mrr_reward": 0.2027529776096344, |
| "rewards/rank_analyze_format_reward": 0.747374877333641, |
| "rewards/rank_answer_foramt_reward": 0.958984375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 671.40625, |
| "epoch": 3.84, |
| "grad_norm": 0.033383507281541824, |
| "kl": 0.007233619689941406, |
| "learning_rate": 2.1419095674527934e-07, |
| "loss": 0.0034, |
| "reward": 5.486920118331909, |
| "reward_std": 0.46073780953884125, |
| "rewards/mrr_reward": 0.17415675148367882, |
| "rewards/rank_analyze_format_reward": 0.8176367580890656, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 642.09375, |
| "epoch": 3.848, |
| "grad_norm": 0.03508320823311806, |
| "kl": 0.00882720947265625, |
| "learning_rate": 2.014494761575314e-07, |
| "loss": -0.0212, |
| "reward": 6.144862055778503, |
| "reward_std": 0.6686284840106964, |
| "rewards/mrr_reward": 0.3460751511156559, |
| "rewards/rank_analyze_format_reward": 0.846498966217041, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 650.984375, |
| "epoch": 3.856, |
| "grad_norm": 0.0337209552526474, |
| "kl": 0.009634017944335938, |
| "learning_rate": 1.8909482556666026e-07, |
| "loss": 0.0118, |
| "reward": 5.237658619880676, |
| "reward_std": 0.35679256170988083, |
| "rewards/mrr_reward": 0.12162698619067669, |
| "rewards/rank_analyze_format_reward": 0.8467388600111008, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 645.1875, |
| "epoch": 3.864, |
| "grad_norm": 0.034814029932022095, |
| "kl": 0.0094757080078125, |
| "learning_rate": 1.7712749271311392e-07, |
| "loss": -0.0091, |
| "reward": 5.641056180000305, |
| "reward_std": 0.48677169997245073, |
| "rewards/mrr_reward": 0.19029638543725014, |
| "rewards/rank_analyze_format_reward": 0.8798703849315643, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 619.53125, |
| "epoch": 3.872, |
| "grad_norm": 0.040520522743463516, |
| "kl": 0.0150299072265625, |
| "learning_rate": 1.6554795004670389e-07, |
| "loss": -0.0202, |
| "reward": 6.0107786655426025, |
| "reward_std": 0.7183677442371845, |
| "rewards/mrr_reward": 0.3212549705058336, |
| "rewards/rank_analyze_format_reward": 0.81850266456604, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9995265156030655, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9995265156030655, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 654.921875, |
| "epoch": 3.88, |
| "grad_norm": 0.03340727090835571, |
| "kl": 0.008794784545898438, |
| "learning_rate": 1.543566547079467e-07, |
| "loss": 0.0042, |
| "reward": 6.221985816955566, |
| "reward_std": 0.7052921280264854, |
| "rewards/mrr_reward": 0.34005457162857056, |
| "rewards/rank_analyze_format_reward": 0.871224895119667, |
| "rewards/rank_answer_foramt_reward": 1.0, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9991776347160339, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 659.453125, |
| "epoch": 3.888, |
| "grad_norm": 0.033866625279188156, |
| "kl": 0.008701324462890625, |
| "learning_rate": 1.4355404851001953e-07, |
| "loss": -0.0056, |
| "reward": 5.805374503135681, |
| "reward_std": 0.8619978576898575, |
| "rewards/mrr_reward": 0.2608507052063942, |
| "rewards/rank_analyze_format_reward": 0.832169234752655, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.03125, |
| "epoch": 3.896, |
| "grad_norm": 0.03655744716525078, |
| "kl": 0.009066581726074219, |
| "learning_rate": 1.3314055792131964e-07, |
| "loss": -0.0147, |
| "reward": 5.252833724021912, |
| "reward_std": 0.5554525479674339, |
| "rewards/mrr_reward": 0.15148189663887024, |
| "rewards/rank_analyze_format_reward": 0.7708722352981567, |
| "rewards/rank_answer_foramt_reward": 0.931640625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.984375, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 641.859375, |
| "epoch": 3.904, |
| "grad_norm": 0.035562798380851746, |
| "kl": 0.008677482604980469, |
| "learning_rate": 1.231165940486234e-07, |
| "loss": -0.01, |
| "reward": 5.259171485900879, |
| "reward_std": 0.41013093292713165, |
| "rewards/mrr_reward": 0.14392981678247452, |
| "rewards/rank_analyze_format_reward": 0.7830992192029953, |
| "rewards/rank_answer_foramt_reward": 0.91796875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9911921620368958, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9911921620368958, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 656.796875, |
| "epoch": 3.912, |
| "grad_norm": 0.049858458340168, |
| "kl": 0.011943817138671875, |
| "learning_rate": 1.134825526208605e-07, |
| "loss": 0.0081, |
| "reward": 5.8922260999679565, |
| "reward_std": 0.6895529553294182, |
| "rewards/mrr_reward": 0.2795138917863369, |
| "rewards/rank_analyze_format_reward": 0.8414849489927292, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9975927919149399, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9975927919149399, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 630.03125, |
| "epoch": 3.92, |
| "grad_norm": 0.03522248566150665, |
| "kl": 0.009510040283203125, |
| "learning_rate": 1.0423881397349067e-07, |
| "loss": -0.0013, |
| "reward": 6.210868835449219, |
| "reward_std": 0.7074924185872078, |
| "rewards/mrr_reward": 0.3486483208835125, |
| "rewards/rank_analyze_format_reward": 0.8533848524093628, |
| "rewards/rank_answer_foramt_reward": 0.970703125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.046875, |
| "epoch": 3.928, |
| "grad_norm": 0.033086903393268585, |
| "kl": 0.009950637817382812, |
| "learning_rate": 9.538574303348813e-08, |
| "loss": -0.0079, |
| "reward": 6.068167686462402, |
| "reward_std": 1.18388731777668, |
| "rewards/mrr_reward": 0.35264757089316845, |
| "rewards/rank_analyze_format_reward": 0.7939876317977905, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9835526347160339, |
| "rewards/rank_overall_format_reward_more": 0.953125, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 620.5625, |
| "epoch": 3.936, |
| "grad_norm": 0.03736709803342819, |
| "kl": 0.009596824645996094, |
| "learning_rate": 8.692368930493522e-08, |
| "loss": -0.0075, |
| "reward": 6.095898747444153, |
| "reward_std": 0.8004505969583988, |
| "rewards/mrr_reward": 0.3334883488714695, |
| "rewards/rank_analyze_format_reward": 0.8049141466617584, |
| "rewards/rank_answer_foramt_reward": 0.97265625, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.984375, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.53125, |
| "epoch": 3.944, |
| "grad_norm": 0.036802154034376144, |
| "kl": 0.010059356689453125, |
| "learning_rate": 7.885298685522235e-08, |
| "loss": -0.0271, |
| "reward": 5.225739121437073, |
| "reward_std": 0.7527187168598175, |
| "rewards/mrr_reward": 0.1519965250045061, |
| "rewards/rank_analyze_format_reward": 0.7541633769869804, |
| "rewards/rank_answer_foramt_reward": 0.904296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9991776347160339, |
| "rewards/rank_overall_format_reward_more": 0.9765625, |
| "rewards/rank_verify_format_reward": 0.9835526347160339, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 651.234375, |
| "epoch": 3.952, |
| "grad_norm": 0.03432526811957359, |
| "kl": 0.009029388427734375, |
| "learning_rate": 7.117395430186414e-08, |
| "loss": 0.0325, |
| "reward": 5.624658584594727, |
| "reward_std": 0.5472202897071838, |
| "rewards/mrr_reward": 0.24223089963197708, |
| "rewards/rank_analyze_format_reward": 0.7981982976198196, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9834558814764023, |
| "rewards/rank_overall_format_reward_more": 0.9609375, |
| "rewards/rank_verify_format_reward": 0.9834558814764023, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 680.015625, |
| "epoch": 3.96, |
| "grad_norm": 0.0339614674448967, |
| "kl": 0.009700775146484375, |
| "learning_rate": 6.388689479991606e-08, |
| "loss": -0.0091, |
| "reward": 6.06891131401062, |
| "reward_std": 0.9122689664363861, |
| "rewards/mrr_reward": 0.3243551626801491, |
| "rewards/rank_analyze_format_reward": 0.8358288407325745, |
| "rewards/rank_answer_foramt_reward": 0.9453125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.359375, |
| "epoch": 3.968, |
| "grad_norm": 0.03345409035682678, |
| "kl": 0.00970458984375, |
| "learning_rate": 5.699209603001077e-08, |
| "loss": 0.0057, |
| "reward": 5.823601126670837, |
| "reward_std": 0.8781716674566269, |
| "rewards/mrr_reward": 0.2674107179045677, |
| "rewards/rank_analyze_format_reward": 0.8760750144720078, |
| "rewards/rank_answer_foramt_reward": 0.943359375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9828869104385376, |
| "rewards/rank_overall_format_reward_more": 0.96875, |
| "rewards/rank_verify_format_reward": 0.9828869104385376, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 640.28125, |
| "epoch": 3.976, |
| "grad_norm": 0.0337567999958992, |
| "kl": 0.008623123168945312, |
| "learning_rate": 5.048983018699827e-08, |
| "loss": -0.0062, |
| "reward": 6.0533905029296875, |
| "reward_std": 0.9021812565624714, |
| "rewards/mrr_reward": 0.329222459346056, |
| "rewards/rank_analyze_format_reward": 0.8165788054466248, |
| "rewards/rank_answer_foramt_reward": 0.927734375, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.0, |
| "epoch": 3.984, |
| "grad_norm": 0.03243754804134369, |
| "kl": 0.01001739501953125, |
| "learning_rate": 4.438035396920004e-08, |
| "loss": -0.0205, |
| "reward": 6.078030347824097, |
| "reward_std": 0.4891853742301464, |
| "rewards/mrr_reward": 0.31452134251594543, |
| "rewards/rank_analyze_format_reward": 0.8414293229579926, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 653.953125, |
| "epoch": 3.992, |
| "grad_norm": 0.032491762191057205, |
| "kl": 0.009099960327148438, |
| "learning_rate": 3.866390856827495e-08, |
| "loss": -0.0154, |
| "reward": 5.358732223510742, |
| "reward_std": 0.5416415482759476, |
| "rewards/mrr_reward": 0.15270957723259926, |
| "rewards/rank_analyze_format_reward": 0.8260188400745392, |
| "rewards/rank_answer_foramt_reward": 0.9296875, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 1.0, |
| "rewards/rank_overall_format_reward_more": 0.9921875, |
| "rewards/rank_verify_format_reward": 1.0, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 644.09375, |
| "epoch": 4.0, |
| "grad_norm": 0.03646353259682655, |
| "kl": 0.009246826171875, |
| "learning_rate": 3.3340719659701315e-08, |
| "loss": -0.0217, |
| "reward": 5.45065975189209, |
| "reward_std": 0.54698271676898, |
| "rewards/mrr_reward": 0.19483507424592972, |
| "rewards/rank_analyze_format_reward": 0.6868295818567276, |
| "rewards/rank_answer_foramt_reward": 0.986328125, |
| "rewards/rank_contrast_format_reward": 0.0, |
| "rewards/rank_initial_format_reward": 0.9990808814764023, |
| "rewards/rank_overall_format_reward_more": 1.0, |
| "rewards/rank_verify_format_reward": 0.9990808814764023, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.0, |
| "step": 500, |
| "total_flos": 0.0, |
| "train_loss": -0.0017955374517478048, |
| "train_runtime": 38748.2534, |
| "train_samples_per_second": 0.826, |
| "train_steps_per_second": 0.013 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|